Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-24 17:06:29 +03:00)
💫 New JSON helpers, training data internals & CLI rewrite (#2932)
* Support nowrap setting in util.prints
* Tidy up and fix whitespace
* Simplify script and use read_jsonl helper
* Add JSON schemas (see #2928)
* Deprecate Doc.print_tree: will be replaced with Doc.to_json, which will produce a unified format
* Add Doc.to_json() method (see #2928): converts Doc objects to JSON using the same unified format as the training data. The method also supports serializing selected custom attributes in the doc._. space.
* Remove outdated test
* Add write_json and write_jsonl helpers
* WIP: Update spacy train
* Tidy up spacy train
* WIP: Use wasabi for formatting
* Add GoldParse helpers for JSON format
* WIP: Add debug-data command
* Fix typo
* Add missing import
* Update wasabi pin
* Add missing import
* 💫 Refactor CLI (#2943), to be merged into #2932:
  - [x] Refactor CLI to use [`wasabi`](https://github.com/ines/wasabi)
  - [x] Use [`black`](https://github.com/ambv/black) for auto-formatting
  - [x] Add `flake8` config
  - [x] Move all messy UD-related scripts to `cli.ud`
  - [x] Make converters functions that take the opened file and return the converted data (instead of having them handle the IO)
  Type of change: enhancement.
  Checklist:
  - [x] I have submitted the spaCy Contributor Agreement.
  - [x] I ran the tests, and all new and existing tests passed.
  - [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Update wasabi pin
* Delete old test
* Update errors
* Fix typo
* Tidy up and format remaining code
* Fix formatting
* Improve formatting of messages
* Auto-format remaining code
* Add tok2vec stuff to spacy.train
* Fix typo
* Update wasabi pin
* Fix path checks for when train() is called as a function
* Reformat and tidy up pretrain script
* Update argument annotations
* Raise error if model language doesn't match lang
* Document new train command
This commit is contained in: parent 0369db75c1, commit 37c7c85a86
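The CLI refactor summarized in the commit message above changes the converter contract: converter functions now receive the already-read file contents and return plain data, while the convert command handles all IO through the new write_json/write_jsonl helpers. A minimal sketch of that contract as of this commit; the file names are placeholders, not taken from the diff:

```python
# Sketch only: the refactored converter contract (file names are placeholders).
from pathlib import Path

from spacy.cli.converters import conll_ner2json
from spacy.util import write_json  # JSON helper added in this commit

input_path = Path("train.conll")                    # hypothetical input file
input_data = input_path.open("r", encoding="utf-8").read()
docs = conll_ner2json(input_data)                   # text in, converted docs out, no IO
write_json(Path("train.json"), docs)                # writing stays outside the converter
```

Because the converters are now plain functions over strings, they can be exercised without touching the filesystem.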
@@ -11,6 +11,8 @@ ujson>=1.35
 dill>=0.2,<0.3
 regex==2018.01.10
 requests>=2.13.0,<3.0.0
+jsonschema>=2.6.0,<3.0.0
+wasabi>=0.0.8,<1.1.0
 pathlib==1.0.1; python_version < "3.4"
 # Development dependencies
 pytest>=4.0.0,<5.0.0

setup.py (2 lines changed)
@@ -207,6 +207,8 @@ def setup_package():
             "regex==2018.01.10",
             "dill>=0.2,<0.3",
             "requests>=2.13.0,<3.0.0",
+            "jsonschema>=2.6.0,<3.0.0",
+            "wasabi>=0.0.8,<1.1.0",
             'pathlib==1.0.1; python_version < "3.4"',
         ],
         setup_requires=["wheel"],

@@ -1,40 +1,41 @@
 # coding: utf8
 from __future__ import print_function

 # NB! This breaks in plac on Python 2!!
 # from __future__ import unicode_literals

-if __name__ == '__main__':
+if __name__ == "__main__":
     import plac
     import sys
+    from wasabi import Printer
     from spacy.cli import download, link, info, package, train, pretrain, convert
-    from spacy.cli import vocab, init_model, profile, evaluate, validate
-    from spacy.cli import ud_train, ud_evaluate
-    from spacy.util import prints
+    from spacy.cli import init_model, profile, evaluate, validate
+    from spacy.cli import ud_train, ud_evaluate, debug_data
+
+    msg = Printer()
+
     commands = {
-        'download': download,
-        'link': link,
-        'info': info,
-        'train': train,
-        'pretrain': pretrain,
-        'ud-train': ud_train,
-        'evaluate': evaluate,
-        'ud-evaluate': ud_evaluate,
-        'convert': convert,
-        'package': package,
-        'vocab': vocab,
-        'init-model': init_model,
-        'profile': profile,
-        'validate': validate
+        "download": download,
+        "link": link,
+        "info": info,
+        "train": train,
+        "pretrain": pretrain,
+        "debug-data": debug_data,
+        "ud-train": ud_train,
+        "evaluate": evaluate,
+        "ud-evaluate": ud_evaluate,
+        "convert": convert,
+        "package": package,
+        "init-model": init_model,
+        "profile": profile,
+        "validate": validate,
     }
     if len(sys.argv) == 1:
-        prints(', '.join(commands), title="Available commands", exits=1)
+        msg.info("Available commands", ", ".join(commands), exits=1)
     command = sys.argv.pop(1)
-    sys.argv[0] = 'spacy %s' % command
+    sys.argv[0] = "spacy %s" % command
     if command in commands:
         plac.call(commands[command], sys.argv[1:])
     else:
-        prints(
-            "Available: %s" % ', '.join(commands),
-            title="Unknown command: %s" % command,
-            exits=1)
+        available = "Available: {}".format(", ".join(commands))
+        msg.fail("Unknown command: {}".format(command), available, exits=1)

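The rewritten entry point above drops util.prints in favour of wasabi's Printer for all console output. A minimal sketch of the Printer calls used throughout the refactored CLI; the message strings are illustrative, not copied from spaCy:

```python
# Sketch of the wasabi Printer pattern adopted by the refactored CLI.
from wasabi import Printer

msg = Printer()
msg.good("Training data JSON format is valid")            # success message
msg.warn("20 training examples also in evaluation data")  # warning
msg.info("Available commands", "download, link, info")    # title plus detail text
# Passing exits=1 prints the message and terminates the process, which is how
# the CLI reports fatal errors:
# msg.fail("Unknown command: foo", "Available: download, link, info", exits=1)
```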
@@ -1,14 +1,13 @@
-from .download import download
-from .info import info
-from .link import link
-from .package import package
-from .profile import profile
-from .train import train
-from .pretrain import pretrain
-from .evaluate import evaluate
-from .convert import convert
-from .vocab import make_vocab as vocab
-from .init_model import init_model
-from .validate import validate
-from .ud_train import main as ud_train
-from .conll17_ud_eval import main as ud_evaluate
+from .download import download  # noqa: F401
+from .info import info  # noqa: F401
+from .link import link  # noqa: F401
+from .package import package  # noqa: F401
+from .profile import profile  # noqa: F401
+from .train import train  # noqa: F401
+from .pretrain import pretrain  # noqa: F401
+from .debug_data import debug_data  # noqa: F401
+from .evaluate import evaluate  # noqa: F401
+from .convert import convert  # noqa: F401
+from .init_model import init_model  # noqa: F401
+from .validate import validate  # noqa: F401
+from .ud import ud_train, ud_evaluate  # noqa: F401

@@ -2,6 +2,8 @@
 from __future__ import unicode_literals

+# fmt: off
+

 class Messages(object):
     M001 = ("Download successful but linking failed")
     M002 = ("Creating a shortcut link for 'en' didn't work (maybe you "
@@ -73,3 +75,31 @@ class Messages(object):
     M052 = ("Not a valid meta.json format")
     M053 = ("Expected dict but got: {meta_type}")
     M054 = ("No --lang specified, but tokenization required.")
+    M055 = ("Training pipeline: {pipeline}")
+    M056 = ("Starting with base model '{model}'")
+    M057 = ("Starting with blank model '{model}'")
+    M058 = ("Loading vector from model '{model}'")
+    M059 = ("Can't use multitask objective without '{pipe}' in the pipeline")
+    M060 = ("Counting training words (limit={limit})")
+    M061 = ("\nSaving model...")
+    M062 = ("Output directory is not empty.")
+    M063 = ("Incompatible arguments")
+    M064 = ("The -f and -c arguments are deprecated, and not compatible with "
+            "the -j argument, which should specify the same information. "
+            "Either merge the frequencies and clusters data into the "
+            "JSONL-formatted file (recommended), or use only the -f and -c "
+            "files, without the other lexical attributes.")
+    M065 = ("This can lead to unintended side effects when saving the model. "
+            "Please use an empty directory or a different path instead. If "
+            "the specified output path doesn't exist, the directory will be "
+            "created for you.")
+    M066 = ("Saved model to output directory")
+    M067 = ("Can't find lexical data")
+    M068 = ("Sucessfully compiled vocab and vectors, and saved model")
+    M069 = ("Unknown file type: '{name}'")
+    M070 = ("Supported file types: '{options}'")
+    M071 = ("Loaded pretrained tok2vec for: {components}")
+    M072 = ("Model language ('{model_lang}') doesn't match language specified "
+            "as `lang` argument ('{lang}') ")
+
+# fmt: on

@@ -3,49 +3,91 @@ from __future__ import unicode_literals

 import plac
 from pathlib import Path
+from wasabi import Printer

+from ..util import write_jsonl, write_json
+from ..compat import json_dumps, path2str
 from .converters import conllu2json, conllubio2json, iob2json, conll_ner2json
 from .converters import ner_jsonl2json
 from ._messages import Messages
-from ..util import prints


 # Converters are matched by file extension. To add a converter, add a new
 # entry to this dict with the file extension mapped to the converter function
 # imported from /converters.
 CONVERTERS = {
-    'conllubio': conllubio2json,
-    'conllu': conllu2json,
-    'conll': conllu2json,
-    'ner': conll_ner2json,
-    'iob': iob2json,
-    'jsonl': ner_jsonl2json
+    "conllubio": conllubio2json,
+    "conllu": conllu2json,
+    "conll": conllu2json,
+    "ner": conll_ner2json,
+    "iob": iob2json,
+    "jsonl": ner_jsonl2json,
 }

+# File types
+FILE_TYPES = ("json", "jsonl")
+
+
 @plac.annotations(
-    input_file=("input file", "positional", None, str),
-    output_dir=("output directory for converted file", "positional", None, str),
+    input_file=("Input file", "positional", None, str),
+    output_dir=("Output directory for converted file", "positional", None, str),
+    file_type=("Type of data to produce: 'jsonl' or 'json'", "option", "t", str),
     n_sents=("Number of sentences per doc", "option", "n", int),
     converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
     lang=("Language (if tokenizer required)", "option", "l", str),
-    morphology=("Enable appending morphology to tags", "flag", "m", bool))
-def convert(input_file, output_dir, n_sents=1, morphology=False, converter='auto',
-            lang=None):
+    morphology=("Enable appending morphology to tags", "flag", "m", bool),
+)
+def convert(
+    input_file,
+    output_dir="-",
+    file_type="jsonl",
+    n_sents=1,
+    morphology=False,
+    converter="auto",
+    lang=None,
+):
     """
     Convert files into JSON format for use with train command and other
-    experiment management functions.
+    experiment management functions. If no output_dir is specified, the data
+    is written to stdout, so you can pipe them forward to a JSONL file:
+    $ spacy convert some_file.conllu > some_file.jsonl
     """
+    msg = Printer()
     input_path = Path(input_file)
-    output_path = Path(output_dir)
+    if file_type not in FILE_TYPES:
+        msg.fail(
+            Messages.M069.format(name=file_type),
+            Messages.M070.format(options=", ".join(FILE_TYPES)),
+            exits=1,
+        )
     if not input_path.exists():
-        prints(input_path, title=Messages.M028, exits=1)
-    if not output_path.exists():
-        prints(output_path, title=Messages.M029, exits=1)
-    if converter == 'auto':
+        msg.fail(Messages.M028, input_path, exits=1)
+    if output_dir != "-" and not Path(output_dir).exists():
+        msg.fail(Messages.M029, output_dir, exits=1)
+    if converter == "auto":
         converter = input_path.suffix[1:]
     if converter not in CONVERTERS:
-        prints(Messages.M031.format(converter=converter),
-               title=Messages.M030, exits=1)
+        msg.fail(Messages.M030, Messages.M031.format(converter=converter), exits=1)
+    # Use converter function to convert data
     func = CONVERTERS[converter]
-    func(input_path, output_path,
-         n_sents=n_sents, use_morphology=morphology, lang=lang)
+    input_data = input_path.open("r", encoding="utf-8").read()
+    data = func(input_data, nsents=n_sents, use_morphology=morphology, lang=lang)
+    if output_dir != "-":
+        # Export data to a file
+        suffix = ".{}".format(file_type)
+        output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
+        if file_type == "json":
+            write_json(output_file, data)
+        elif file_type == "jsonl":
+            write_jsonl(output_file, data)
+        msg.good(
+            Messages.M032.format(name=path2str(output_file)),
+            Messages.M033.format(n_docs=len(data)),
+        )
+    else:
+        # Print to stdout
+        if file_type == "json":
+            print(json_dumps(data))
+        elif file_type == "jsonl":
+            for line in data:
+                print(json_dumps(line))

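The rewritten convert command above defaults output_dir to "-", which prints the converted documents to stdout instead of writing a file, matching the docstring's example of piping spacy convert output into a JSONL file. A minimal sketch of calling it from Python; the input path is hypothetical:

```python
# Sketch only: invoking the rewritten convert() directly. With output_dir="-"
# (the default) the converted docs are printed to stdout as JSONL.
from spacy.cli import convert

convert("some_file.conllu", output_dir="-", file_type="jsonl", converter="conllu")
```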
@@ -1,5 +1,5 @@
-from .conllu2json import conllu2json
-from .conllubio2json import conllubio2json
-from .iob2json import iob2json
-from .conll_ner2json import conll_ner2json
-from .jsonl2json import ner_jsonl2json
+from .conllu2json import conllu2json  # noqa: F401
+from .conllubio2json import conllubio2json  # noqa: F401
+from .iob2json import iob2json  # noqa: F401
+from .conll_ner2json import conll_ner2json  # noqa: F401
+from .jsonl2json import ner_jsonl2json  # noqa: F401

@@ -1,52 +1,38 @@
 # coding: utf8
 from __future__ import unicode_literals

-from .._messages import Messages
-from ...compat import json_dumps, path2str
-from ...util import prints
 from ...gold import iob_to_biluo


-def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None):
+def conll_ner2json(input_data, **kwargs):
     """
     Convert files in the CoNLL-2003 NER format into JSON format for use with
     train cli.
     """
-    docs = read_conll_ner(input_path)
-
-    output_filename = input_path.parts[-1].replace(".conll", "") + ".json"
-    output_filename = input_path.parts[-1].replace(".conll", "") + ".json"
-    output_file = output_path / output_filename
-    with output_file.open('w', encoding='utf-8') as f:
-        f.write(json_dumps(docs))
-    prints(Messages.M033.format(n_docs=len(docs)),
-           title=Messages.M032.format(name=path2str(output_file)))
-
-
-def read_conll_ner(input_path):
-    text = input_path.open('r', encoding='utf-8').read()
-    i = 0
-    delimit_docs = '-DOCSTART- -X- O O'
+    delimit_docs = "-DOCSTART- -X- O O"
     output_docs = []
-    for doc in text.strip().split(delimit_docs):
+    for doc in input_data.strip().split(delimit_docs):
         doc = doc.strip()
         if not doc:
             continue
         output_doc = []
-        for sent in doc.split('\n\n'):
+        for sent in doc.split("\n\n"):
             sent = sent.strip()
             if not sent:
                 continue
-            lines = [line.strip() for line in sent.split('\n') if line.strip()]
+            lines = [line.strip() for line in sent.split("\n") if line.strip()]
             words, tags, chunks, iob_ents = zip(*[line.split() for line in lines])
             biluo_ents = iob_to_biluo(iob_ents)
-            output_doc.append({'tokens': [
-                {'orth': w, 'tag': tag, 'ner': ent} for (w, tag, ent) in
-                zip(words, tags, biluo_ents)
-            ]})
-        output_docs.append({
-            'id': len(output_docs),
-            'paragraphs': [{'sentences': output_doc}]
-        })
+            output_doc.append(
+                {
+                    "tokens": [
+                        {"orth": w, "tag": tag, "ner": ent}
+                        for (w, tag, ent) in zip(words, tags, biluo_ents)
+                    ]
+                }
+            )
+        output_docs.append(
+            {"id": len(output_docs), "paragraphs": [{"sentences": output_doc}]}
+        )
         output_doc = []
     return output_docs

@ -1,34 +1,27 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from .._messages import Messages
|
|
||||||
from ...compat import json_dumps, path2str
|
|
||||||
from ...util import prints
|
|
||||||
from ...gold import iob_to_biluo
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
from ...gold import iob_to_biluo
|
||||||
|
|
||||||
def conllu2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None):
|
|
||||||
|
|
||||||
|
def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None):
|
||||||
"""
|
"""
|
||||||
Convert conllu files into JSON format for use with train cli.
|
Convert conllu files into JSON format for use with train cli.
|
||||||
use_morphology parameter enables appending morphology to tags, which is
|
use_morphology parameter enables appending morphology to tags, which is
|
||||||
useful for languages such as Spanish, where UD tags are not so rich.
|
useful for languages such as Spanish, where UD tags are not so rich.
|
||||||
"""
|
|
||||||
# by @dvsrepo, via #11 explosion/spacy-dev-resources
|
|
||||||
|
|
||||||
"""
|
|
||||||
Extract NER tags if available and convert them so that they follow
|
Extract NER tags if available and convert them so that they follow
|
||||||
BILUO and the Wikipedia scheme
|
BILUO and the Wikipedia scheme
|
||||||
"""
|
"""
|
||||||
|
# by @dvsrepo, via #11 explosion/spacy-dev-resources
|
||||||
# by @katarkor
|
# by @katarkor
|
||||||
|
|
||||||
docs = []
|
docs = []
|
||||||
sentences = []
|
sentences = []
|
||||||
conll_tuples = read_conllx(input_path, use_morphology=use_morphology)
|
conll_tuples = read_conllx(input_data, use_morphology=use_morphology)
|
||||||
checked_for_ner = False
|
checked_for_ner = False
|
||||||
has_ner_tags = False
|
has_ner_tags = False
|
||||||
|
|
||||||
for i, (raw_text, tokens) in enumerate(conll_tuples):
|
for i, (raw_text, tokens) in enumerate(conll_tuples):
|
||||||
sentence, brackets = tokens[0]
|
sentence, brackets = tokens[0]
|
||||||
if not checked_for_ner:
|
if not checked_for_ner:
|
||||||
|
@ -37,29 +30,19 @@ def conllu2json(input_path, output_path, n_sents=10, use_morphology=False, lang=
|
||||||
sentences.append(generate_sentence(sentence, has_ner_tags))
|
sentences.append(generate_sentence(sentence, has_ner_tags))
|
||||||
# Real-sized documents could be extracted using the comments on the
|
# Real-sized documents could be extracted using the comments on the
|
||||||
# conluu document
|
# conluu document
|
||||||
|
if len(sentences) % n_sents == 0:
|
||||||
if(len(sentences) % n_sents == 0):
|
|
||||||
doc = create_doc(sentences, i)
|
doc = create_doc(sentences, i)
|
||||||
docs.append(doc)
|
docs.append(doc)
|
||||||
sentences = []
|
sentences = []
|
||||||
|
return docs
|
||||||
output_filename = input_path.parts[-1].replace(".conll", ".json")
|
|
||||||
output_filename = input_path.parts[-1].replace(".conllu", ".json")
|
|
||||||
output_file = output_path / output_filename
|
|
||||||
with output_file.open('w', encoding='utf-8') as f:
|
|
||||||
f.write(json_dumps(docs))
|
|
||||||
prints(Messages.M033.format(n_docs=len(docs)),
|
|
||||||
title=Messages.M032.format(name=path2str(output_file)))
|
|
||||||
|
|
||||||
|
|
||||||
def is_ner(tag):
|
def is_ner(tag):
|
||||||
|
|
||||||
"""
|
|
||||||
Check the 10th column of the first token to determine if the file contains
|
|
||||||
NER tags
|
|
||||||
"""
|
"""
|
||||||
|
Check the 10th column of the first token to determine if the file contains
|
||||||
tag_match = re.match('([A-Z_]+)-([A-Z_]+)', tag)
|
NER tags
|
||||||
|
"""
|
||||||
|
tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
|
||||||
if tag_match:
|
if tag_match:
|
||||||
return True
|
return True
|
||||||
elif tag == "O":
|
elif tag == "O":
|
||||||
|
@ -67,29 +50,29 @@ def is_ner(tag):
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def read_conllx(input_path, use_morphology=False, n=0):
|
|
||||||
text = input_path.open('r', encoding='utf-8').read()
|
def read_conllx(input_data, use_morphology=False, n=0):
|
||||||
i = 0
|
i = 0
|
||||||
for sent in text.strip().split('\n\n'):
|
for sent in input_data.strip().split("\n\n"):
|
||||||
lines = sent.strip().split('\n')
|
lines = sent.strip().split("\n")
|
||||||
if lines:
|
if lines:
|
||||||
while lines[0].startswith('#'):
|
while lines[0].startswith("#"):
|
||||||
lines.pop(0)
|
lines.pop(0)
|
||||||
tokens = []
|
tokens = []
|
||||||
for line in lines:
|
for line in lines:
|
||||||
|
|
||||||
parts = line.split('\t')
|
parts = line.split("\t")
|
||||||
id_, word, lemma, pos, tag, morph, head, dep, _1, iob = parts
|
id_, word, lemma, pos, tag, morph, head, dep, _1, iob = parts
|
||||||
if '-' in id_ or '.' in id_:
|
if "-" in id_ or "." in id_:
|
||||||
continue
|
continue
|
||||||
try:
|
try:
|
||||||
id_ = int(id_) - 1
|
id_ = int(id_) - 1
|
||||||
head = (int(head) - 1) if head != '0' else id_
|
head = (int(head) - 1) if head != "0" else id_
|
||||||
dep = 'ROOT' if dep == 'root' else dep
|
dep = "ROOT" if dep == "root" else dep
|
||||||
tag = pos if tag == '_' else tag
|
tag = pos if tag == "_" else tag
|
||||||
tag = tag+'__'+morph if use_morphology else tag
|
tag = tag + "__" + morph if use_morphology else tag
|
||||||
tokens.append((id_, word, tag, head, dep, iob))
|
tokens.append((id_, word, tag, head, dep, iob))
|
||||||
except:
|
except: # noqa: E722
|
||||||
print(line)
|
print(line)
|
||||||
raise
|
raise
|
||||||
tuples = [list(t) for t in zip(*tokens)]
|
tuples = [list(t) for t in zip(*tokens)]
|
||||||
|
@ -98,31 +81,31 @@ def read_conllx(input_path, use_morphology=False, n=0):
|
||||||
if n >= 1 and i >= n:
|
if n >= 1 and i >= n:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|
||||||
def simplify_tags(iob):
|
def simplify_tags(iob):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Simplify tags obtained from the dataset in order to follow Wikipedia
|
Simplify tags obtained from the dataset in order to follow Wikipedia
|
||||||
scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while
|
scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while
|
||||||
'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags to
|
'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags to
|
||||||
'MISC'.
|
'MISC'.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
new_iob = []
|
new_iob = []
|
||||||
for tag in iob:
|
for tag in iob:
|
||||||
tag_match = re.match('([A-Z_]+)-([A-Z_]+)', tag)
|
tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
|
||||||
if tag_match:
|
if tag_match:
|
||||||
prefix = tag_match.group(1)
|
prefix = tag_match.group(1)
|
||||||
suffix = tag_match.group(2)
|
suffix = tag_match.group(2)
|
||||||
if suffix == 'GPE_LOC':
|
if suffix == "GPE_LOC":
|
||||||
suffix = 'LOC'
|
suffix = "LOC"
|
||||||
elif suffix == 'GPE_ORG':
|
elif suffix == "GPE_ORG":
|
||||||
suffix = 'ORG'
|
suffix = "ORG"
|
||||||
elif suffix != 'PER' and suffix != 'LOC' and suffix != 'ORG':
|
elif suffix != "PER" and suffix != "LOC" and suffix != "ORG":
|
||||||
suffix = 'MISC'
|
suffix = "MISC"
|
||||||
tag = prefix + '-' + suffix
|
tag = prefix + "-" + suffix
|
||||||
new_iob.append(tag)
|
new_iob.append(tag)
|
||||||
return new_iob
|
return new_iob
|
||||||
|
|
||||||
|
|
||||||
def generate_sentence(sent, has_ner_tags):
|
def generate_sentence(sent, has_ner_tags):
|
||||||
(id_, word, tag, head, dep, iob) = sent
|
(id_, word, tag, head, dep, iob) = sent
|
||||||
sentence = {}
|
sentence = {}
|
||||||
|
@ -144,7 +127,7 @@ def generate_sentence(sent, has_ner_tags):
|
||||||
return sentence
|
return sentence
|
||||||
|
|
||||||
|
|
||||||
def create_doc(sentences,id):
|
def create_doc(sentences, id):
|
||||||
doc = {}
|
doc = {}
|
||||||
paragraph = {}
|
paragraph = {}
|
||||||
doc["id"] = id
|
doc["id"] = id
|
||||||
|
|
|
@ -1,65 +1,54 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ...compat import json_dumps, path2str
|
|
||||||
from ...util import prints
|
|
||||||
from ...gold import iob_to_biluo
|
from ...gold import iob_to_biluo
|
||||||
|
|
||||||
def conllubio2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None):
|
|
||||||
|
def conllubio2json(input_data, n_sents=10, use_morphology=False, lang=None):
|
||||||
"""
|
"""
|
||||||
Convert conllu files into JSON format for use with train cli.
|
Convert conllu files into JSON format for use with train cli.
|
||||||
use_morphology parameter enables appending morphology to tags, which is
|
use_morphology parameter enables appending morphology to tags, which is
|
||||||
useful for languages such as Spanish, where UD tags are not so rich.
|
useful for languages such as Spanish, where UD tags are not so rich.
|
||||||
"""
|
"""
|
||||||
# by @dvsrepo, via #11 explosion/spacy-dev-resources
|
# by @dvsrepo, via #11 explosion/spacy-dev-resources
|
||||||
|
|
||||||
docs = []
|
docs = []
|
||||||
sentences = []
|
sentences = []
|
||||||
conll_tuples = read_conllx(input_path, use_morphology=use_morphology)
|
conll_tuples = read_conllx(input_data, use_morphology=use_morphology)
|
||||||
|
|
||||||
for i, (raw_text, tokens) in enumerate(conll_tuples):
|
for i, (raw_text, tokens) in enumerate(conll_tuples):
|
||||||
sentence, brackets = tokens[0]
|
sentence, brackets = tokens[0]
|
||||||
sentences.append(generate_sentence(sentence))
|
sentences.append(generate_sentence(sentence))
|
||||||
# Real-sized documents could be extracted using the comments on the
|
# Real-sized documents could be extracted using the comments on the
|
||||||
# conluu document
|
# conluu document
|
||||||
if(len(sentences) % n_sents == 0):
|
if len(sentences) % n_sents == 0:
|
||||||
doc = create_doc(sentences, i)
|
doc = create_doc(sentences, i)
|
||||||
docs.append(doc)
|
docs.append(doc)
|
||||||
sentences = []
|
sentences = []
|
||||||
|
return docs
|
||||||
output_filename = input_path.parts[-1].replace(".conll", ".json")
|
|
||||||
output_filename = input_path.parts[-1].replace(".conllu", ".json")
|
|
||||||
output_file = output_path / output_filename
|
|
||||||
with output_file.open('w', encoding='utf-8') as f:
|
|
||||||
f.write(json_dumps(docs))
|
|
||||||
prints("Created %d documents" % len(docs),
|
|
||||||
title="Generated output file %s" % path2str(output_file))
|
|
||||||
|
|
||||||
|
|
||||||
def read_conllx(input_path, use_morphology=False, n=0):
|
def read_conllx(input_data, use_morphology=False, n=0):
|
||||||
text = input_path.open('r', encoding='utf-8').read()
|
|
||||||
i = 0
|
i = 0
|
||||||
for sent in text.strip().split('\n\n'):
|
for sent in input_data.strip().split("\n\n"):
|
||||||
lines = sent.strip().split('\n')
|
lines = sent.strip().split("\n")
|
||||||
if lines:
|
if lines:
|
||||||
while lines[0].startswith('#'):
|
while lines[0].startswith("#"):
|
||||||
lines.pop(0)
|
lines.pop(0)
|
||||||
tokens = []
|
tokens = []
|
||||||
for line in lines:
|
for line in lines:
|
||||||
|
|
||||||
parts = line.split('\t')
|
parts = line.split("\t")
|
||||||
id_, word, lemma, pos, tag, morph, head, dep, _1, ner = parts
|
id_, word, lemma, pos, tag, morph, head, dep, _1, ner = parts
|
||||||
if '-' in id_ or '.' in id_:
|
if "-" in id_ or "." in id_:
|
||||||
continue
|
continue
|
||||||
try:
|
try:
|
||||||
id_ = int(id_) - 1
|
id_ = int(id_) - 1
|
||||||
head = (int(head) - 1) if head != '0' else id_
|
head = (int(head) - 1) if head != "0" else id_
|
||||||
dep = 'ROOT' if dep == 'root' else dep
|
dep = "ROOT" if dep == "root" else dep
|
||||||
tag = pos if tag == '_' else tag
|
tag = pos if tag == "_" else tag
|
||||||
tag = tag+'__'+morph if use_morphology else tag
|
tag = tag + "__" + morph if use_morphology else tag
|
||||||
ner = ner if ner else 'O'
|
ner = ner if ner else "O"
|
||||||
tokens.append((id_, word, tag, head, dep, ner))
|
tokens.append((id_, word, tag, head, dep, ner))
|
||||||
except:
|
except: # noqa: E722
|
||||||
print(line)
|
print(line)
|
||||||
raise
|
raise
|
||||||
tuples = [list(t) for t in zip(*tokens)]
|
tuples = [list(t) for t in zip(*tokens)]
|
||||||
|
@ -68,6 +57,7 @@ def read_conllx(input_path, use_morphology=False, n=0):
|
||||||
if n >= 1 and i >= n:
|
if n >= 1 and i >= n:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|
||||||
def generate_sentence(sent):
|
def generate_sentence(sent):
|
||||||
(id_, word, tag, head, dep, ner) = sent
|
(id_, word, tag, head, dep, ner) = sent
|
||||||
sentence = {}
|
sentence = {}
|
||||||
|
@ -85,7 +75,7 @@ def generate_sentence(sent):
|
||||||
return sentence
|
return sentence
|
||||||
|
|
||||||
|
|
||||||
def create_doc(sentences,id):
|
def create_doc(sentences, id):
|
||||||
doc = {}
|
doc = {}
|
||||||
paragraph = {}
|
paragraph = {}
|
||||||
doc["id"] = id
|
doc["id"] = id
|
||||||
|
|
|
@ -1,26 +1,24 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
from cytoolz import partition_all, concat
|
|
||||||
|
|
||||||
from .._messages import Messages
|
from cytoolz import partition_all
|
||||||
from ...compat import json_dumps, path2str
|
|
||||||
from ...util import prints
|
|
||||||
from ...gold import iob_to_biluo
|
from ...gold import iob_to_biluo
|
||||||
|
|
||||||
|
|
||||||
def iob2json(input_path, output_path, n_sents=10, *a, **k):
|
def iob2json(input_data, n_sents=10, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
Convert IOB files into JSON format for use with train cli.
|
Convert IOB files into JSON format for use with train cli.
|
||||||
"""
|
"""
|
||||||
with input_path.open('r', encoding='utf8') as file_:
|
docs = []
|
||||||
sentences = read_iob(file_)
|
for group in partition_all(n_sents, docs):
|
||||||
docs = merge_sentences(sentences, n_sents)
|
group = list(group)
|
||||||
output_filename = input_path.parts[-1].replace(".iob", ".json")
|
first = group.pop(0)
|
||||||
output_file = output_path / output_filename
|
to_extend = first["paragraphs"][0]["sentences"]
|
||||||
with output_file.open('w', encoding='utf-8') as f:
|
for sent in group[1:]:
|
||||||
f.write(json_dumps(docs))
|
to_extend.extend(sent["paragraphs"][0]["sentences"])
|
||||||
prints(Messages.M033.format(n_docs=len(docs)),
|
docs.append(first)
|
||||||
title=Messages.M032.format(name=path2str(output_file)))
|
return docs
|
||||||
|
|
||||||
|
|
||||||
def read_iob(raw_sents):
|
def read_iob(raw_sents):
|
||||||
|
@ -28,30 +26,20 @@ def read_iob(raw_sents):
|
||||||
for line in raw_sents:
|
for line in raw_sents:
|
||||||
if not line.strip():
|
if not line.strip():
|
||||||
continue
|
continue
|
||||||
tokens = [t.split('|') for t in line.split()]
|
tokens = [t.split("|") for t in line.split()]
|
||||||
if len(tokens[0]) == 3:
|
if len(tokens[0]) == 3:
|
||||||
words, pos, iob = zip(*tokens)
|
words, pos, iob = zip(*tokens)
|
||||||
else:
|
else:
|
||||||
words, iob = zip(*tokens)
|
words, iob = zip(*tokens)
|
||||||
pos = ['-'] * len(words)
|
pos = ["-"] * len(words)
|
||||||
biluo = iob_to_biluo(iob)
|
biluo = iob_to_biluo(iob)
|
||||||
sentences.append([
|
sentences.append(
|
||||||
{'orth': w, 'tag': p, 'ner': ent}
|
[
|
||||||
for (w, p, ent) in zip(words, pos, biluo)
|
{"orth": w, "tag": p, "ner": ent}
|
||||||
])
|
for (w, p, ent) in zip(words, pos, biluo)
|
||||||
sentences = [{'tokens': sent} for sent in sentences]
|
]
|
||||||
paragraphs = [{'sentences': [sent]} for sent in sentences]
|
)
|
||||||
docs = [{'id': 0, 'paragraphs': [para]} for para in paragraphs]
|
sentences = [{"tokens": sent} for sent in sentences]
|
||||||
|
paragraphs = [{"sentences": [sent]} for sent in sentences]
|
||||||
|
docs = [{"id": 0, "paragraphs": [para]} for para in paragraphs]
|
||||||
return docs
|
return docs
|
||||||
|
|
||||||
def merge_sentences(docs, n_sents):
|
|
||||||
counter = 0
|
|
||||||
merged = []
|
|
||||||
for group in partition_all(n_sents, docs):
|
|
||||||
group = list(group)
|
|
||||||
first = group.pop(0)
|
|
||||||
to_extend = first['paragraphs'][0]['sentences']
|
|
||||||
for sent in group[1:]:
|
|
||||||
to_extend.extend(sent['paragraphs'][0]['sentences'])
|
|
||||||
merged.append(first)
|
|
||||||
return merged
|
|
||||||
|
|
|
@@ -1,33 +1,21 @@
 # coding: utf8
 from __future__ import unicode_literals
-import ujson as json

+import ujson
+
+from ...util import get_lang_class
 from .._messages import Messages
-from ...compat import json_dumps, path2str
-from ...util import prints, get_lang_class
-from ...gold import docs_to_json


-def ner_jsonl2json(input_path, output_path, lang=None, n_sents=10, use_morphology=False):
+def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False):
     if lang is None:
-        prints(Messages.M054, exits=True)
+        raise ValueError(Messages.M054)
     json_docs = []
-    input_tuples = list(read_jsonl(input_path))
+    input_tuples = [ujson.loads(line) for line in input_data]
     nlp = get_lang_class(lang)()
     for i, (raw_text, ents) in enumerate(input_tuples):
         doc = nlp.make_doc(raw_text)
         doc[0].is_sent_start = True
-        doc.ents = [doc.char_span(s, e, label=L) for s, e, L in ents['entities']]
-        json_docs.append(docs_to_json(i, [doc]))
-
-    output_filename = input_path.parts[-1].replace(".jsonl", ".json")
-    output_loc = output_path / output_filename
-    with (output_loc).open('w', encoding='utf8') as file_:
-        file_.write(json_dumps(json_docs))
-    prints(Messages.M033.format(n_docs=len(json_docs)),
-           title=Messages.M032.format(name=path2str(output_loc)))
-
-
-def read_jsonl(input_path):
-    with input_path.open('r', encoding='utf8') as file_:
-        for line in file_:
-            yield json.loads(line)
+        doc.ents = [doc.char_span(s, e, label=L) for s, e, L in ents["entities"]]
+        json_docs.append(doc.to_json())
+    return json_docs

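The updated ner_jsonl2json converter above serializes each Doc with the new Doc.to_json() method mentioned in the commit message. A minimal sketch of that method as of this commit; the blank English pipeline and the sample sentence are assumptions for illustration only:

```python
# Sketch of Doc.to_json(), added in this commit (pipeline and text are examples).
import spacy

nlp = spacy.blank("en")
doc = nlp("Apple is looking at buying a U.K. startup.")
data = doc.to_json()        # dict in the unified JSON training format
print(list(data.keys()))    # inspect the top-level keys of the serialized Doc
```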
398
spacy/cli/debug_data.py
Normal file
398
spacy/cli/debug_data.py
Normal file
|
@ -0,0 +1,398 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from collections import Counter
|
||||||
|
import plac
|
||||||
|
import sys
|
||||||
|
from wasabi import Printer, MESSAGES
|
||||||
|
|
||||||
|
from ..gold import GoldCorpus, read_json_object
|
||||||
|
from ..util import load_model, get_lang_class, read_json, read_jsonl
|
||||||
|
|
||||||
|
# from .schemas import get_schema, validate_json
|
||||||
|
from ._messages import Messages
|
||||||
|
|
||||||
|
|
||||||
|
# Minimum number of expected occurences of label in data to train new label
|
||||||
|
NEW_LABEL_THRESHOLD = 50
|
||||||
|
# Minimum number of expected examples to train a blank model
|
||||||
|
BLANK_MODEL_MIN_THRESHOLD = 100
|
||||||
|
BLANK_MODEL_THRESHOLD = 2000
|
||||||
|
|
||||||
|
|
||||||
|
@plac.annotations(
|
||||||
|
lang=("model language", "positional", None, str),
|
||||||
|
train_path=("location of JSON-formatted training data", "positional", None, Path),
|
||||||
|
dev_path=("location of JSON-formatted development data", "positional", None, Path),
|
||||||
|
base_model=("name of model to update (optional)", "option", "b", str),
|
||||||
|
pipeline=(
|
||||||
|
"Comma-separated names of pipeline components to train",
|
||||||
|
"option",
|
||||||
|
"p",
|
||||||
|
str,
|
||||||
|
),
|
||||||
|
ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool),
|
||||||
|
ignore_validation=(
|
||||||
|
"Don't exit if JSON format validation fails",
|
||||||
|
"flag",
|
||||||
|
"IV",
|
||||||
|
bool,
|
||||||
|
),
|
||||||
|
verbose=("Print additional information and explanations", "flag", "V", bool),
|
||||||
|
no_format=("Don't pretty-print the results", "flag", "NF", bool),
|
||||||
|
)
|
||||||
|
def debug_data(
|
||||||
|
lang,
|
||||||
|
train_path,
|
||||||
|
dev_path,
|
||||||
|
base_model=None,
|
||||||
|
pipeline="tagger,parser,ner",
|
||||||
|
ignore_warnings=False,
|
||||||
|
ignore_validation=False,
|
||||||
|
verbose=False,
|
||||||
|
no_format=False,
|
||||||
|
):
|
||||||
|
msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings)
|
||||||
|
|
||||||
|
# Make sure all files and paths exists if they are needed
|
||||||
|
if not train_path.exists():
|
||||||
|
msg.fail(Messages.M050, train_path, exits=1)
|
||||||
|
if not dev_path.exists():
|
||||||
|
msg.fail(Messages.M051, dev_path, exits=1)
|
||||||
|
|
||||||
|
# Initialize the model and pipeline
|
||||||
|
pipeline = [p.strip() for p in pipeline.split(",")]
|
||||||
|
if base_model:
|
||||||
|
nlp = load_model(base_model)
|
||||||
|
else:
|
||||||
|
lang_cls = get_lang_class(lang)
|
||||||
|
nlp = lang_cls()
|
||||||
|
|
||||||
|
msg.divider("Data format validation")
|
||||||
|
# Load the data in one – might take a while but okay in this case
|
||||||
|
with msg.loading("Loading {}...".format(train_path.parts[-1])):
|
||||||
|
train_data = _load_file(train_path, msg)
|
||||||
|
with msg.loading("Loading {}...".format(dev_path.parts[-1])):
|
||||||
|
dev_data = _load_file(dev_path, msg)
|
||||||
|
|
||||||
|
# Validate data format using the JSON schema
|
||||||
|
# TODO: update once the new format is ready
|
||||||
|
# schema = get_schema("training")
|
||||||
|
train_data_errors = [] # TODO: validate_json(train_data, schema)
|
||||||
|
dev_data_errors = [] # TODO: validate_json(dev_data, schema)
|
||||||
|
if not train_data_errors:
|
||||||
|
msg.good("Training data JSON format is valid")
|
||||||
|
if not dev_data_errors:
|
||||||
|
msg.good("Development data JSON format is valid")
|
||||||
|
for error in train_data_errors:
|
||||||
|
msg.fail("Training data: {}".format(error))
|
||||||
|
for error in dev_data_errors:
|
||||||
|
msg.fail("Develoment data: {}".format(error))
|
||||||
|
if (train_data_errors or dev_data_errors) and not ignore_validation:
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
# Create the gold corpus to be able to better analyze data
|
||||||
|
with msg.loading("Analyzing corpus..."):
|
||||||
|
train_data = read_json_object(train_data)
|
||||||
|
dev_data = read_json_object(dev_data)
|
||||||
|
corpus = GoldCorpus(train_data, dev_data)
|
||||||
|
train_docs = list(corpus.train_docs(nlp))
|
||||||
|
dev_docs = list(corpus.dev_docs(nlp))
|
||||||
|
msg.good("Corpus is loadable")
|
||||||
|
|
||||||
|
# Create all gold data here to avoid iterating over the train_docs constantly
|
||||||
|
gold_data = _compile_gold(train_docs, pipeline)
|
||||||
|
train_texts = gold_data["texts"]
|
||||||
|
dev_texts = set([doc.text for doc, gold in dev_docs])
|
||||||
|
|
||||||
|
msg.divider("Training stats")
|
||||||
|
msg.text("Training pipeline: {}".format(", ".join(pipeline)))
|
||||||
|
for pipe in [p for p in pipeline if p not in nlp.factories]:
|
||||||
|
msg.fail("Pipeline component '{}' not available in factories".format(pipe))
|
||||||
|
if base_model:
|
||||||
|
msg.text("Starting with base model '{}'".format(base_model))
|
||||||
|
else:
|
||||||
|
msg.text("Starting with blank model '{}'".format(lang))
|
||||||
|
msg.text("{} training docs".format(len(train_docs)))
|
||||||
|
msg.text("{} evaluation docs".format(len(dev_docs)))
|
||||||
|
|
||||||
|
overlap = len(train_texts.intersection(dev_texts))
|
||||||
|
if overlap:
|
||||||
|
msg.warn("{} training examples also in evaluation data".format(overlap))
|
||||||
|
else:
|
||||||
|
msg.good("No overlap between training and evaluation data")
|
||||||
|
if not base_model and len(train_docs) < BLANK_MODEL_THRESHOLD:
|
||||||
|
text = "Low number of examples to train from a blank model ({})".format(
|
||||||
|
len(train_docs)
|
||||||
|
)
|
||||||
|
if len(train_docs) < BLANK_MODEL_MIN_THRESHOLD:
|
||||||
|
msg.fail(text)
|
||||||
|
else:
|
||||||
|
msg.warn(text)
|
||||||
|
msg.text(
|
||||||
|
"It's recommended to use at least {} examples (minimum {})".format(
|
||||||
|
BLANK_MODEL_THRESHOLD, BLANK_MODEL_MIN_THRESHOLD
|
||||||
|
),
|
||||||
|
show=verbose,
|
||||||
|
)
|
||||||
|
|
||||||
|
msg.divider("Vocab & Vectors")
|
||||||
|
n_words = gold_data["n_words"]
|
||||||
|
msg.info(
|
||||||
|
"{} total {} in the data ({} unique)".format(
|
||||||
|
n_words, "word" if n_words == 1 else "words", len(gold_data["words"])
|
||||||
|
)
|
||||||
|
)
|
||||||
|
most_common_words = gold_data["words"].most_common(10)
|
||||||
|
msg.text(
|
||||||
|
"10 most common words: {}".format(
|
||||||
|
_format_labels(most_common_words, counts=True)
|
||||||
|
),
|
||||||
|
show=verbose,
|
||||||
|
)
|
||||||
|
if len(nlp.vocab.vectors):
|
||||||
|
msg.info(
|
||||||
|
"{} vectors ({} unique keys, {} dimensions)".format(
|
||||||
|
len(nlp.vocab.vectors),
|
||||||
|
nlp.vocab.vectors.n_keys,
|
||||||
|
nlp.vocab.vectors_length,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
msg.info("No word vectors present in the model")
|
||||||
|
|
||||||
|
if "ner" in pipeline:
|
||||||
|
# Get all unique NER labels present in the data
|
||||||
|
labels = set(label for label in gold_data["ner"] if label not in ("O", "-"))
|
||||||
|
label_counts = gold_data["ner"]
|
||||||
|
model_labels = _get_labels_from_model(nlp, "ner")
|
||||||
|
new_labels = [l for l in labels if l not in model_labels]
|
||||||
|
existing_labels = [l for l in labels if l in model_labels]
|
||||||
|
has_low_data_warning = False
|
||||||
|
has_no_neg_warning = False
|
||||||
|
|
||||||
|
msg.divider("Named Entity Recognition")
|
||||||
|
msg.info(
|
||||||
|
"{} new {}, {} existing {}".format(
|
||||||
|
len(new_labels),
|
||||||
|
"label" if len(new_labels) == 1 else "labels",
|
||||||
|
len(existing_labels),
|
||||||
|
"label" if len(existing_labels) == 1 else "labels",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
missing_values = label_counts["-"]
|
||||||
|
msg.text(
|
||||||
|
"{} missing {} (tokens with '-' label)".format(
|
||||||
|
missing_values, "value" if missing_values == 1 else "values"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
if new_labels:
|
||||||
|
labels_with_counts = [
|
||||||
|
(label, count)
|
||||||
|
for label, count in label_counts.most_common()
|
||||||
|
if label != "-"
|
||||||
|
]
|
||||||
|
labels_with_counts = _format_labels(labels_with_counts, counts=True)
|
||||||
|
msg.text("New: {}".format(labels_with_counts), show=verbose)
|
||||||
|
if existing_labels:
|
||||||
|
msg.text(
|
||||||
|
"Existing: {}".format(_format_labels(existing_labels)), show=verbose
|
||||||
|
)
|
||||||
|
|
||||||
|
for label in new_labels:
|
||||||
|
if label_counts[label] <= NEW_LABEL_THRESHOLD:
|
||||||
|
msg.warn(
|
||||||
|
"Low number of examples for new label '{}' ({})".format(
|
||||||
|
label, label_counts[label]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
has_low_data_warning = True
|
||||||
|
|
||||||
|
with msg.loading("Analyzing label distribution..."):
|
||||||
|
neg_docs = _get_examples_without_label(train_docs, label)
|
||||||
|
if neg_docs == 0:
|
||||||
|
msg.warn(
|
||||||
|
"No examples for texts WITHOUT new label '{}'".format(label)
|
||||||
|
)
|
||||||
|
has_no_neg_warning = True
|
||||||
|
|
||||||
|
if not has_low_data_warning:
|
||||||
|
msg.good("Good amount of examples for all labels")
|
||||||
|
if not has_no_neg_warning:
|
||||||
|
msg.good("Examples without occurences available for all labels")
|
||||||
|
|
||||||
|
if has_low_data_warning:
|
||||||
|
msg.text(
|
||||||
|
"To train a new entity type, your data should include at "
|
||||||
|
"least {} insteances of the new label".format(NEW_LABEL_THRESHOLD),
|
||||||
|
show=verbose,
|
||||||
|
)
|
||||||
|
if has_no_neg_warning:
|
||||||
|
msg.text(
|
||||||
|
"Training data should always include examples of entities "
|
||||||
|
"in context, as well as examples without a given entity "
|
||||||
|
"type.",
|
||||||
|
show=verbose,
|
||||||
|
)
|
||||||
|
|
||||||
|
if "textcat" in pipeline:
|
||||||
|
msg.divider("Text Classification")
|
||||||
|
labels = [label for label in gold_data["textcat"]]
|
||||||
|
model_labels = _get_labels_from_model(nlp, "textcat")
|
||||||
|
new_labels = [l for l in labels if l not in model_labels]
|
||||||
|
existing_labels = [l for l in labels if l in model_labels]
|
||||||
|
msg.info(
|
||||||
|
"Text Classification: {} new label(s), {} existing label(s)".format(
|
||||||
|
len(new_labels), len(existing_labels)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
if new_labels:
|
||||||
|
labels_with_counts = _format_labels(
|
||||||
|
gold_data["textcat"].most_common(), counts=True
|
||||||
|
)
|
||||||
|
msg.text("New: {}".format(labels_with_counts), show=verbose)
|
||||||
|
if existing_labels:
|
||||||
|
msg.text(
|
||||||
|
"Existing: {}".format(_format_labels(existing_labels)), show=verbose
|
||||||
|
)
|
||||||
|
|
||||||
|
if "tagger" in pipeline:
|
||||||
|
msg.divider("Part-of-speech Tagging")
|
||||||
|
labels = [label for label in gold_data["tags"]]
|
||||||
|
tag_map = nlp.Defaults.tag_map
|
||||||
|
msg.info(
|
||||||
|
"{} {} in data ({} {} in tag map)".format(
|
||||||
|
len(labels),
|
||||||
|
"label" if len(labels) == 1 else "labels",
|
||||||
|
len(tag_map),
|
||||||
|
"label" if len(tag_map) == 1 else "labels",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
labels_with_counts = _format_labels(
|
||||||
|
gold_data["tags"].most_common(), counts=True
|
||||||
|
)
|
||||||
|
msg.text(labels_with_counts, show=verbose)
|
||||||
|
non_tagmap = [l for l in labels if l not in tag_map]
|
||||||
|
if not non_tagmap:
|
||||||
|
msg.good("All labels present in tag map for language '{}'".format(nlp.lang))
|
||||||
|
for label in non_tagmap:
|
||||||
|
msg.fail(
|
||||||
|
"Label '{}' not found in tag map for language '{}'".format(
|
||||||
|
label, nlp.lang
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
if "parser" in pipeline:
|
||||||
|
msg.divider("Dependency Parsing")
|
||||||
|
labels = [label for label in gold_data["deps"]]
|
||||||
|
msg.info(
|
||||||
|
"{} {} in data".format(
|
||||||
|
len(labels), "label" if len(labels) == 1 else "labels"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
labels_with_counts = _format_labels(
|
||||||
|
gold_data["deps"].most_common(), counts=True
|
||||||
|
)
|
||||||
|
msg.text(labels_with_counts, show=verbose)
|
||||||
|
|
||||||
|
msg.divider("Summary")
|
||||||
|
good_counts = msg.counts[MESSAGES.GOOD]
|
||||||
|
warn_counts = msg.counts[MESSAGES.WARN]
|
||||||
|
fail_counts = msg.counts[MESSAGES.FAIL]
|
||||||
|
if good_counts:
|
||||||
|
msg.good(
|
||||||
|
"{} {} passed".format(
|
||||||
|
good_counts, "check" if good_counts == 1 else "checks"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
if warn_counts:
|
||||||
|
msg.warn(
|
||||||
|
"{} {}".format(warn_counts, "warning" if warn_counts == 1 else "warnings")
|
||||||
|
)
|
||||||
|
if fail_counts:
|
||||||
|
msg.fail("{} {}".format(fail_counts, "error" if fail_counts == 1 else "errors"))
|
||||||
|
|
||||||
|
if fail_counts:
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
def _load_file(file_path, msg):
|
||||||
|
file_name = file_path.parts[-1]
|
||||||
|
if file_path.suffix == ".json":
|
||||||
|
data = read_json(file_path)
|
||||||
|
msg.good("Loaded {}".format(file_name))
|
||||||
|
return data
|
||||||
|
elif file_path.suffix == ".jsonl":
|
||||||
|
data = read_jsonl(file_path)
|
||||||
|
msg.good("Loaded {}".format(file_name))
|
||||||
|
return data
|
||||||
|
msg.fail(
|
||||||
|
"Can't load file extension {}".format(file_path.suffix),
|
||||||
|
"Expected .json or .jsonl",
|
||||||
|
exits=1,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _compile_gold(train_docs, pipeline):
|
||||||
|
data = {
|
||||||
|
"ner": Counter(),
|
||||||
|
"cats": Counter(),
|
||||||
|
"tags": Counter(),
|
||||||
|
"deps": Counter(),
|
||||||
|
"words": Counter(),
|
||||||
|
"n_words": 0,
|
||||||
|
"texts": set(),
|
||||||
|
}
|
||||||
|
for doc, gold in train_docs:
|
||||||
|
data["words"].update(gold.words)
|
||||||
|
data["n_words"] += len(gold.words)
|
||||||
|
data["texts"].add(doc.text)
|
||||||
|
if "ner" in pipeline:
|
||||||
|
for label in gold.ner:
|
||||||
|
if label.startswith(("B-", "U-")):
|
||||||
|
combined_label = label.split("-")[1]
|
||||||
|
data["ner"][combined_label] += 1
|
||||||
|
elif label == "-":
|
||||||
|
data["ner"]["-"] += 1
|
||||||
|
if "textcat" in pipeline:
|
||||||
|
data["cats"].update(gold.cats)
|
||||||
|
if "tagger" in pipeline:
|
||||||
|
data["tags"].update(gold.tags)
|
||||||
|
if "parser" in pipeline:
|
||||||
|
data["deps"].update(gold.labels)
|
||||||
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
def _format_labels(labels, counts=False):
|
||||||
|
if counts:
|
||||||
|
return ", ".join(["'{}' ({})".format(l, c) for l, c in labels])
|
||||||
|
return ", ".join(["'{}'".format(l) for l in labels])
|
||||||
|
|
||||||
|
|
||||||
|
def _get_ner_counts(data):
|
||||||
|
counter = Counter()
|
||||||
|
for doc, gold in data:
|
||||||
|
for label in gold.ner:
|
||||||
|
if label.startswith(("B-", "U-")):
|
||||||
|
combined_label = label.split("-")[1]
|
||||||
|
counter[combined_label] += 1
|
||||||
|
elif label == "-":
|
||||||
|
counter["-"] += 1
|
||||||
|
return counter
|
||||||
|
|
||||||
|
|
||||||
|
def _get_examples_without_label(data, label):
|
||||||
|
count = 0
|
||||||
|
for doc, gold in data:
|
||||||
|
labels = [label.split("-")[1] for label in gold.ner if label not in ("O", "-")]
|
||||||
|
if label not in labels:
|
||||||
|
count += 1
|
||||||
|
return count
|
||||||
|
|
||||||
|
|
||||||
|
def _get_labels_from_model(nlp, pipe_name):
|
||||||
|
if pipe_name not in nlp.pipe_names:
|
||||||
|
return set()
|
||||||
|
pipe = nlp.get_pipe(pipe_name)
|
||||||
|
return pipe.labels
|
|
@ -6,34 +6,37 @@ import requests
|
||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
|
from wasabi import Printer
|
||||||
|
|
||||||
from ._messages import Messages
|
from ._messages import Messages
|
||||||
from .link import link
|
from .link import link
|
||||||
from ..util import prints, get_package_path
|
from ..util import get_package_path
|
||||||
from .. import about
|
from .. import about
|
||||||
|
|
||||||
|
|
||||||
|
msg = Printer()
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
model=("model to download, shortcut or name", "positional", None, str),
|
model=("Model to download (shortcut or name)", "positional", None, str),
|
||||||
direct=("force direct download. Needs model name with version and won't "
|
direct=("Force direct download of name + version", "flag", "d", bool),
|
||||||
"perform compatibility check", "flag", "d", bool),
|
pip_args=("additional arguments to be passed to `pip install` on model install"),
|
||||||
pip_args=("additional arguments to be passed to `pip install` when "
|
)
|
||||||
"installing the model"))
|
|
||||||
def download(model, direct=False, *pip_args):
|
def download(model, direct=False, *pip_args):
|
||||||
"""
|
"""
|
||||||
Download compatible model from default download path using pip. Model
|
Download compatible model from default download path using pip. Model
|
||||||
can be shortcut, model name or, if --direct flag is set, full model name
|
can be shortcut, model name or, if --direct flag is set, full model name
|
||||||
with version.
|
with version. For direct downloads, the compatibility check will be skipped.
|
||||||
"""
|
"""
|
||||||
if direct:
|
if direct:
|
||||||
dl = download_model('{m}/{m}.tar.gz#egg={m}'.format(m=model), pip_args)
|
dl = download_model("{m}/{m}.tar.gz#egg={m}".format(m=model), pip_args)
|
||||||
else:
|
else:
|
||||||
shortcuts = get_json(about.__shortcuts__, "available shortcuts")
|
shortcuts = get_json(about.__shortcuts__, "available shortcuts")
|
||||||
model_name = shortcuts.get(model, model)
|
model_name = shortcuts.get(model, model)
|
||||||
compatibility = get_compatibility()
|
compatibility = get_compatibility()
|
||||||
version = get_version(model_name, compatibility)
|
version = get_version(model_name, compatibility)
|
||||||
dl = download_model('{m}-{v}/{m}-{v}.tar.gz#egg={m}=={v}'
|
dl_tpl = "{m}-{v}/{m}-{v}.tar.gz#egg={m}=={v}"
|
||||||
.format(m=model_name, v=version), pip_args)
|
dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args)
|
||||||
if dl != 0: # if download subprocess doesn't return 0, exit
|
if dl != 0: # if download subprocess doesn't return 0, exit
|
||||||
sys.exit(dl)
|
sys.exit(dl)
|
||||||
try:
|
try:
|
||||||
|
@ -43,44 +46,49 @@ def download(model, direct=False, *pip_args):
|
||||||
# subprocess
|
# subprocess
|
||||||
package_path = get_package_path(model_name)
|
package_path = get_package_path(model_name)
|
||||||
link(model_name, model, force=True, model_path=package_path)
|
link(model_name, model, force=True, model_path=package_path)
|
||||||
except:
|
except: # noqa: E722
|
||||||
# Dirty, but since spacy.download and the auto-linking is
|
# Dirty, but since spacy.download and the auto-linking is
|
||||||
# mostly a convenience wrapper, it's best to show a success
|
# mostly a convenience wrapper, it's best to show a success
|
||||||
# message and loading instructions, even if linking fails.
|
# message and loading instructions, even if linking fails.
|
||||||
prints(Messages.M001, title=Messages.M002.format(name=model_name))
|
msg.warn(Messages.M002.format(name=model_name), Messages.M001)
|
||||||
|
|
||||||
|
|
||||||
def get_json(url, desc):
|
def get_json(url, desc):
|
||||||
r = requests.get(url)
|
r = requests.get(url)
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
prints(Messages.M004.format(desc=desc, version=about.__version__),
|
msg.fail(
|
||||||
title=Messages.M003.format(code=r.status_code), exits=1)
|
Messages.M003.format(code=r.status_code),
|
||||||
|
Messages.M004.format(desc=desc, version=about.__version__),
|
||||||
|
exits=1,
|
||||||
|
)
|
||||||
return r.json()
|
return r.json()
|
||||||
|
|
||||||
|
|
||||||
def get_compatibility():
|
def get_compatibility():
|
||||||
version = about.__version__
|
version = about.__version__
|
||||||
version = version.rsplit('.dev', 1)[0]
|
version = version.rsplit(".dev", 1)[0]
|
||||||
comp_table = get_json(about.__compatibility__, "compatibility table")
|
comp_table = get_json(about.__compatibility__, "compatibility table")
|
||||||
comp = comp_table['spacy']
|
comp = comp_table["spacy"]
|
||||||
if version not in comp:
|
if version not in comp:
|
||||||
prints(Messages.M006.format(version=version), title=Messages.M005,
|
msg.fail(Messages.M005, Messages.M006.format(version=version), exits=1)
|
||||||
exits=1)
|
|
||||||
return comp[version]
|
return comp[version]
|
||||||
|
|
||||||
|
|
||||||
def get_version(model, comp):
|
def get_version(model, comp):
|
||||||
model = model.rsplit('.dev', 1)[0]
|
model = model.rsplit(".dev", 1)[0]
|
||||||
if model not in comp:
|
if model not in comp:
|
||||||
prints(Messages.M007.format(name=model, version=about.__version__),
|
msg.fail(
|
||||||
title=Messages.M005, exits=1)
|
Messages.M005,
|
||||||
|
Messages.M007.format(name=model, version=about.__version__),
|
||||||
|
exits=1,
|
||||||
|
)
|
||||||
return comp[model][0]
|
return comp[model][0]
|
||||||
|
|
||||||
|
|
||||||
def download_model(filename, user_pip_args=None):
|
def download_model(filename, user_pip_args=None):
|
||||||
download_url = about.__download_url__ + '/' + filename
|
download_url = about.__download_url__ + "/" + filename
|
||||||
pip_args = ['--no-cache-dir', '--no-deps']
|
pip_args = ["--no-cache-dir", "--no-deps"]
|
||||||
if user_pip_args:
|
if user_pip_args:
|
||||||
pip_args.extend(user_pip_args)
|
pip_args.extend(user_pip_args)
|
||||||
cmd = [sys.executable, '-m', 'pip', 'install'] + pip_args + [download_url]
|
cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url]
|
||||||
return subprocess.call(cmd, env=os.environ.copy())
|
return subprocess.call(cmd, env=os.environ.copy())
|
||||||
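For orientation, here is a minimal sketch of what `download_model` assembles before shelling out to pip; the release URL and filename are illustrative assumptions, not values taken from `about.py`.

```python
# Illustrative reconstruction of the pip invocation built above.
import sys

download_url = (
    "https://github.com/explosion/spacy-models/releases/download/"  # assumed base URL
    "en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz#egg=en_core_web_sm==2.0.0"
)
pip_args = ["--no-cache-dir", "--no-deps"]
cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url]
# subprocess.call(cmd, env=os.environ.copy()) then runs pip in the active environment
```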
|
|
|
@@ -3,30 +3,35 @@ from __future__ import unicode_literals, division, print_function
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
from timeit import default_timer as timer
|
from timeit import default_timer as timer
|
||||||
|
from wasabi import Printer
|
||||||
|
|
||||||
from ._messages import Messages
|
from ._messages import Messages
|
||||||
from ..gold import GoldCorpus
|
from ..gold import GoldCorpus
|
||||||
from ..util import prints
|
|
||||||
from .. import util
|
from .. import util
|
||||||
from .. import displacy
|
from .. import displacy
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
model=("model name or path", "positional", None, str),
|
model=("Model name or path", "positional", None, str),
|
||||||
data_path=("location of JSON-formatted evaluation data", "positional",
|
data_path=("Location of JSON-formatted evaluation data", "positional", None, str),
|
||||||
None, str),
|
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
||||||
gold_preproc=("use gold preprocessing", "flag", "G", bool),
|
gpu_id=("Use GPU", "option", "g", int),
|
||||||
gpu_id=("use GPU", "option", "g", int),
|
displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str),
|
||||||
displacy_path=("directory to output rendered parses as HTML", "option",
|
displacy_limit=("Limit of parses to render as HTML", "option", "dl", int),
|
||||||
"dp", str),
|
)
|
||||||
displacy_limit=("limit of parses to render as HTML", "option", "dl", int))
|
def evaluate(
|
||||||
def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None,
|
model,
|
||||||
displacy_limit=25):
|
data_path,
|
||||||
|
gpu_id=-1,
|
||||||
|
gold_preproc=False,
|
||||||
|
displacy_path=None,
|
||||||
|
displacy_limit=25,
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
Evaluate a model. To render a sample of parses in an HTML file, set an
|
Evaluate a model. To render a sample of parses in an HTML file, set an
|
||||||
output directory as the displacy_path argument.
|
output directory as the displacy_path argument.
|
||||||
"""
|
"""
|
||||||
|
msg = Printer()
|
||||||
util.fix_random_seed()
|
util.fix_random_seed()
|
||||||
if gpu_id >= 0:
|
if gpu_id >= 0:
|
||||||
util.use_gpu(gpu_id)
|
util.use_gpu(gpu_id)
|
||||||
|
@@ -34,9 +39,9 @@ def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None
|
||||||
data_path = util.ensure_path(data_path)
|
data_path = util.ensure_path(data_path)
|
||||||
displacy_path = util.ensure_path(displacy_path)
|
displacy_path = util.ensure_path(displacy_path)
|
||||||
if not data_path.exists():
|
if not data_path.exists():
|
||||||
prints(data_path, title=Messages.M034, exits=1)
|
msg.fail(Messages.M034, data_path, exits=1)
|
||||||
if displacy_path and not displacy_path.exists():
|
if displacy_path and not displacy_path.exists():
|
||||||
prints(displacy_path, title=Messages.M035, exits=1)
|
msg.fail(Messages.M035, displacy_path, exits=1)
|
||||||
corpus = GoldCorpus(data_path, data_path)
|
corpus = GoldCorpus(data_path, data_path)
|
||||||
nlp = util.load_model(model)
|
nlp = util.load_model(model)
|
||||||
dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
|
dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
|
||||||
|
@@ -44,65 +49,80 @@ def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None
|
||||||
scorer = nlp.evaluate(dev_docs, verbose=False)
|
scorer = nlp.evaluate(dev_docs, verbose=False)
|
||||||
end = timer()
|
end = timer()
|
||||||
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
|
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
|
||||||
print_results(scorer, time=end - begin, words=nwords,
|
results = {
|
||||||
wps=nwords / (end - begin))
|
"Time": "%.2f s" % end - begin,
|
||||||
|
"Words": nwords,
|
||||||
|
"Words/s": "%.0f" % nwords / (end - begin),
|
||||||
|
"TOK": "%.2f" % scorer.token_acc,
|
||||||
|
"POS": "%.2f" % scorer.tags_acc,
|
||||||
|
"UAS": "%.2f" % scorer.uas,
|
||||||
|
"LAS": "%.2f" % scorer.las,
|
||||||
|
"NER P": "%.2f" % scorer.ents_p,
|
||||||
|
"NER R": "%.2f" % scorer.ents_r,
|
||||||
|
"NER F": "%.2f" % scorer.ents_f,
|
||||||
|
}
|
||||||
|
msg.table(results, title="Results")
|
||||||
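The `Time` and `Words/s` entries need explicit parentheses because `%` binds tighter than `-` and `/`; a tiny self-contained check (values invented):

```python
# Demonstrates why the parentheses in the results dict matter.
end, begin, nwords = 12.5, 2.5, 30000
assert "%.2f s" % (end - begin) == "10.00 s"
assert "%.0f" % (nwords / (end - begin)) == "3000"
# Without them, ("%.2f s" % end) - begin would raise a TypeError.
```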
|
|
||||||
if displacy_path:
|
if displacy_path:
|
||||||
docs, golds = zip(*dev_docs)
|
docs, golds = zip(*dev_docs)
|
||||||
render_deps = 'parser' in nlp.meta.get('pipeline', [])
|
render_deps = "parser" in nlp.meta.get("pipeline", [])
|
||||||
render_ents = 'ner' in nlp.meta.get('pipeline', [])
|
render_ents = "ner" in nlp.meta.get("pipeline", [])
|
||||||
render_parses(docs, displacy_path, model_name=model,
|
render_parses(
|
||||||
limit=displacy_limit, deps=render_deps, ents=render_ents)
|
docs,
|
||||||
prints(displacy_path, title=Messages.M036.format(n=displacy_limit))
|
displacy_path,
|
||||||
|
model_name=model,
|
||||||
|
limit=displacy_limit,
|
||||||
|
deps=render_deps,
|
||||||
|
ents=render_ents,
|
||||||
|
)
|
||||||
|
msg.good(Messages.M036.format(n=displacy_limit), displacy_path)
|
||||||
|
|
||||||
|
|
||||||
def render_parses(docs, output_path, model_name='', limit=250, deps=True,
|
def render_parses(docs, output_path, model_name="", limit=250, deps=True, ents=True):
|
||||||
ents=True):
|
docs[0].user_data["title"] = model_name
|
||||||
docs[0].user_data['title'] = model_name
|
|
||||||
if ents:
|
if ents:
|
||||||
with (output_path / 'entities.html').open('w') as file_:
|
with (output_path / "entities.html").open("w") as file_:
|
||||||
html = displacy.render(docs[:limit], style='ent', page=True)
|
html = displacy.render(docs[:limit], style="ent", page=True)
|
||||||
file_.write(html)
|
file_.write(html)
|
||||||
if deps:
|
if deps:
|
||||||
with (output_path / 'parses.html').open('w') as file_:
|
with (output_path / "parses.html").open("w") as file_:
|
||||||
html = displacy.render(docs[:limit], style='dep', page=True,
|
html = displacy.render(
|
||||||
options={'compact': True})
|
docs[:limit], style="dep", page=True, options={"compact": True}
|
||||||
|
)
|
||||||
file_.write(html)
|
file_.write(html)
|
||||||
|
|
||||||
|
|
||||||
def print_progress(itn, losses, dev_scores, wps=0.0):
|
def print_progress(itn, losses, dev_scores, wps=0.0):
|
||||||
scores = {}
|
scores = {}
|
||||||
for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
|
for col in [
|
||||||
'ents_p', 'ents_r', 'ents_f', 'wps']:
|
"dep_loss",
|
||||||
|
"tag_loss",
|
||||||
|
"uas",
|
||||||
|
"tags_acc",
|
||||||
|
"token_acc",
|
||||||
|
"ents_p",
|
||||||
|
"ents_r",
|
||||||
|
"ents_f",
|
||||||
|
"wps",
|
||||||
|
]:
|
||||||
scores[col] = 0.0
|
scores[col] = 0.0
|
||||||
scores['dep_loss'] = losses.get('parser', 0.0)
|
scores["dep_loss"] = losses.get("parser", 0.0)
|
||||||
scores['ner_loss'] = losses.get('ner', 0.0)
|
scores["ner_loss"] = losses.get("ner", 0.0)
|
||||||
scores['tag_loss'] = losses.get('tagger', 0.0)
|
scores["tag_loss"] = losses.get("tagger", 0.0)
|
||||||
scores.update(dev_scores)
|
scores.update(dev_scores)
|
||||||
scores['wps'] = wps
|
scores["wps"] = wps
|
||||||
tpl = '\t'.join((
|
tpl = "\t".join(
|
||||||
'{:d}',
|
(
|
||||||
'{dep_loss:.3f}',
|
"{:d}",
|
||||||
'{ner_loss:.3f}',
|
"{dep_loss:.3f}",
|
||||||
'{uas:.3f}',
|
"{ner_loss:.3f}",
|
||||||
'{ents_p:.3f}',
|
"{uas:.3f}",
|
||||||
'{ents_r:.3f}',
|
"{ents_p:.3f}",
|
||||||
'{ents_f:.3f}',
|
"{ents_r:.3f}",
|
||||||
'{tags_acc:.3f}',
|
"{ents_f:.3f}",
|
||||||
'{token_acc:.3f}',
|
"{tags_acc:.3f}",
|
||||||
'{wps:.1f}'))
|
"{token_acc:.3f}",
|
||||||
|
"{wps:.1f}",
|
||||||
|
)
|
||||||
|
)
|
||||||
print(tpl.format(itn, **scores))
|
print(tpl.format(itn, **scores))
|
||||||
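For reference, the tab-separated template above produces one training log line per iteration; the numbers below are made up purely to show the shape of the output.

```python
# Minimal sketch of the progress line format (values are invented).
tpl = "\t".join(
    ("{:d}", "{dep_loss:.3f}", "{ner_loss:.3f}", "{uas:.3f}", "{ents_p:.3f}",
     "{ents_r:.3f}", "{ents_f:.3f}", "{tags_acc:.3f}", "{token_acc:.3f}", "{wps:.1f}")
)
scores = dict(dep_loss=1.234, ner_loss=0.567, uas=91.2, ents_p=85.0, ents_r=84.0,
              ents_f=84.5, tags_acc=96.1, token_acc=99.8, wps=15000.0)
print(tpl.format(3, **scores))  # -> "3\t1.234\t0.567\t91.200\t..."
```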
|
|
||||||
|
|
||||||
def print_results(scorer, time, words, wps):
|
|
||||||
results = {
|
|
||||||
'Time': '%.2f s' % time,
|
|
||||||
'Words': words,
|
|
||||||
'Words/s': '%.0f' % wps,
|
|
||||||
'TOK': '%.2f' % scorer.token_acc,
|
|
||||||
'POS': '%.2f' % scorer.tags_acc,
|
|
||||||
'UAS': '%.2f' % scorer.uas,
|
|
||||||
'LAS': '%.2f' % scorer.las,
|
|
||||||
'NER P': '%.2f' % scorer.ents_p,
|
|
||||||
'NER R': '%.2f' % scorer.ents_r,
|
|
||||||
'NER F': '%.2f' % scorer.ents_f}
|
|
||||||
util.print_table(results, title="Results")
|
|
||||||
|
|
|
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
|
||||||
import plac
|
import plac
|
||||||
import platform
|
import platform
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from wasabi import Printer
|
||||||
|
|
||||||
from ._messages import Messages
|
from ._messages import Messages
|
||||||
from ..compat import path2str
|
from ..compat import path2str
|
||||||
|
@@ -12,56 +13,65 @@ from .. import about
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
model=("optional: shortcut link of model", "positional", None, str),
|
model=("Optional shortcut link of model", "positional", None, str),
|
||||||
markdown=("generate Markdown for GitHub issues", "flag", "md", str),
|
markdown=("Generate Markdown for GitHub issues", "flag", "md", str),
|
||||||
silent=("don't print anything (just return)", "flag", "s"))
|
silent=("Don't print anything (just return)", "flag", "s"),
|
||||||
|
)
|
||||||
def info(model=None, markdown=False, silent=False):
|
def info(model=None, markdown=False, silent=False):
|
||||||
"""Print info about spaCy installation. If a model shortcut link is
|
"""
|
||||||
|
Print info about spaCy installation. If a model shortcut link is
|
||||||
specified as an argument, print model information. Flag --markdown
|
specified as an argument, print model information. Flag --markdown
|
||||||
prints details in Markdown for easy copy-pasting to GitHub issues.
|
prints details in Markdown for easy copy-pasting to GitHub issues.
|
||||||
"""
|
"""
|
||||||
|
msg = Printer()
|
||||||
if model:
|
if model:
|
||||||
if util.is_package(model):
|
if util.is_package(model):
|
||||||
model_path = util.get_package_path(model)
|
model_path = util.get_package_path(model)
|
||||||
else:
|
else:
|
||||||
model_path = util.get_data_path() / model
|
model_path = util.get_data_path() / model
|
||||||
meta_path = model_path / 'meta.json'
|
meta_path = model_path / "meta.json"
|
||||||
if not meta_path.is_file():
|
if not meta_path.is_file():
|
||||||
util.prints(meta_path, title=Messages.M020, exits=1)
|
msg.fail(Messages.M020, meta_path, exits=1)
|
||||||
meta = util.read_json(meta_path)
|
meta = util.read_json(meta_path)
|
||||||
if model_path.resolve() != model_path:
|
if model_path.resolve() != model_path:
|
||||||
meta['link'] = path2str(model_path)
|
meta["link"] = path2str(model_path)
|
||||||
meta['source'] = path2str(model_path.resolve())
|
meta["source"] = path2str(model_path.resolve())
|
||||||
else:
|
else:
|
||||||
meta['source'] = path2str(model_path)
|
meta["source"] = path2str(model_path)
|
||||||
if not silent:
|
if not silent:
|
||||||
print_info(meta, 'model %s' % model, markdown)
|
title = "Info about model '{}'".format(model)
|
||||||
|
model_meta = {
|
||||||
|
k: v for k, v in meta.items() if k not in ("accuracy", "speed")
|
||||||
|
}
|
||||||
|
if markdown:
|
||||||
|
util.print_markdown(model_meta, title=title)
|
||||||
|
else:
|
||||||
|
msg.table(model_meta, title=title)
|
||||||
return meta
|
return meta
|
||||||
data = {'spaCy version': about.__version__,
|
data = {
|
||||||
'Location': path2str(Path(__file__).parent.parent),
|
"spaCy version": about.__version__,
|
||||||
'Platform': platform.platform(),
|
"Location": path2str(Path(__file__).parent.parent),
|
||||||
'Python version': platform.python_version(),
|
"Platform": platform.platform(),
|
||||||
'Models': list_models()}
|
"Python version": platform.python_version(),
|
||||||
|
"Models": list_models(),
|
||||||
|
}
|
||||||
if not silent:
|
if not silent:
|
||||||
print_info(data, 'spaCy', markdown)
|
title = "Info about spaCy"
|
||||||
|
if markdown:
|
||||||
|
util.print_markdown(data, title=title)
|
||||||
|
else:
|
||||||
|
msg.table(data, title=title)
|
||||||
return data
|
return data
|
||||||
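A hedged usage sketch of the reworked `info` command; the shortcut link name is a placeholder and this assumes `spacy.cli` exposes `info` as before.

```python
# Sketch: print the environment table and reuse the returned dict.
from spacy.cli import info

data = info(markdown=False)          # prints "Info about spaCy" as a table
print(data["spaCy version"])

meta = info("en", silent=True)       # model info for a linked model, no printing
print(meta.get("pipeline", []))
```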
|
|
||||||
|
|
||||||
def print_info(data, title, markdown):
|
|
||||||
title = 'Info about %s' % title
|
|
||||||
if markdown:
|
|
||||||
util.print_markdown(data, title=title)
|
|
||||||
else:
|
|
||||||
util.print_table(data, title=title)
|
|
||||||
|
|
||||||
|
|
||||||
def list_models():
|
def list_models():
|
||||||
def exclude_dir(dir_name):
|
def exclude_dir(dir_name):
|
||||||
# exclude common cache directories and hidden directories
|
# exclude common cache directories and hidden directories
|
||||||
exclude = ['cache', 'pycache', '__pycache__']
|
exclude = ("cache", "pycache", "__pycache__")
|
||||||
return dir_name in exclude or dir_name.startswith('.')
|
return dir_name in exclude or dir_name.startswith(".")
|
||||||
|
|
||||||
data_path = util.get_data_path()
|
data_path = util.get_data_path()
|
||||||
if data_path:
|
if data_path:
|
||||||
models = [f.parts[-1] for f in data_path.iterdir() if f.is_dir()]
|
models = [f.parts[-1] for f in data_path.iterdir() if f.is_dir()]
|
||||||
return ', '.join([m for m in models if not exclude_dir(m)])
|
return ", ".join([m for m in models if not exclude_dir(m)])
|
||||||
return '-'
|
return "-"
|
||||||
|
|
|
@@ -11,13 +11,12 @@ from preshed.counter import PreshCounter
|
||||||
import tarfile
|
import tarfile
|
||||||
import gzip
|
import gzip
|
||||||
import zipfile
|
import zipfile
|
||||||
import ujson as json
|
from wasabi import Printer
|
||||||
from spacy.lexeme import intify_attrs
|
|
||||||
|
|
||||||
from ._messages import Messages
|
from ._messages import Messages
|
||||||
from ..vectors import Vectors
|
from ..vectors import Vectors
|
||||||
from ..errors import Errors, Warnings, user_warning
|
from ..errors import Errors, Warnings, user_warning
|
||||||
from ..util import prints, ensure_path, get_lang_class
|
from ..util import ensure_path, get_lang_class, read_jsonl
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import ftfy
|
import ftfy
|
||||||
|
@@ -25,121 +24,133 @@ except ImportError:
|
||||||
ftfy = None
|
ftfy = None
|
||||||
|
|
||||||
|
|
||||||
|
msg = Printer()
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
lang=("model language", "positional", None, str),
|
lang=("Model language", "positional", None, str),
|
||||||
output_dir=("model output directory", "positional", None, Path),
|
output_dir=("Model output directory", "positional", None, Path),
|
||||||
freqs_loc=("location of words frequencies file", "option", "f", Path),
|
freqs_loc=("Location of words frequencies file", "option", "f", Path),
|
||||||
jsonl_loc=("location of JSONL-formatted attributes file", "option", "j", Path),
|
jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path),
|
||||||
clusters_loc=("optional: location of brown clusters data",
|
clusters_loc=("Optional location of brown clusters data", "option", "c", str),
|
||||||
"option", "c", str),
|
vectors_loc=("Optional vectors file in Word2Vec format" "option", "v", str),
|
||||||
vectors_loc=("optional: location of vectors file in Word2Vec format "
|
prune_vectors=("Optional number of vectors to prune to", "option", "V", int),
|
||||||
"(either as .txt or zipped as .zip or .tar.gz)", "option",
|
|
||||||
"v", str),
|
|
||||||
prune_vectors=("optional: number of vectors to prune to",
|
|
||||||
"option", "V", int)
|
|
||||||
)
|
)
|
||||||
def init_model(lang, output_dir, freqs_loc=None, clusters_loc=None, jsonl_loc=None,
|
def init_model(
|
||||||
vectors_loc=None, prune_vectors=-1):
|
lang,
|
||||||
|
output_dir,
|
||||||
|
freqs_loc=None,
|
||||||
|
clusters_loc=None,
|
||||||
|
jsonl_loc=None,
|
||||||
|
vectors_loc=None,
|
||||||
|
prune_vectors=-1,
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
Create a new model from raw data, like word frequencies, Brown clusters
|
Create a new model from raw data, like word frequencies, Brown clusters
|
||||||
and word vectors.
|
and word vectors. If vectors are provided in Word2Vec format, they can
|
||||||
|
be either a .txt or zipped as a .zip or .tar.gz.
|
||||||
"""
|
"""
|
||||||
if jsonl_loc is not None:
|
if jsonl_loc is not None:
|
||||||
if freqs_loc is not None or clusters_loc is not None:
|
if freqs_loc is not None or clusters_loc is not None:
|
||||||
settings = ['-j']
|
settings = ["-j"]
|
||||||
if freqs_loc:
|
if freqs_loc:
|
||||||
settings.append('-f')
|
settings.append("-f")
|
||||||
if clusters_loc:
|
if clusters_loc:
|
||||||
settings.append('-c')
|
settings.append("-c")
|
||||||
prints(' '.join(settings),
|
msg.warn(Messages.M063, Messages.M064)
|
||||||
title=(
|
|
||||||
"The -f and -c arguments are deprecated, and not compatible "
|
|
||||||
"with the -j argument, which should specify the same information. "
|
|
||||||
"Either merge the frequencies and clusters data into the "
|
|
||||||
"jsonl-formatted file (recommended), or use only the -f and "
|
|
||||||
"-c files, without the other lexical attributes."))
|
|
||||||
jsonl_loc = ensure_path(jsonl_loc)
|
jsonl_loc = ensure_path(jsonl_loc)
|
||||||
lex_attrs = (json.loads(line) for line in jsonl_loc.open())
|
lex_attrs = read_jsonl(jsonl_loc)
|
||||||
else:
|
else:
|
||||||
clusters_loc = ensure_path(clusters_loc)
|
clusters_loc = ensure_path(clusters_loc)
|
||||||
freqs_loc = ensure_path(freqs_loc)
|
freqs_loc = ensure_path(freqs_loc)
|
||||||
if freqs_loc is not None and not freqs_loc.exists():
|
if freqs_loc is not None and not freqs_loc.exists():
|
||||||
prints(freqs_loc, title=Messages.M037, exits=1)
|
msg.fail(Messages.M037, freqs_loc, exits=1)
|
||||||
lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)
|
lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)
|
||||||
|
|
||||||
nlp = create_model(lang, lex_attrs)
|
with msg.loading("Creating model..."):
|
||||||
|
nlp = create_model(lang, lex_attrs)
|
||||||
|
msg.good("Successfully created model")
|
||||||
if vectors_loc is not None:
|
if vectors_loc is not None:
|
||||||
add_vectors(nlp, vectors_loc, prune_vectors)
|
add_vectors(nlp, vectors_loc, prune_vectors)
|
||||||
vec_added = len(nlp.vocab.vectors)
|
vec_added = len(nlp.vocab.vectors)
|
||||||
lex_added = len(nlp.vocab)
|
lex_added = len(nlp.vocab)
|
||||||
prints(Messages.M039.format(entries=lex_added, vectors=vec_added),
|
msg.good(Messages.M038, Messages.M039.format(entries=lex_added, vectors=vec_added))
|
||||||
title=Messages.M038)
|
|
||||||
if not output_dir.exists():
|
if not output_dir.exists():
|
||||||
output_dir.mkdir()
|
output_dir.mkdir()
|
||||||
nlp.to_disk(output_dir)
|
nlp.to_disk(output_dir)
|
||||||
return nlp
|
return nlp
|
||||||
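For context on the new `jsonl_loc` input: it is a JSONL file with one lexeme-attribute object per line. The keys below mirror what `read_attrs_from_deprecated` produces further down; the file name is a placeholder.

```python
# lexemes.jsonl (one JSON object per line), e.g.:
#   {"orth": "the", "id": 0, "prob": -3.2, "cluster": 10}
#   {"orth": "and", "id": 1, "prob": -3.5, "cluster": 12}

# Reading it back with the helper this PR adds:
from spacy.util import read_jsonl

lex_attrs = list(read_jsonl("lexemes.jsonl"))  # path is a placeholder
```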
|
|
||||||
|
|
||||||
def open_file(loc):
|
def open_file(loc):
|
||||||
'''Handle .gz, .tar.gz or unzipped files'''
|
"""Handle .gz, .tar.gz or unzipped files"""
|
||||||
loc = ensure_path(loc)
|
loc = ensure_path(loc)
|
||||||
print("Open loc")
|
|
||||||
if tarfile.is_tarfile(str(loc)):
|
if tarfile.is_tarfile(str(loc)):
|
||||||
return tarfile.open(str(loc), 'r:gz')
|
return tarfile.open(str(loc), "r:gz")
|
||||||
elif loc.parts[-1].endswith('gz'):
|
elif loc.parts[-1].endswith("gz"):
|
||||||
return (line.decode('utf8') for line in gzip.open(str(loc), 'r'))
|
return (line.decode("utf8") for line in gzip.open(str(loc), "r"))
|
||||||
elif loc.parts[-1].endswith('zip'):
|
elif loc.parts[-1].endswith("zip"):
|
||||||
zip_file = zipfile.ZipFile(str(loc))
|
zip_file = zipfile.ZipFile(str(loc))
|
||||||
names = zip_file.namelist()
|
names = zip_file.namelist()
|
||||||
file_ = zip_file.open(names[0])
|
file_ = zip_file.open(names[0])
|
||||||
return (line.decode('utf8') for line in file_)
|
return (line.decode("utf8") for line in file_)
|
||||||
else:
|
else:
|
||||||
return loc.open('r', encoding='utf8')
|
return loc.open("r", encoding="utf8")
|
||||||
|
|
||||||
|
|
||||||
def read_attrs_from_deprecated(freqs_loc, clusters_loc):
|
def read_attrs_from_deprecated(freqs_loc, clusters_loc):
|
||||||
probs, oov_prob = read_freqs(freqs_loc) if freqs_loc is not None else ({}, -20)
|
with msg.loading("Counting frequencies..."):
|
||||||
clusters = read_clusters(clusters_loc) if clusters_loc else {}
|
probs, oov_prob = read_freqs(freqs_loc) if freqs_loc is not None else ({}, -20)
|
||||||
|
msg.good("Counted frequencies")
|
||||||
|
with msg.loading("Reading clusters..."):
|
||||||
|
clusters = read_clusters(clusters_loc) if clusters_loc else {}
|
||||||
|
msg.good("Read clusters")
|
||||||
lex_attrs = []
|
lex_attrs = []
|
||||||
sorted_probs = sorted(probs.items(), key=lambda item: item[1], reverse=True)
|
sorted_probs = sorted(probs.items(), key=lambda item: item[1], reverse=True)
|
||||||
for i, (word, prob) in tqdm(enumerate(sorted_probs)):
|
for i, (word, prob) in tqdm(enumerate(sorted_probs)):
|
||||||
attrs = {'orth': word, 'id': i, 'prob': prob}
|
attrs = {"orth": word, "id": i, "prob": prob}
|
||||||
# Decode as a little-endian string, so that we can do & 15 to get
|
# Decode as a little-endian string, so that we can do & 15 to get
|
||||||
# the first 4 bits. See _parse_features.pyx
|
# the first 4 bits. See _parse_features.pyx
|
||||||
if word in clusters:
|
if word in clusters:
|
||||||
attrs['cluster'] = int(clusters[word][::-1], 2)
|
attrs["cluster"] = int(clusters[word][::-1], 2)
|
||||||
else:
|
else:
|
||||||
attrs['cluster'] = 0
|
attrs["cluster"] = 0
|
||||||
lex_attrs.append(attrs)
|
lex_attrs.append(attrs)
|
||||||
return lex_attrs
|
return lex_attrs
|
||||||
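The little-endian comment above is easy to misread, so here is a short, self-contained illustration of why the Brown cluster path is reversed before `int(..., 2)`; the path string is invented.

```python
# Reversing the path makes its *first* bits the *lowest* bits of the integer,
# so `cluster & 15` recovers the first four bits of the Brown cluster path.
path = "1011001"                  # example Brown cluster path
cluster = int(path[::-1], 2)      # little-endian interpretation -> 77
first_four = cluster & 15         # -> 0b1101
assert format(first_four, "04b")[::-1] == path[:4]
```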
|
|
||||||
|
|
||||||
def create_model(lang, lex_attrs):
|
def create_model(lang, lex_attrs):
|
||||||
print("Creating model...")
|
|
||||||
lang_class = get_lang_class(lang)
|
lang_class = get_lang_class(lang)
|
||||||
nlp = lang_class()
|
nlp = lang_class()
|
||||||
for lexeme in nlp.vocab:
|
for lexeme in nlp.vocab:
|
||||||
lexeme.rank = 0
|
lexeme.rank = 0
|
||||||
lex_added = 0
|
lex_added = 0
|
||||||
for attrs in lex_attrs:
|
for attrs in lex_attrs:
|
||||||
if 'settings' in attrs:
|
if "settings" in attrs:
|
||||||
continue
|
continue
|
||||||
lexeme = nlp.vocab[attrs['orth']]
|
lexeme = nlp.vocab[attrs["orth"]]
|
||||||
lexeme.set_attrs(**attrs)
|
lexeme.set_attrs(**attrs)
|
||||||
lexeme.is_oov = False
|
lexeme.is_oov = False
|
||||||
lex_added += 1
|
lex_added += 1
|
||||||
lex_added += 1
|
lex_added += 1
|
||||||
oov_prob = min(lex.prob for lex in nlp.vocab)
|
oov_prob = min(lex.prob for lex in nlp.vocab)
|
||||||
nlp.vocab.cfg.update({'oov_prob': oov_prob-1})
|
nlp.vocab.cfg.update({"oov_prob": oov_prob - 1})
|
||||||
return nlp
|
return nlp
|
||||||
|
|
||||||
|
|
||||||
def add_vectors(nlp, vectors_loc, prune_vectors):
|
def add_vectors(nlp, vectors_loc, prune_vectors):
|
||||||
vectors_loc = ensure_path(vectors_loc)
|
vectors_loc = ensure_path(vectors_loc)
|
||||||
if vectors_loc and vectors_loc.parts[-1].endswith('.npz'):
|
if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
|
||||||
nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open('rb')))
|
nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
|
||||||
for lex in nlp.vocab:
|
for lex in nlp.vocab:
|
||||||
if lex.rank:
|
if lex.rank:
|
||||||
nlp.vocab.vectors.add(lex.orth, row=lex.rank)
|
nlp.vocab.vectors.add(lex.orth, row=lex.rank)
|
||||||
else:
|
else:
|
||||||
vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else (None, None)
|
if vectors_loc:
|
||||||
|
with msg.loading("Reading vectors from {}".format(vectors_loc)):
|
||||||
|
vectors_data, vector_keys = read_vectors(vectors_loc)
|
||||||
|
msg.good("Loaded vectors from {}".format(vectors_loc))
|
||||||
|
else:
|
||||||
|
vectors_data, vector_keys = (None, None)
|
||||||
if vector_keys is not None:
|
if vector_keys is not None:
|
||||||
for word in vector_keys:
|
for word in vector_keys:
|
||||||
if word not in nlp.vocab:
|
if word not in nlp.vocab:
|
||||||
|
@@ -147,35 +158,34 @@ def add_vectors(nlp, vectors_loc, prune_vectors):
|
||||||
lexeme.is_oov = False
|
lexeme.is_oov = False
|
||||||
if vectors_data is not None:
|
if vectors_data is not None:
|
||||||
nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
|
nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
|
||||||
nlp.vocab.vectors.name = '%s_model.vectors' % nlp.meta['lang']
|
nlp.vocab.vectors.name = "%s_model.vectors" % nlp.meta["lang"]
|
||||||
nlp.meta['vectors']['name'] = nlp.vocab.vectors.name
|
nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
|
||||||
if prune_vectors >= 1:
|
if prune_vectors >= 1:
|
||||||
nlp.vocab.prune_vectors(prune_vectors)
|
nlp.vocab.prune_vectors(prune_vectors)
|
||||||
|
|
||||||
|
|
||||||
def read_vectors(vectors_loc):
|
def read_vectors(vectors_loc):
|
||||||
print("Reading vectors from %s" % vectors_loc)
|
|
||||||
f = open_file(vectors_loc)
|
f = open_file(vectors_loc)
|
||||||
shape = tuple(int(size) for size in next(f).split())
|
shape = tuple(int(size) for size in next(f).split())
|
||||||
vectors_data = numpy.zeros(shape=shape, dtype='f')
|
vectors_data = numpy.zeros(shape=shape, dtype="f")
|
||||||
vectors_keys = []
|
vectors_keys = []
|
||||||
for i, line in enumerate(tqdm(f)):
|
for i, line in enumerate(tqdm(f)):
|
||||||
line = line.rstrip()
|
line = line.rstrip()
|
||||||
pieces = line.rsplit(' ', vectors_data.shape[1]+1)
|
pieces = line.rsplit(" ", vectors_data.shape[1] + 1)
|
||||||
word = pieces.pop(0)
|
word = pieces.pop(0)
|
||||||
if len(pieces) != vectors_data.shape[1]:
|
if len(pieces) != vectors_data.shape[1]:
|
||||||
raise ValueError(Errors.E094.format(line_num=i, loc=vectors_loc))
|
msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1)
|
||||||
vectors_data[i] = numpy.asarray(pieces, dtype='f')
|
vectors_data[i] = numpy.asarray(pieces, dtype="f")
|
||||||
vectors_keys.append(word)
|
vectors_keys.append(word)
|
||||||
return vectors_data, vectors_keys
|
return vectors_data, vectors_keys
|
||||||
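To make the loop above concrete: the expected Word2Vec text format starts with a `rows cols` header line, followed by one `word v1 ... vN` line per entry. A simplified parse with invented values (using a plain `split` rather than the `rsplit` bound used above):

```python
import numpy

lines = iter(["2 3", "king 0.1 0.2 0.3", "queen 0.4 0.5 0.6"])
shape = tuple(int(size) for size in next(lines).split())
vectors_data = numpy.zeros(shape=shape, dtype="f")
vectors_keys = []
for i, line in enumerate(lines):
    pieces = line.rstrip().split(" ")
    vectors_keys.append(pieces.pop(0))        # the word itself
    vectors_data[i] = numpy.asarray(pieces, dtype="f")
```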
|
|
||||||
|
|
||||||
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
|
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
|
||||||
print("Counting frequencies...")
|
|
||||||
counts = PreshCounter()
|
counts = PreshCounter()
|
||||||
total = 0
|
total = 0
|
||||||
with freqs_loc.open() as f:
|
with freqs_loc.open() as f:
|
||||||
for i, line in enumerate(f):
|
for i, line in enumerate(f):
|
||||||
freq, doc_freq, key = line.rstrip().split('\t', 2)
|
freq, doc_freq, key = line.rstrip().split("\t", 2)
|
||||||
freq = int(freq)
|
freq = int(freq)
|
||||||
counts.inc(i + 1, freq)
|
counts.inc(i + 1, freq)
|
||||||
total += freq
|
total += freq
|
||||||
|
@@ -184,7 +194,7 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
|
||||||
probs = {}
|
probs = {}
|
||||||
with freqs_loc.open() as f:
|
with freqs_loc.open() as f:
|
||||||
for line in tqdm(f):
|
for line in tqdm(f):
|
||||||
freq, doc_freq, key = line.rstrip().split('\t', 2)
|
freq, doc_freq, key = line.rstrip().split("\t", 2)
|
||||||
doc_freq = int(doc_freq)
|
doc_freq = int(doc_freq)
|
||||||
freq = int(freq)
|
freq = int(freq)
|
||||||
if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
|
if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
|
||||||
|
@@ -196,7 +206,6 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
|
||||||
|
|
||||||
|
|
||||||
def read_clusters(clusters_loc):
|
def read_clusters(clusters_loc):
|
||||||
print("Reading clusters...")
|
|
||||||
clusters = {}
|
clusters = {}
|
||||||
if ftfy is None:
|
if ftfy is None:
|
||||||
user_warning(Warnings.W004)
|
user_warning(Warnings.W004)
|
||||||
|
@@ -213,7 +222,7 @@ def read_clusters(clusters_loc):
|
||||||
if int(freq) >= 3:
|
if int(freq) >= 3:
|
||||||
clusters[word] = cluster
|
clusters[word] = cluster
|
||||||
else:
|
else:
|
||||||
clusters[word] = '0'
|
clusters[word] = "0"
|
||||||
# Expand clusters with re-casing
|
# Expand clusters with re-casing
|
||||||
for word, cluster in list(clusters.items()):
|
for word, cluster in list(clusters.items()):
|
||||||
if word.lower() not in clusters:
|
if word.lower() not in clusters:
|
||||||
|
|
|
@@ -3,51 +3,54 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from wasabi import Printer
|
||||||
|
|
||||||
from ._messages import Messages
|
from ._messages import Messages
|
||||||
from ..compat import symlink_to, path2str
|
from ..compat import symlink_to, path2str
|
||||||
from ..util import prints
|
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
origin=("package name or local path to model", "positional", None, str),
|
origin=("package name or local path to model", "positional", None, str),
|
||||||
link_name=("name of shortuct link to create", "positional", None, str),
|
link_name=("name of shortuct link to create", "positional", None, str),
|
||||||
force=("force overwriting of existing link", "flag", "f", bool))
|
force=("force overwriting of existing link", "flag", "f", bool),
|
||||||
|
)
|
||||||
def link(origin, link_name, force=False, model_path=None):
|
def link(origin, link_name, force=False, model_path=None):
|
||||||
"""
|
"""
|
||||||
Create a symlink for models within the spacy/data directory. Accepts
|
Create a symlink for models within the spacy/data directory. Accepts
|
||||||
either the name of a pip package, or the local path to the model data
|
either the name of a pip package, or the local path to the model data
|
||||||
directory. Linking models allows loading them via spacy.load(link_name).
|
directory. Linking models allows loading them via spacy.load(link_name).
|
||||||
"""
|
"""
|
||||||
|
msg = Printer()
|
||||||
if util.is_package(origin):
|
if util.is_package(origin):
|
||||||
model_path = util.get_package_path(origin)
|
model_path = util.get_package_path(origin)
|
||||||
else:
|
else:
|
||||||
model_path = Path(origin) if model_path is None else Path(model_path)
|
model_path = Path(origin) if model_path is None else Path(model_path)
|
||||||
if not model_path.exists():
|
if not model_path.exists():
|
||||||
prints(Messages.M009.format(path=path2str(model_path)),
|
msg.fail(
|
||||||
title=Messages.M008, exits=1)
|
Messages.M008, Messages.M009.format(path=path2str(model_path)), exits=1
|
||||||
|
)
|
||||||
data_path = util.get_data_path()
|
data_path = util.get_data_path()
|
||||||
if not data_path or not data_path.exists():
|
if not data_path or not data_path.exists():
|
||||||
spacy_loc = Path(__file__).parent.parent
|
spacy_loc = Path(__file__).parent.parent
|
||||||
prints(Messages.M011, spacy_loc, title=Messages.M010, exits=1)
|
msg.fail(Messages.M010, Messages.M011.format(path=spacy_loc), exits=1)
|
||||||
link_path = util.get_data_path() / link_name
|
link_path = util.get_data_path() / link_name
|
||||||
if link_path.is_symlink() and not force:
|
if link_path.is_symlink() and not force:
|
||||||
prints(Messages.M013, title=Messages.M012.format(name=link_name),
|
msg.fail(Messages.M012.format(name=link_name), Messages.M013, exits=1)
|
||||||
exits=1)
|
|
||||||
elif link_path.is_symlink(): # does a symlink exist?
|
elif link_path.is_symlink(): # does a symlink exist?
|
||||||
# NB: It's important to check for is_symlink here and not for exists,
|
# NB: It's important to check for is_symlink here and not for exists,
|
||||||
# because invalid/outdated symlinks would return False otherwise.
|
# because invalid/outdated symlinks would return False otherwise.
|
||||||
link_path.unlink()
|
link_path.unlink()
|
||||||
elif link_path.exists(): # does it exist otherwise?
|
elif link_path.exists(): # does it exist otherwise?
|
||||||
# NB: Check this last because valid symlinks also "exist".
|
# NB: Check this last because valid symlinks also "exist".
|
||||||
prints(Messages.M015, link_path,
|
msg.fail(Messages.M014.format(name=link_name), Messages.M015, exits=1)
|
||||||
title=Messages.M014.format(name=link_name), exits=1)
|
details = "%s --> %s" % (path2str(model_path), path2str(link_path))
|
||||||
msg = "%s --> %s" % (path2str(model_path), path2str(link_path))
|
|
||||||
try:
|
try:
|
||||||
symlink_to(link_path, model_path)
|
symlink_to(link_path, model_path)
|
||||||
except:
|
except: # noqa: E722
|
||||||
# This is quite dirty, but just making sure other errors are caught.
|
# This is quite dirty, but just making sure other errors are caught.
|
||||||
prints(Messages.M017, msg, title=Messages.M016.format(name=link_name))
|
msg.fail(Messages.M016.format(name=link_name), Messages.M017)
|
||||||
|
msg.text(details)
|
||||||
raise
|
raise
|
||||||
prints(msg, Messages.M019.format(name=link_name), title=Messages.M018)
|
msg.good(Messages.M018, details)
|
||||||
|
msg.text(Messages.M019.format(name=link_name))
|
||||||
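What a successful link buys you, as a brief aside: the symlink in `spacy/data` lets the model load under the chosen name. The names below are placeholders.

```python
# After e.g. `python -m spacy link en_core_web_sm my_en`:
import spacy

nlp = spacy.load("my_en")   # resolves the shortcut link created above
doc = nlp(u"This is a sentence.")
```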
|
|
|
@@ -4,109 +4,106 @@ from __future__ import unicode_literals
|
||||||
import plac
|
import plac
|
||||||
import shutil
|
import shutil
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from wasabi import Printer, get_raw_input
|
||||||
|
|
||||||
from ._messages import Messages
|
from ._messages import Messages
|
||||||
from ..compat import path2str, json_dumps
|
from ..compat import path2str, json_dumps
|
||||||
from ..util import prints
|
|
||||||
from .. import util
|
from .. import util
|
||||||
from .. import about
|
from .. import about
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
input_dir=("directory with model data", "positional", None, str),
|
input_dir=("Directory with model data", "positional", None, str),
|
||||||
output_dir=("output parent directory", "positional", None, str),
|
output_dir=("Output parent directory", "positional", None, str),
|
||||||
meta_path=("path to meta.json", "option", "m", str),
|
meta_path=("Path to meta.json", "option", "m", str),
|
||||||
create_meta=("create meta.json, even if one exists in directory – if "
|
create_meta=("Create meta.json, even if one exists", "flag", "c", bool),
|
||||||
"existing meta is found, entries are shown as defaults in "
|
force=("Force overwriting existing model in output directory", "flag", "f", bool),
|
||||||
"the command line prompt", "flag", "c", bool),
|
)
|
||||||
force=("force overwriting of existing model directory in output directory",
|
def package(input_dir, output_dir, meta_path=None, create_meta=False, force=False):
|
||||||
"flag", "f", bool))
|
|
||||||
def package(input_dir, output_dir, meta_path=None, create_meta=False,
|
|
||||||
force=False):
|
|
||||||
"""
|
"""
|
||||||
Generate Python package for model data, including meta and required
|
Generate Python package for model data, including meta and required
|
||||||
installation files. A new directory will be created in the specified
|
installation files. A new directory will be created in the specified
|
||||||
output directory, and model data will be copied over.
|
output directory, and model data will be copied over. If --create-meta is
|
||||||
|
set and a meta.json already exists in the output directory, the existing
|
||||||
|
values will be used as the defaults in the command-line prompt.
|
||||||
"""
|
"""
|
||||||
|
msg = Printer()
|
||||||
input_path = util.ensure_path(input_dir)
|
input_path = util.ensure_path(input_dir)
|
||||||
output_path = util.ensure_path(output_dir)
|
output_path = util.ensure_path(output_dir)
|
||||||
meta_path = util.ensure_path(meta_path)
|
meta_path = util.ensure_path(meta_path)
|
||||||
if not input_path or not input_path.exists():
|
if not input_path or not input_path.exists():
|
||||||
prints(input_path, title=Messages.M008, exits=1)
|
msg.fail(Messages.M008, input_path, exits=1)
|
||||||
if not output_path or not output_path.exists():
|
if not output_path or not output_path.exists():
|
||||||
prints(output_path, title=Messages.M040, exits=1)
|
msg.fail(Messages.M040, output_path, exits=1)
|
||||||
if meta_path and not meta_path.exists():
|
if meta_path and not meta_path.exists():
|
||||||
prints(meta_path, title=Messages.M020, exits=1)
|
msg.fail(Messages.M020, meta_path, exits=1)
|
||||||
|
|
||||||
meta_path = meta_path or input_path / 'meta.json'
|
meta_path = meta_path or input_path / "meta.json"
|
||||||
if meta_path.is_file():
|
if meta_path.is_file():
|
||||||
meta = util.read_json(meta_path)
|
meta = util.read_json(meta_path)
|
||||||
if not create_meta: # only print this if user doesn't want to overwrite
|
if not create_meta: # only print if user doesn't want to overwrite
|
||||||
prints(meta_path, title=Messages.M041)
|
msg.good(Messages.M041, meta_path)
|
||||||
else:
|
else:
|
||||||
meta = generate_meta(input_dir, meta)
|
meta = generate_meta(input_dir, meta, msg)
|
||||||
meta = validate_meta(meta, ['lang', 'name', 'version'])
|
for key in ("lang", "name", "version"):
|
||||||
model_name = meta['lang'] + '_' + meta['name']
|
if key not in meta or meta[key] == "":
|
||||||
model_name_v = model_name + '-' + meta['version']
|
msg.fail(Messages.M048.format(key=key), Messages.M049, exits=1)
|
||||||
|
model_name = meta["lang"] + "_" + meta["name"]
|
||||||
|
model_name_v = model_name + "-" + meta["version"]
|
||||||
main_path = output_path / model_name_v
|
main_path = output_path / model_name_v
|
||||||
package_path = main_path / model_name
|
package_path = main_path / model_name
|
||||||
|
|
||||||
create_dirs(package_path, force)
|
|
||||||
shutil.copytree(path2str(input_path),
|
|
||||||
path2str(package_path / model_name_v))
|
|
||||||
create_file(main_path / 'meta.json', json_dumps(meta))
|
|
||||||
create_file(main_path / 'setup.py', TEMPLATE_SETUP)
|
|
||||||
create_file(main_path / 'MANIFEST.in', TEMPLATE_MANIFEST)
|
|
||||||
create_file(package_path / '__init__.py', TEMPLATE_INIT)
|
|
||||||
prints(main_path, Messages.M043,
|
|
||||||
title=Messages.M042.format(name=model_name_v))
|
|
||||||
|
|
||||||
|
|
||||||
def create_dirs(package_path, force):
|
|
||||||
if package_path.exists():
|
if package_path.exists():
|
||||||
if force:
|
if force:
|
||||||
shutil.rmtree(path2str(package_path))
|
shutil.rmtree(path2str(package_path))
|
||||||
else:
|
else:
|
||||||
prints(package_path, Messages.M045, title=Messages.M044, exits=1)
|
msg.fail(
|
||||||
|
Messages.M044,
|
||||||
|
Messages.M045.format(path=path2str(package_path)),
|
||||||
|
exits=1,
|
||||||
|
)
|
||||||
Path.mkdir(package_path, parents=True)
|
Path.mkdir(package_path, parents=True)
|
||||||
|
shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
|
||||||
|
create_file(main_path / "meta.json", json_dumps(meta))
|
||||||
|
create_file(main_path / "setup.py", TEMPLATE_SETUP)
|
||||||
|
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
|
||||||
|
create_file(package_path / "__init__.py", TEMPLATE_INIT)
|
||||||
|
msg.good(Messages.M042.format(name=model_name_v), main_path)
|
||||||
|
msg.text(Messages.M043)
|
||||||
|
|
||||||
|
|
||||||
def create_file(file_path, contents):
|
def create_file(file_path, contents):
|
||||||
file_path.touch()
|
file_path.touch()
|
||||||
file_path.open('w', encoding='utf-8').write(contents)
|
file_path.open("w", encoding="utf-8").write(contents)
|
||||||
|
|
||||||
|
|
||||||
def generate_meta(model_path, existing_meta):
|
def generate_meta(model_path, existing_meta, msg):
|
||||||
meta = existing_meta or {}
|
meta = existing_meta or {}
|
||||||
settings = [('lang', 'Model language', meta.get('lang', 'en')),
|
settings = [
|
||||||
('name', 'Model name', meta.get('name', 'model')),
|
("lang", "Model language", meta.get("lang", "en")),
|
||||||
('version', 'Model version', meta.get('version', '0.0.0')),
|
("name", "Model name", meta.get("name", "model")),
|
||||||
('spacy_version', 'Required spaCy version',
|
("version", "Model version", meta.get("version", "0.0.0")),
|
||||||
'>=%s,<3.0.0' % about.__version__),
|
("spacy_version", "Required spaCy version", ">=%s,<3.0.0" % about.__version__),
|
||||||
('description', 'Model description',
|
("description", "Model description", meta.get("description", False)),
|
||||||
meta.get('description', False)),
|
("author", "Author", meta.get("author", False)),
|
||||||
('author', 'Author', meta.get('author', False)),
|
("email", "Author email", meta.get("email", False)),
|
||||||
('email', 'Author email', meta.get('email', False)),
|
("url", "Author website", meta.get("url", False)),
|
||||||
('url', 'Author website', meta.get('url', False)),
|
("license", "License", meta.get("license", "CC BY-SA 3.0")),
|
||||||
('license', 'License', meta.get('license', 'CC BY-SA 3.0'))]
|
]
|
||||||
nlp = util.load_model_from_path(Path(model_path))
|
nlp = util.load_model_from_path(Path(model_path))
|
||||||
meta['pipeline'] = nlp.pipe_names
|
meta["pipeline"] = nlp.pipe_names
|
||||||
meta['vectors'] = {'width': nlp.vocab.vectors_length,
|
meta["vectors"] = {
|
||||||
'vectors': len(nlp.vocab.vectors),
|
"width": nlp.vocab.vectors_length,
|
||||||
'keys': nlp.vocab.vectors.n_keys}
|
"vectors": len(nlp.vocab.vectors),
|
||||||
prints(Messages.M047, title=Messages.M046)
|
"keys": nlp.vocab.vectors.n_keys,
|
||||||
|
}
|
||||||
|
msg.divider(Messages.M046)
|
||||||
|
msg.text(Messages.M047)
|
||||||
for setting, desc, default in settings:
|
for setting, desc, default in settings:
|
||||||
response = util.get_raw_input(desc, default)
|
response = get_raw_input(desc, default)
|
||||||
meta[setting] = default if response == '' and default else response
|
meta[setting] = default if response == "" and default else response
|
||||||
if about.__title__ != 'spacy':
|
if about.__title__ != "spacy":
|
||||||
meta['parent_package'] = about.__title__
|
meta["parent_package"] = about.__title__
|
||||||
return meta
|
|
||||||
|
|
||||||
|
|
||||||
def validate_meta(meta, keys):
|
|
||||||
for key in keys:
|
|
||||||
if key not in meta or meta[key] == '':
|
|
||||||
prints(Messages.M049, title=Messages.M048.format(key=key), exits=1)
|
|
||||||
return meta
|
return meta
|
||||||
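For orientation, the prompt driven by `generate_meta` ends up producing a meta.json roughly like the sketch below; every value shown is an example, not output from a real model.

```python
# Approximate shape of the generated meta.json (illustrative values only).
meta = {
    "lang": "en",
    "name": "model",
    "version": "0.0.0",
    "spacy_version": ">=2.1.0,<3.0.0",
    "description": "Example description",
    "author": "Jane Doe",
    "license": "CC BY-SA 3.0",
    "pipeline": ["tagger", "parser", "ner"],
    "vectors": {"width": 300, "vectors": 20000, "keys": 20000},
}
```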
|
|
||||||
|
|
||||||
|
|
|
@@ -1,66 +1,148 @@
|
||||||
'''This script is experimental.
|
# coding: utf8
|
||||||
|
|
||||||
Try pre-training the CNN component of the text categorizer using a cheap
|
|
||||||
language modelling-like objective. Specifically, we load pre-trained vectors
|
|
||||||
(from something like word2vec, GloVe, FastText etc), and use the CNN to
|
|
||||||
predict the tokens' pre-trained vectors. This isn't as easy as it sounds:
|
|
||||||
we're not merely doing compression here, because heavy dropout is applied,
|
|
||||||
including over the input words. This means the model must often (50% of the time)
|
|
||||||
use the context in order to predict the word.
|
|
||||||
|
|
||||||
To evaluate the technique, we're pre-training with the 50k texts from the IMDB
|
|
||||||
corpus, and then training with only 100 labels. Note that it's a bit dirty to
|
|
||||||
pre-train with the development data, but also not *so* terrible: we're not using
|
|
||||||
the development labels, after all --- only the unlabelled text.
|
|
||||||
'''
|
|
||||||
from __future__ import print_function, unicode_literals
|
from __future__ import print_function, unicode_literals
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
import random
|
import random
|
||||||
import numpy
|
import numpy
|
||||||
import time
|
import time
|
||||||
import ujson as json
|
import ujson
|
||||||
from pathlib import Path
|
|
||||||
import sys
|
import sys
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
|
from pathlib import Path
|
||||||
import spacy
|
|
||||||
from spacy.tokens import Doc
|
|
||||||
from spacy.attrs import ID, HEAD
|
|
||||||
from spacy.util import minibatch, minibatch_by_words, use_gpu, compounding, ensure_path
|
|
||||||
from spacy._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer
|
|
||||||
from thinc.v2v import Affine, Maxout
|
from thinc.v2v import Affine, Maxout
|
||||||
from thinc.api import wrap
|
from thinc.api import wrap
|
||||||
from thinc.misc import LayerNorm as LN
|
from thinc.misc import LayerNorm as LN
|
||||||
|
from thinc.neural.util import prefer_gpu
|
||||||
|
from wasabi import Printer
|
||||||
|
|
||||||
|
from ..tokens import Doc
|
||||||
|
from ..attrs import ID, HEAD
|
||||||
|
from ..compat import json_dumps
|
||||||
|
from .._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer
|
||||||
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
def prefer_gpu():
|
@plac.annotations(
|
||||||
used = spacy.util.use_gpu(0)
|
texts_loc=("Path to jsonl file with texts to learn from", "positional", None, str),
|
||||||
if used is None:
|
vectors_model=("Name or path to vectors model to learn from"),
|
||||||
return False
|
output_dir=("Directory to write models each epoch", "positional", None, str),
|
||||||
else:
|
width=("Width of CNN layers", "option", "cw", int),
|
||||||
import cupy.random
|
depth=("Depth of CNN layers", "option", "cd", int),
|
||||||
cupy.random.seed(0)
|
embed_rows=("Embedding rows", "option", "er", int),
|
||||||
return True
|
use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
|
||||||
|
dropout=("Dropout", "option", "d", float),
|
||||||
|
seed=("Seed for random number generators", "option", "s", float),
|
||||||
|
nr_iter=("Number of iterations to pretrain", "option", "i", int),
|
||||||
|
)
|
||||||
|
def pretrain(
|
||||||
|
texts_loc,
|
||||||
|
vectors_model,
|
||||||
|
output_dir,
|
||||||
|
width=96,
|
||||||
|
depth=4,
|
||||||
|
embed_rows=2000,
|
||||||
|
use_vectors=False,
|
||||||
|
dropout=0.2,
|
||||||
|
nr_iter=1000,
|
||||||
|
seed=0,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
|
||||||
|
using an approximate language-modelling objective. Specifically, we load
|
||||||
|
pre-trained vectors, and train a component like a CNN, BiLSTM, etc to predict
|
||||||
|
vectors which match the pre-trained ones. The weights are saved to a directory
|
||||||
|
after each epoch. You can then pass a path to one of these pre-trained weights
|
||||||
|
files to the 'spacy train' command.
|
||||||
|
|
||||||
|
This technique may be especially helpful if you have little labelled data.
|
||||||
|
However, it's still quite experimental, so your mileage may vary.
|
||||||
|
|
||||||
def load_texts(path):
|
To load the weights back in during 'spacy train', you need to ensure
|
||||||
'''Load inputs from a jsonl file.
|
all settings are the same between pretraining and training. The API and
|
||||||
|
errors around this need some improvement.
|
||||||
Each line should be a dict like {"text": "..."}
|
"""
|
||||||
'''
|
config = dict(locals())
|
||||||
path = ensure_path(path)
|
msg = Printer()
|
||||||
with path.open('r', encoding='utf8') as file_:
|
util.fix_random_seed(seed)
|
||||||
texts = [json.loads(line) for line in file_]
|
|
||||||
random.shuffle(texts)
|
has_gpu = prefer_gpu()
|
||||||
return texts
|
msg.info("Using GPU" if has_gpu else "Not using GPU")
|
||||||
|
|
||||||
|
output_dir = Path(output_dir)
|
||||||
|
if not output_dir.exists():
|
||||||
|
output_dir.mkdir()
|
||||||
|
msg.good("Created output directory")
|
||||||
|
util.write_json(output_dir / "config.json", config)
|
||||||
|
msg.good("Saved settings to config.json")
|
||||||
|
|
||||||
|
# Load texts from file or stdin
|
||||||
|
if texts_loc != "-": # reading from a file
|
||||||
|
texts_loc = Path(texts_loc)
|
||||||
|
if not texts_loc.exists():
|
||||||
|
msg.fail("Input text file doesn't exist", texts_loc, exits=1)
|
||||||
|
with msg.loading("Loading input texts..."):
|
||||||
|
texts = list(util.read_jsonl(texts_loc))
|
||||||
|
msg.good("Loaded input texts")
|
||||||
|
random.shuffle(texts)
|
||||||
|
else: # reading from stdin
|
||||||
|
msg.text("Reading input text from stdin...")
|
||||||
|
texts = stream_texts()
|
||||||
|
|
||||||
|
with msg.loading("Loading model '{}'...".format(vectors_model)):
|
||||||
|
nlp = util.load_model(vectors_model)
|
||||||
|
msg.good("Loaded model '{}'".format(vectors_model))
|
||||||
|
pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
|
||||||
|
model = create_pretraining_model(
|
||||||
|
nlp,
|
||||||
|
Tok2Vec(
|
||||||
|
width,
|
||||||
|
embed_rows,
|
||||||
|
conv_depth=depth,
|
||||||
|
pretrained_vectors=pretrained_vectors,
|
||||||
|
bilstm_depth=0, # Requires PyTorch. Experimental.
|
||||||
|
cnn_maxout_pieces=2, # You can try setting this higher
|
||||||
|
subword_features=True,  # Set to False for character models, e.g. Chinese
|
||||||
|
),
|
||||||
|
)
|
||||||
|
optimizer = create_default_optimizer(model.ops)
|
||||||
|
tracker = ProgressTracker()
|
||||||
|
msg.divider("Pre-training tok2vec layer")
|
||||||
|
row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
|
||||||
|
msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
|
||||||
|
for epoch in range(nr_iter):
|
||||||
|
for batch in util.minibatch_by_words(
|
||||||
|
((text, None) for text in texts), size=5000
|
||||||
|
):
|
||||||
|
docs = make_docs(nlp, [text for (text, _) in batch])
|
||||||
|
loss = make_update(model, docs, optimizer, drop=dropout)
|
||||||
|
progress = tracker.update(epoch, loss, docs)
|
||||||
|
if progress:
|
||||||
|
msg.row(progress, **row_settings)
|
||||||
|
if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
|
||||||
|
break
|
||||||
|
with model.use_params(optimizer.averages):
|
||||||
|
with (output_dir / ("model%d.bin" % epoch)).open("wb") as file_:
|
||||||
|
file_.write(model.tok2vec.to_bytes())
|
||||||
|
log = {
|
||||||
|
"nr_word": tracker.nr_word,
|
||||||
|
"loss": tracker.loss,
|
||||||
|
"epoch_loss": tracker.epoch_loss,
|
||||||
|
"epoch": epoch,
|
||||||
|
}
|
||||||
|
with (output_dir / "log.jsonl").open("a") as file_:
|
||||||
|
file_.write(json_dumps(log) + "\n")
|
||||||
|
tracker.epoch_loss = 0.0
|
||||||
|
if texts_loc != "-":
|
||||||
|
# Reshuffle the texts if texts were loaded from a file
|
||||||
|
random.shuffle(texts)
|
||||||
|
|
||||||
|
|
||||||
def stream_texts():
|
def stream_texts():
|
||||||
for line in sys.stdin:
|
for line in sys.stdin:
|
||||||
yield json.loads(line)
|
yield ujson.loads(line)
|
||||||
|
|
||||||
|
|
||||||
def make_update(model, docs, optimizer, drop=0.):
|
def make_update(model, docs, optimizer, drop=0.0):
|
||||||
"""Perform an update over a single batch of documents.
|
"""Perform an update over a single batch of documents.
|
||||||
|
|
||||||
docs (iterable): A batch of `Doc` objects.
|
docs (iterable): A batch of `Doc` objects.
|
||||||
|
@@ -74,7 +156,7 @@ def make_update(model, docs, optimizer, drop=0.):
|
||||||
# Don't want to return a cupy object here
|
# Don't want to return a cupy object here
|
||||||
# The gradients are modified in-place by the BERT MLM,
|
# The gradients are modified in-place by the BERT MLM,
|
||||||
# so we get an accurate loss
|
# so we get an accurate loss
|
||||||
loss = float((gradients**2).mean())
|
loss = float((gradients ** 2).mean())
|
||||||
return loss
|
return loss
|
||||||
|
|
||||||
|
|
||||||
|
@@ -98,7 +180,7 @@ def make_docs(nlp, batch):
|
||||||
|
|
||||||
def get_vectors_loss(ops, docs, prediction):
|
def get_vectors_loss(ops, docs, prediction):
|
||||||
"""Compute a mean-squared error loss between the documents' vectors and
|
"""Compute a mean-squared error loss between the documents' vectors and
|
||||||
the prediction.
|
the prediction.
|
||||||
|
|
||||||
Note that this is ripe for customization! We could compute the vectors
|
Note that this is ripe for customization! We could compute the vectors
|
||||||
in some other way, e.g. with an LSTM language model, or use some other
|
in some other way, e.g. with an LSTM language model, or use some other
|
||||||
|
@@ -115,43 +197,40 @@ def get_vectors_loss(ops, docs, prediction):
|
||||||
|
|
||||||
|
|
||||||
def create_pretraining_model(nlp, tok2vec):
|
def create_pretraining_model(nlp, tok2vec):
|
||||||
'''Define a network for the pretraining. We simply add an output layer onto
|
"""Define a network for the pretraining. We simply add an output layer onto
|
||||||
the tok2vec input model. The tok2vec input model needs to be a model that
|
the tok2vec input model. The tok2vec input model needs to be a model that
|
||||||
takes a batch of Doc objects (as a list), and returns a list of arrays.
|
takes a batch of Doc objects (as a list), and returns a list of arrays.
|
||||||
Each array in the output needs to have one row per token in the doc.
|
Each array in the output needs to have one row per token in the doc.
|
||||||
'''
|
"""
|
||||||
output_size = nlp.vocab.vectors.data.shape[1]
|
output_size = nlp.vocab.vectors.data.shape[1]
|
||||||
output_layer = chain(
|
output_layer = chain(
|
||||||
LN(Maxout(300, pieces=3)),
|
LN(Maxout(300, pieces=3)), zero_init(Affine(output_size, drop_factor=0.0))
|
||||||
zero_init(Affine(output_size, drop_factor=0.0))
|
|
||||||
)
|
)
|
||||||
# This is annoying, but the parser etc have the flatten step after
|
# This is annoying, but the parser etc have the flatten step after
|
||||||
# the tok2vec. To load the weights in cleanly, we need to match
|
# the tok2vec. To load the weights in cleanly, we need to match
|
||||||
# the shape of the models' components exactly. So what we call
|
# the shape of the models' components exactly. So what we call
|
||||||
# "tok2vec" has to be the same set of processes as what the components do.
|
# "tok2vec" has to be the same set of processes as what the components do.
|
||||||
tok2vec = chain(tok2vec, flatten)
|
tok2vec = chain(tok2vec, flatten)
|
||||||
model = chain(
|
model = chain(tok2vec, output_layer)
|
||||||
tok2vec,
|
|
||||||
output_layer
|
|
||||||
)
|
|
||||||
model = masked_language_model(nlp.vocab, model)
|
model = masked_language_model(nlp.vocab, model)
|
||||||
model.tok2vec = tok2vec
|
model.tok2vec = tok2vec
|
||||||
model.output_layer = output_layer
|
model.output_layer = output_layer
|
||||||
model.begin_training([nlp.make_doc('Give it a doc to infer shapes')])
|
model.begin_training([nlp.make_doc("Give it a doc to infer shapes")])
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
def masked_language_model(vocab, model, mask_prob=0.15):
|
def masked_language_model(vocab, model, mask_prob=0.15):
|
||||||
'''Convert a model into a BERT-style masked language model'''
|
"""Convert a model into a BERT-style masked language model"""
|
||||||
|
|
||||||
random_words = RandomWords(vocab)
|
random_words = RandomWords(vocab)
|
||||||
def mlm_forward(docs, drop=0.):
|
|
||||||
|
def mlm_forward(docs, drop=0.0):
|
||||||
mask, docs = apply_mask(docs, random_words, mask_prob=mask_prob)
|
mask, docs = apply_mask(docs, random_words, mask_prob=mask_prob)
|
||||||
mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
|
mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
|
||||||
output, backprop = model.begin_update(docs, drop=drop)
|
output, backprop = model.begin_update(docs, drop=drop)
|
||||||
|
|
||||||
def mlm_backward(d_output, sgd=None):
|
def mlm_backward(d_output, sgd=None):
|
||||||
d_output *= 1-mask
|
d_output *= 1 - mask
|
||||||
return backprop(d_output, sgd=sgd)
|
return backprop(d_output, sgd=sgd)
|
||||||
|
|
||||||
return output, mlm_backward
|
return output, mlm_backward
|
||||||
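One note on the mask semantics here (my reading of the code, not a comment from the source): apply_mask sets mask to True for tokens it leaves untouched, so multiplying the gradient by (1 - mask) backpropagates only through the corrupted positions, as in the usual BERT-style objective. A tiny self-contained check of that arithmetic:

import numpy

mask = numpy.asarray([1.0, 0.0, 1.0], dtype="f").reshape((3, 1))  # 0.0 marks a corrupted token
d_output = numpy.ones((3, 4), dtype="f")
d_output *= 1 - mask
assert d_output[0].sum() == 0.0 and d_output[1].sum() == 4.0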
|
@ -161,7 +240,7 @@ def masked_language_model(vocab, model, mask_prob=0.15):
|
||||||
|
|
||||||
def apply_mask(docs, random_words, mask_prob=0.15):
|
def apply_mask(docs, random_words, mask_prob=0.15):
|
||||||
N = sum(len(doc) for doc in docs)
|
N = sum(len(doc) for doc in docs)
|
||||||
mask = numpy.random.uniform(0., 1.0, (N,))
|
mask = numpy.random.uniform(0.0, 1.0, (N,))
|
||||||
mask = mask >= mask_prob
|
mask = mask >= mask_prob
|
||||||
i = 0
|
i = 0
|
||||||
masked_docs = []
|
masked_docs = []
|
||||||
|
@ -184,7 +263,7 @@ def apply_mask(docs, random_words, mask_prob=0.15):
|
||||||
return mask, masked_docs
|
return mask, masked_docs
|
||||||
|
|
||||||
|
|
||||||
def replace_word(word, random_words, mask='[MASK]'):
|
def replace_word(word, random_words, mask="[MASK]"):
|
||||||
roll = random.random()
|
roll = random.random()
|
||||||
if roll < 0.8:
|
if roll < 0.8:
|
||||||
return mask
|
return mask
|
||||||
|
@ -193,23 +272,25 @@ def replace_word(word, random_words, mask='[MASK]'):
|
||||||
else:
|
else:
|
||||||
return word
|
return word
|
||||||
|
|
||||||
|
|
||||||
class RandomWords(object):
|
class RandomWords(object):
|
||||||
def __init__(self, vocab):
|
def __init__(self, vocab):
|
||||||
self.words = [lex.text for lex in vocab if lex.prob != 0.0]
|
self.words = [lex.text for lex in vocab if lex.prob != 0.0]
|
||||||
self.probs = [lex.prob for lex in vocab if lex.prob != 0.0]
|
self.probs = [lex.prob for lex in vocab if lex.prob != 0.0]
|
||||||
self.words = self.words[:10000]
|
self.words = self.words[:10000]
|
||||||
self.probs = self.probs[:10000]
|
self.probs = self.probs[:10000]
|
||||||
self.probs = numpy.exp(numpy.array(self.probs, dtype='f'))
|
self.probs = numpy.exp(numpy.array(self.probs, dtype="f"))
|
||||||
self.probs /= self.probs.sum()
|
self.probs /= self.probs.sum()
|
||||||
self._cache = []
|
self._cache = []
|
||||||
|
|
||||||
def next(self):
|
def next(self):
|
||||||
if not self._cache:
|
if not self._cache:
|
||||||
self._cache.extend(numpy.random.choice(len(self.words), 10000,
|
self._cache.extend(
|
||||||
p=self.probs))
|
numpy.random.choice(len(self.words), 10000, p=self.probs)
|
||||||
|
)
|
||||||
index = self._cache.pop()
|
index = self._cache.pop()
|
||||||
return self.words[index]
|
return self.words[index]
|
||||||
|
|
||||||
|
|
||||||
class ProgressTracker(object):
|
class ProgressTracker(object):
|
||||||
def __init__(self, frequency=1000000):
|
def __init__(self, frequency=1000000):
|
||||||
|
@ -245,76 +326,3 @@ class ProgressTracker(object):
|
||||||
return status
|
return status
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
|
||||||
texts_loc=("Path to jsonl file with texts to learn from", "positional", None, str),
|
|
||||||
vectors_model=("Name or path to vectors model to learn from"),
|
|
||||||
output_dir=("Directory to write models each epoch", "positional", None, str),
|
|
||||||
width=("Width of CNN layers", "option", "cw", int),
|
|
||||||
depth=("Depth of CNN layers", "option", "cd", int),
|
|
||||||
embed_rows=("Embedding rows", "option", "er", int),
|
|
||||||
use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
|
|
||||||
dropout=("Dropout", "option", "d", float),
|
|
||||||
seed=("Seed for random number generators", "option", "s", float),
|
|
||||||
nr_iter=("Number of iterations to pretrain", "option", "i", int),
|
|
||||||
)
|
|
||||||
def pretrain(texts_loc, vectors_model, output_dir, width=96, depth=4,
|
|
||||||
embed_rows=2000, use_vectors=False, dropout=0.2, nr_iter=1000, seed=0):
|
|
||||||
"""
|
|
||||||
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
|
|
||||||
using an approximate language-modelling objective. Specifically, we load
|
|
||||||
pre-trained vectors, and train a component like a CNN, BiLSTM, etc to predict
|
|
||||||
vectors which match the pre-trained ones. The weights are saved to a directory
|
|
||||||
after each epoch. You can then pass a path to one of these pre-trained weights
|
|
||||||
files to the 'spacy train' command.
|
|
||||||
|
|
||||||
This technique may be especially helpful if you have little labelled data.
|
|
||||||
However, it's still quite experimental, so your mileage may vary.
|
|
||||||
|
|
||||||
To load the weights back in during 'spacy train', you need to ensure
|
|
||||||
all settings are the same between pretraining and training. The API and
|
|
||||||
errors around this need some improvement.
|
|
||||||
"""
|
|
||||||
config = dict(locals())
|
|
||||||
output_dir = ensure_path(output_dir)
|
|
||||||
random.seed(seed)
|
|
||||||
numpy.random.seed(seed)
|
|
||||||
if not output_dir.exists():
|
|
||||||
output_dir.mkdir()
|
|
||||||
with (output_dir / 'config.json').open('w') as file_:
|
|
||||||
file_.write(json.dumps(config))
|
|
||||||
has_gpu = prefer_gpu()
|
|
||||||
print("Use GPU?", has_gpu)
|
|
||||||
nlp = spacy.load(vectors_model)
|
|
||||||
pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
|
|
||||||
model = create_pretraining_model(nlp,
|
|
||||||
Tok2Vec(width, embed_rows,
|
|
||||||
conv_depth=depth,
|
|
||||||
pretrained_vectors=pretrained_vectors,
|
|
||||||
bilstm_depth=0, # Requires PyTorch. Experimental.
|
|
||||||
cnn_maxout_pieces=2, # You can try setting this higher
|
|
||||||
subword_features=True)) # Set to False for character models, e.g. Chinese
|
|
||||||
optimizer = create_default_optimizer(model.ops)
|
|
||||||
tracker = ProgressTracker()
|
|
||||||
print('Epoch', '#Words', 'Loss', 'w/s')
|
|
||||||
texts = stream_texts() if texts_loc == '-' else load_texts(texts_loc)
|
|
||||||
for epoch in range(nr_iter):
|
|
||||||
for batch in minibatch_by_words(((text, None) for text in texts), size=5000):
|
|
||||||
docs = make_docs(nlp, [text for (text, _) in batch])
|
|
||||||
loss = make_update(model, docs, optimizer, drop=dropout)
|
|
||||||
progress = tracker.update(epoch, loss, docs)
|
|
||||||
if progress:
|
|
||||||
print(*progress)
|
|
||||||
if texts_loc == '-' and tracker.words_per_epoch[epoch] >= 10**7:
|
|
||||||
break
|
|
||||||
with model.use_params(optimizer.averages):
|
|
||||||
with (output_dir / ('model%d.bin' % epoch)).open('wb') as file_:
|
|
||||||
file_.write(model.tok2vec.to_bytes())
|
|
||||||
with (output_dir / 'log.jsonl').open('a') as file_:
|
|
||||||
file_.write(json.dumps({'nr_word': tracker.nr_word,
|
|
||||||
'loss': tracker.loss, 'epoch_loss': tracker.epoch_loss,
|
|
||||||
'epoch': epoch}) + '\n')
|
|
||||||
tracker.epoch_loss = 0.0
|
|
||||||
if texts_loc != '-':
|
|
||||||
texts = load_texts(texts_loc)
|
|
||||||
|
|
|
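A hedged sketch of calling the pretrain command above as a plain function (the module path and file names are assumptions; the argument names come from the signature shown):

from spacy.cli.pretrain import pretrain   # assumed module path

pretrain(
    "texts.jsonl",      # texts_loc: JSONL file with texts to learn from, or "-" for stdin
    "en_core_web_md",   # vectors_model: placeholder name for a model with pre-trained vectors
    "pretrain-output",  # output_dir: model%d.bin and log.jsonl are written here each epoch
    width=96,
    depth=4,
    nr_iter=10,
)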
@ -6,45 +6,64 @@ from pathlib import Path
|
||||||
import ujson
|
import ujson
|
||||||
import cProfile
|
import cProfile
|
||||||
import pstats
|
import pstats
|
||||||
|
|
||||||
import spacy
|
|
||||||
import sys
|
import sys
|
||||||
import tqdm
|
import tqdm
|
||||||
import cytoolz
|
import cytoolz
|
||||||
import thinc.extra.datasets
|
import thinc.extra.datasets
|
||||||
|
from wasabi import Printer
|
||||||
|
|
||||||
|
from ..util import load_model
|
||||||
def read_inputs(loc):
|
|
||||||
if loc is None:
|
|
||||||
file_ = sys.stdin
|
|
||||||
file_ = (line.encode('utf8') for line in file_)
|
|
||||||
else:
|
|
||||||
file_ = Path(loc).open()
|
|
||||||
for line in file_:
|
|
||||||
data = ujson.loads(line)
|
|
||||||
text = data['text']
|
|
||||||
yield text
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
lang=("model/language", "positional", None, str),
|
model=("Model to load", "positional", None, str),
|
||||||
inputs=("Location of input file", "positional", None, read_inputs))
|
inputs=("Location of input file. '-' for stdin.", "positional", None, str),
|
||||||
def profile(lang, inputs=None):
|
n_texts=("Maximum number of texts to use if available", "option", "n", int),
|
||||||
|
)
|
||||||
|
def profile(model, inputs=None, n_texts=10000):
|
||||||
"""
|
"""
|
||||||
Profile a spaCy pipeline, to find out which functions take the most time.
|
Profile a spaCy pipeline, to find out which functions take the most time.
|
||||||
|
Input should be formatted as one JSON object per line with a key "text".
|
||||||
|
It can either be provided as a JSONL file, or be read from sys.stdin.
|
||||||
|
If no input file is specified, the IMDB dataset is loaded via Thinc.
|
||||||
"""
|
"""
|
||||||
|
msg = Printer()
|
||||||
|
if inputs is not None:
|
||||||
|
inputs = _read_inputs(inputs, msg)
|
||||||
if inputs is None:
|
if inputs is None:
|
||||||
imdb_train, _ = thinc.extra.datasets.imdb()
|
n_inputs = 25000
|
||||||
inputs, _ = zip(*imdb_train)
|
with msg.loading("Loading IMDB dataset via Thinc..."):
|
||||||
inputs = inputs[:25000]
|
imdb_train, _ = thinc.extra.datasets.imdb()
|
||||||
nlp = spacy.load(lang)
|
inputs, _ = zip(*imdb_train)
|
||||||
texts = list(cytoolz.take(10000, inputs))
|
msg.info("Loaded IMDB dataset and using {} examples".format(n_inputs))
|
||||||
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(),
|
inputs = inputs[:n_inputs]
|
||||||
"Profile.prof")
|
with msg.loading("Loading model '{}'...".format(model)):
|
||||||
|
nlp = load_model(model)
|
||||||
|
msg.good("Loaded model '{}'".format(model))
|
||||||
|
texts = list(cytoolz.take(n_texts, inputs))
|
||||||
|
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
|
||||||
s = pstats.Stats("Profile.prof")
|
s = pstats.Stats("Profile.prof")
|
||||||
|
msg.divider("Profile stats")
|
||||||
s.strip_dirs().sort_stats("time").print_stats()
|
s.strip_dirs().sort_stats("time").print_stats()
|
||||||
|
|
||||||
|
|
||||||
def parse_texts(nlp, texts):
|
def parse_texts(nlp, texts):
|
||||||
for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
|
for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def _read_inputs(loc, msg):
|
||||||
|
if loc == "-":
|
||||||
|
msg.info("Reading input from sys.stdin")
|
||||||
|
file_ = sys.stdin
|
||||||
|
file_ = (line.encode("utf8") for line in file_)
|
||||||
|
else:
|
||||||
|
input_path = Path(loc)
|
||||||
|
if not input_path.exists() or not input_path.is_file():
|
||||||
|
msg.fail("Not a valid input data file", loc, exits=1)
|
||||||
|
msg.info("Using data from {}".format(input_path.parts[-1]))
|
||||||
|
file_ = input_path.open()
|
||||||
|
for line in file_:
|
||||||
|
data = ujson.loads(line)
|
||||||
|
text = data["text"]
|
||||||
|
yield text
|
||||||
|
|
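For reference, the input format described in the profile docstring above (one JSON object per line with a "text" key) can be produced with a few lines of Python; a minimal sketch:

import json   # the module itself uses ujson; the stdlib json writes the same format

texts = ["This is a sentence.", "This is another one."]
with open("inputs.jsonl", "w") as f:
    for text in texts:
        f.write(json.dumps({"text": text}) + "\n")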
51
spacy/cli/schemas/__init__.py
Normal file
|
@ -0,0 +1,51 @@
|
||||||
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from jsonschema import Draft4Validator
|
||||||
|
|
||||||
|
from ...errors import Errors
|
||||||
|
from ...util import read_json
|
||||||
|
|
||||||
|
|
||||||
|
SCHEMAS = {}
|
||||||
|
|
||||||
|
|
||||||
|
def get_schema(name):
|
||||||
|
"""Get the JSON schema for a given name. Looks for a .json file in
|
||||||
|
spacy.cli.schemas, validates the schema and raises ValueError if not found.
|
||||||
|
|
||||||
|
EXAMPLE:
|
||||||
|
>>> schema = get_schema('training')
|
||||||
|
|
||||||
|
name (unicode): The name of the schema.
|
||||||
|
RETURNS (dict): The JSON schema.
|
||||||
|
"""
|
||||||
|
if name not in SCHEMAS:
|
||||||
|
schema_path = Path(__file__).parent / "{}.json".format(name)
|
||||||
|
if not schema_path.exists():
|
||||||
|
raise ValueError(Errors.E104.format(name=name))
|
||||||
|
schema = read_json(schema_path)
|
||||||
|
# TODO: replace with (stable) Draft6Validator, if available
|
||||||
|
validator = Draft4Validator(schema)
|
||||||
|
validator.check_schema(schema)
|
||||||
|
SCHEMAS[name] = schema
|
||||||
|
return SCHEMAS[name]
|
||||||
|
|
||||||
|
|
||||||
|
def validate_json(data, schema):
|
||||||
|
"""Validate data against a given JSON schema (see https://json-schema.org).
|
||||||
|
|
||||||
|
data: JSON-serializable data to validate.
|
||||||
|
schema (dict): The JSON schema.
|
||||||
|
RETURNS (list): A list of error messages, if available.
|
||||||
|
"""
|
||||||
|
validator = Draft4Validator(schema)
|
||||||
|
errors = []
|
||||||
|
for err in sorted(validator.iter_errors(data), key=lambda e: e.path):
|
||||||
|
if err.path:
|
||||||
|
err_path = "[{}]".format(" -> ".join([str(p) for p in err.path]))
|
||||||
|
else:
|
||||||
|
err_path = ""
|
||||||
|
errors.append(err.message + " " + err_path)
|
||||||
|
return errors
|
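A minimal usage sketch for the two helpers above (the package path spacy.cli.schemas follows from this file's location; the "meta" name refers to the meta.json schema that follows):

from spacy.cli.schemas import get_schema, validate_json

meta = {"lang": "en", "name": "core_web_sm", "version": "2.0.0"}
errors = validate_json(meta, get_schema("meta"))
if errors:
    print("\n".join(errors))   # each entry is the jsonschema message plus the offending path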
128
spacy/cli/schemas/meta.json
Normal file
|
@ -0,0 +1,128 @@
|
||||||
|
{
|
||||||
|
"$schema": "http://json-schema.org/draft-06/schema",
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"lang": {
|
||||||
|
"title": "Two-letter language code, e.g. 'en'",
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 2,
|
||||||
|
"maxLength": 2,
|
||||||
|
"pattern": "^[a-z]*$"
|
||||||
|
},
|
||||||
|
"name": {
|
||||||
|
"title": "Model name",
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1,
|
||||||
|
"pattern": "^[a-z_]*$"
|
||||||
|
},
|
||||||
|
"version": {
|
||||||
|
"title": "Model version",
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1,
|
||||||
|
"pattern": "^[0-9a-z.-]*$"
|
||||||
|
},
|
||||||
|
"spacy_version": {
|
||||||
|
"title": "Compatible spaCy version identifier",
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1,
|
||||||
|
"pattern": "^[0-9a-z.-><=]*$"
|
||||||
|
},
|
||||||
|
"parent_package": {
|
||||||
|
"title": "Name of parent spaCy package, e.g. spacy or spacy-nightly",
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1,
|
||||||
|
"default": "spacy"
|
||||||
|
},
|
||||||
|
"pipeline": {
|
||||||
|
"title": "Names of pipeline components",
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"description": {
|
||||||
|
"title": "Model description",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"license": {
|
||||||
|
"title": "Model license",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"author": {
|
||||||
|
"title": "Model author name",
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"email": {
|
||||||
|
"title": "Model author email",
|
||||||
|
"type": "string",
|
||||||
|
"format": "email"
|
||||||
|
},
|
||||||
|
"url": {
|
||||||
|
"title": "Model author URL",
|
||||||
|
"type": "string",
|
||||||
|
"format": "uri"
|
||||||
|
},
|
||||||
|
"sources": {
|
||||||
|
"title": "Training data sources",
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"vectors": {
|
||||||
|
"title": "Included word vectors",
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"keys": {
|
||||||
|
"title": "Number of unique keys",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"vectors": {
|
||||||
|
"title": "Number of unique vectors",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"width": {
|
||||||
|
"title": "Number of dimensions",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"accuracy": {
|
||||||
|
"title": "Accuracy numbers",
|
||||||
|
"type": "object",
|
||||||
|
"patternProperties": {
|
||||||
|
"*": {
|
||||||
|
"type": "number",
|
||||||
|
"minimum": 0.0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"speed": {
|
||||||
|
"title": "Speed evaluation numbers",
|
||||||
|
"type": "object",
|
||||||
|
"patternProperties": {
|
||||||
|
"*": {
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"type": "number",
|
||||||
|
"minimum": 0.0
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"lang",
|
||||||
|
"name",
|
||||||
|
"version"
|
||||||
|
]
|
||||||
|
}
|
146
spacy/cli/schemas/training.json
Normal file
|
@ -0,0 +1,146 @@
|
||||||
|
{
|
||||||
|
"$schema": "http://json-schema.org/draft-06/schema",
|
||||||
|
"title": "Training data for spaCy models",
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"text": {
|
||||||
|
"title": "The text of the training example",
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1
|
||||||
|
},
|
||||||
|
"ents": {
|
||||||
|
"title": "Named entity spans in the text",
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"start": {
|
||||||
|
"title": "Start character offset of the span",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"end": {
|
||||||
|
"title": "End character offset of the span",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"label": {
|
||||||
|
"title": "Entity label",
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1,
|
||||||
|
"pattern": "^[A-Z0-9]*$"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"start",
|
||||||
|
"end",
|
||||||
|
"label"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"sents": {
|
||||||
|
"title": "Sentence spans in the text",
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"start": {
|
||||||
|
"title": "Start character offset of the span",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"end": {
|
||||||
|
"title": "End character offset of the span",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"start",
|
||||||
|
"end"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"cats": {
|
||||||
|
"title": "Text categories for the text classifier",
|
||||||
|
"type": "object",
|
||||||
|
"patternProperties": {
|
||||||
|
"*": {
|
||||||
|
"title": "A text category",
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"type": "boolean"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "number",
|
||||||
|
"minimum": 0
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"propertyNames": {
|
||||||
|
"pattern": "^[A-Z0-9]*$",
|
||||||
|
"minLength": 1
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"tokens": {
|
||||||
|
"title": "The tokens in the text",
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"minProperties": 1,
|
||||||
|
"properties": {
|
||||||
|
"id": {
|
||||||
|
"title": "Token ID, usually token index",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"start": {
|
||||||
|
"title": "Start character offset of the token",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"end": {
|
||||||
|
"title": "End character offset of the token",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
},
|
||||||
|
"pos": {
|
||||||
|
"title": "Coarse-grained part-of-speech tag",
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1
|
||||||
|
},
|
||||||
|
"tag": {
|
||||||
|
"title": "Fine-grained part-of-speech tag",
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1
|
||||||
|
},
|
||||||
|
"dep": {
|
||||||
|
"title": "Dependency label",
|
||||||
|
"type": "string",
|
||||||
|
"minLength": 1
|
||||||
|
},
|
||||||
|
"head": {
|
||||||
|
"title": "Index of the token's head",
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"start",
|
||||||
|
"end"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"_": {
|
||||||
|
"title": "Custom user space",
|
||||||
|
"type": "object"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": [
|
||||||
|
"text"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
|
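To make the schema above concrete, here is a minimal example (written as a Python literal) that should validate against it; the annotation values are purely illustrative:

train_data = [
    {
        "text": "London is big.",
        "ents": [{"start": 0, "end": 6, "label": "GPE"}],
        "sents": [{"start": 0, "end": 14}],
        "cats": {"POSITIVE": 1.0},
        "tokens": [
            {"id": 0, "start": 0, "end": 6, "pos": "PROPN", "dep": "nsubj", "head": 1},
            {"id": 1, "start": 7, "end": 9, "pos": "VERB", "dep": "ROOT", "head": 1},
            {"id": 2, "start": 10, "end": 13, "pos": "ADJ", "dep": "acomp", "head": 1},
            {"id": 3, "start": 13, "end": 14, "pos": "PUNCT", "dep": "punct", "head": 1},
        ],
    }
]
# validate_json(train_data, get_schema("training")) should then return an empty list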
@ -6,213 +6,296 @@ from pathlib import Path
|
||||||
import tqdm
|
import tqdm
|
||||||
from thinc.neural._classes.model import Model
|
from thinc.neural._classes.model import Model
|
||||||
from timeit import default_timer as timer
|
from timeit import default_timer as timer
|
||||||
import json
|
|
||||||
import shutil
|
import shutil
|
||||||
|
from wasabi import Printer
|
||||||
|
|
||||||
from ._messages import Messages
|
from ._messages import Messages
|
||||||
|
from .._ml import create_default_optimizer
|
||||||
from ..attrs import PROB, IS_OOV, CLUSTER, LANG
|
from ..attrs import PROB, IS_OOV, CLUSTER, LANG
|
||||||
from ..gold import GoldCorpus
|
from ..gold import GoldCorpus
|
||||||
from ..util import prints, minibatch, minibatch_by_words
|
|
||||||
from .. import util
|
from .. import util
|
||||||
from .. import about
|
from .. import about
|
||||||
from .. import displacy
|
|
||||||
from ..compat import json_dumps
|
|
||||||
|
# Take dropout and batch size as generators of values -- dropout
|
||||||
|
# starts high and decays sharply, to force the optimizer to explore.
|
||||||
|
# Batch size starts at 1 and grows, so that we make updates quickly
|
||||||
|
# at the beginning of training.
|
||||||
|
dropout_rates = util.decaying(
|
||||||
|
util.env_opt("dropout_from", 0.2),
|
||||||
|
util.env_opt("dropout_to", 0.2),
|
||||||
|
util.env_opt("dropout_decay", 0.0),
|
||||||
|
)
|
||||||
|
batch_sizes = util.compounding(
|
||||||
|
util.env_opt("batch_from", 1000),
|
||||||
|
util.env_opt("batch_to", 1000),
|
||||||
|
util.env_opt("batch_compound", 1.001),
|
||||||
|
)
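The two schedules above are plain generators; a small sketch of how they behave (util.decaying and util.compounding take start, stop and rate arguments, and the numbers here are only meant to show the shape of the schedules, not the training defaults):

from spacy import util

dropouts = util.decaying(0.6, 0.2, 1e-4)          # start, stop, decay
batch_sizes = util.compounding(1.0, 32.0, 1.001)  # start, stop, compound
print([next(dropouts) for _ in range(3)])         # values decay from 0.6 towards 0.2
print([next(batch_sizes) for _ in range(3)])      # values grow from 1.0 towards 32.0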
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
lang=("model language", "positional", None, str),
|
lang=("Model language", "positional", None, str),
|
||||||
output_dir=("output directory to store model in", "positional", None, str),
|
output_path=("Output directory to store model in", "positional", None, Path),
|
||||||
train_data=("location of JSON-formatted training data", "positional",
|
train_path=("Location of JSON-formatted training data", "positional", None, Path),
|
||||||
None, str),
|
dev_path=("Location of JSON-formatted development data", "positional", None, Path),
|
||||||
dev_data=("location of JSON-formatted development data (optional)",
|
base_model=("Name of model to update (optional)", "option", "b", str),
|
||||||
"positional", None, str),
|
pipeline=("Comma-separated names of pipeline components", "option", "p", str),
|
||||||
n_iter=("number of iterations", "option", "n", int),
|
vectors=("Model to load vectors from", "option", "v", str),
|
||||||
n_sents=("number of sentences", "option", "ns", int),
|
n_iter=("Number of iterations", "option", "n", int),
|
||||||
|
n_examples=("Number of examples", "option", "ns", int),
|
||||||
use_gpu=("Use GPU", "option", "g", int),
|
use_gpu=("Use GPU", "option", "g", int),
|
||||||
vectors=("Model to load vectors from", "option", "v"),
|
|
||||||
no_tagger=("Don't train tagger", "flag", "T", bool),
|
|
||||||
no_parser=("Don't train parser", "flag", "P", bool),
|
|
||||||
no_entities=("Don't train NER", "flag", "N", bool),
|
|
||||||
parser_multitasks=("Side objectives for parser CNN, e.g. dep dep,tag", "option", "pt", str),
|
|
||||||
noise_level=("Amount of corruption to add for data augmentation", "option", "nl", float),
|
|
||||||
entity_multitasks=("Side objectives for ner CNN, e.g. dep dep,tag", "option", "et", str),
|
|
||||||
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
|
||||||
version=("Model version", "option", "V", str),
|
version=("Model version", "option", "V", str),
|
||||||
meta_path=("Optional path to meta.json. All relevant properties will be "
|
meta_path=("Optional path to meta.json to use as base.", "option", "m", Path),
|
||||||
"overwritten.", "option", "m", Path),
|
init_tok2vec=(
|
||||||
init_tok2vec=("Path to pretrained weights for the token-to-vector parts "
|
"Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.",
|
||||||
"of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path),
|
"option",
|
||||||
verbose=("Display more information for debug", "option", None, bool))
|
"t2v",
|
||||||
def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
Path,
|
||||||
parser_multitasks='', entity_multitasks='', init_tok2vec=None,
|
),
|
||||||
use_gpu=-1, vectors=None, no_tagger=False, noise_level=0.0,
|
parser_multitasks=(
|
||||||
no_parser=False, no_entities=False, gold_preproc=False,
|
"Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'",
|
||||||
version="0.0.0", meta_path=None, verbose=False):
|
"option",
|
||||||
|
"pt",
|
||||||
|
str,
|
||||||
|
),
|
||||||
|
entity_multitasks=(
|
||||||
|
"Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'",
|
||||||
|
"option",
|
||||||
|
"et",
|
||||||
|
str,
|
||||||
|
),
|
||||||
|
noise_level=("Amount of corruption for data augmentation", "option", "nl", float),
|
||||||
|
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
||||||
|
learn_tokens=("Make parser learn gold-standard tokenization", "flag", "T", bool),
|
||||||
|
verbose=("Display more information for debug", "flag", "VV", bool),
|
||||||
|
debug=("Run data diagnostics before training", "flag", "D", bool),
|
||||||
|
)
|
||||||
|
def train(
|
||||||
|
lang,
|
||||||
|
output_path,
|
||||||
|
train_path,
|
||||||
|
dev_path,
|
||||||
|
base_model=None,
|
||||||
|
pipeline="tagger,parser,ner",
|
||||||
|
vectors=None,
|
||||||
|
n_iter=30,
|
||||||
|
n_examples=0,
|
||||||
|
use_gpu=-1,
|
||||||
|
version="0.0.0",
|
||||||
|
meta_path=None,
|
||||||
|
init_tok2vec=None,
|
||||||
|
parser_multitasks="",
|
||||||
|
entity_multitasks="",
|
||||||
|
noise_level=0.0,
|
||||||
|
gold_preproc=False,
|
||||||
|
learn_tokens=False,
|
||||||
|
verbose=False,
|
||||||
|
debug=False,
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
Train a model. Expects data in spaCy's JSON format.
|
Train or update a spaCy model. Requires data to be formatted in spaCy's
|
||||||
|
JSON format. To convert data from other formats, use the `spacy convert`
|
||||||
|
command.
|
||||||
"""
|
"""
|
||||||
|
msg = Printer()
|
||||||
util.fix_random_seed()
|
util.fix_random_seed()
|
||||||
util.set_env_log(True)
|
util.set_env_log(verbose)
|
||||||
n_sents = n_sents or None
|
|
||||||
output_path = util.ensure_path(output_dir)
|
# Make sure all files and paths exists if they are needed
|
||||||
train_path = util.ensure_path(train_data)
|
train_path = util.ensure_path(train_path)
|
||||||
dev_path = util.ensure_path(dev_data)
|
dev_path = util.ensure_path(dev_path)
|
||||||
meta_path = util.ensure_path(meta_path)
|
meta_path = util.ensure_path(meta_path)
|
||||||
if not train_path.exists():
|
if not train_path or not train_path.exists():
|
||||||
prints(train_path, title=Messages.M050, exits=1)
|
msg.fail(Messages.M050, train_path, exits=1)
|
||||||
if dev_path and not dev_path.exists():
|
if not dev_path or not dev_path.exists():
|
||||||
prints(dev_path, title=Messages.M051, exits=1)
|
msg.fail(Messages.M051, dev_path, exits=1)
|
||||||
if meta_path is not None and not meta_path.exists():
|
if meta_path is not None and not meta_path.exists():
|
||||||
prints(meta_path, title=Messages.M020, exits=1)
|
msg.fail(Messages.M020, meta_path, exits=1)
|
||||||
meta = util.read_json(meta_path) if meta_path else {}
|
meta = util.read_json(meta_path) if meta_path else {}
|
||||||
if not isinstance(meta, dict):
|
if not isinstance(meta, dict):
|
||||||
prints(Messages.M053.format(meta_type=type(meta)),
|
msg.fail(Messages.M052, Messages.M053.format(meta_type=type(meta)), exits=1)
|
||||||
title=Messages.M052, exits=1)
|
if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
|
||||||
meta.setdefault('lang', lang)
|
msg.fail(Messages.M062, Messages.M065)
|
||||||
meta.setdefault('name', 'unnamed')
|
|
||||||
|
|
||||||
if not output_path.exists():
|
if not output_path.exists():
|
||||||
output_path.mkdir()
|
output_path.mkdir()
|
||||||
|
|
||||||
print("Counting training words (limit=%s" % n_sents)
|
# Set up the base model and pipeline. If a base model is specified, load
|
||||||
corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
|
# the model and make sure the pipeline matches the pipeline setting. If
|
||||||
n_train_words = corpus.count_train()
|
# training starts from a blank model, initialize the language class.
|
||||||
print(n_train_words)
|
pipeline = [p.strip() for p in pipeline.split(",")]
|
||||||
pipeline = ['tagger', 'parser', 'ner']
|
msg.text(Messages.M055.format(pipeline=pipeline))
|
||||||
if no_tagger and 'tagger' in pipeline:
|
if base_model:
|
||||||
pipeline.remove('tagger')
|
msg.text(Messages.M056.format(model=base_model))
|
||||||
if no_parser and 'parser' in pipeline:
|
nlp = util.load_model(base_model)
|
||||||
pipeline.remove('parser')
|
if nlp.lang != lang:
|
||||||
if no_entities and 'ner' in pipeline:
|
msg.fail(Messages.M072.format(model_lang=nlp.lang, lang=lang), exits=1)
|
||||||
pipeline.remove('ner')
|
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipeline]
|
||||||
|
nlp.disable_pipes(*other_pipes)
|
||||||
|
for pipe in pipeline:
|
||||||
|
if pipe not in nlp.pipe_names:
|
||||||
|
nlp.add_pipe(nlp.create_pipe(pipe))
|
||||||
|
else:
|
||||||
|
msg.text(Messages.M057.format(model=lang))
|
||||||
|
lang_cls = util.get_lang_class(lang)
|
||||||
|
nlp = lang_cls()
|
||||||
|
for pipe in pipeline:
|
||||||
|
nlp.add_pipe(nlp.create_pipe(pipe))
|
||||||
|
|
||||||
|
if learn_tokens:
|
||||||
|
nlp.add_pipe(nlp.create_pipe("merge_subtokens"))
|
||||||
|
|
||||||
# Take dropout and batch size as generators of values -- dropout
|
# Take dropout and batch size as generators of values -- dropout
|
||||||
# starts high and decays sharply, to force the optimizer to explore.
|
# starts high and decays sharply, to force the optimizer to explore.
|
||||||
# Batch size starts at 1 and grows, so that we make updates quickly
|
# Batch size starts at 1 and grows, so that we make updates quickly
|
||||||
# at the beginning of training.
|
# at the beginning of training.
|
||||||
dropout_rates = util.decaying(util.env_opt('dropout_from', 0.1),
|
dropout_rates = util.decaying(
|
||||||
util.env_opt('dropout_to', 0.1),
|
util.env_opt("dropout_from", 0.1),
|
||||||
util.env_opt('dropout_decay', 0.0))
|
util.env_opt("dropout_to", 0.1),
|
||||||
batch_sizes = util.compounding(util.env_opt('batch_from', 750),
|
util.env_opt("dropout_decay", 0.0),
|
||||||
util.env_opt('batch_to', 750),
|
)
|
||||||
util.env_opt('batch_compound', 1.001))
|
batch_sizes = util.compounding(
|
||||||
|
util.env_opt("batch_from", 750),
|
||||||
|
util.env_opt("batch_to", 750),
|
||||||
|
util.env_opt("batch_compound", 1.001),
|
||||||
|
)
|
||||||
lang_class = util.get_lang_class(lang)
|
lang_class = util.get_lang_class(lang)
|
||||||
nlp = lang_class()
|
nlp = lang_class()
|
||||||
meta['pipeline'] = pipeline
|
meta["pipeline"] = pipeline
|
||||||
nlp.meta.update(meta)
|
nlp.meta.update(meta)
|
||||||
if vectors:
|
if vectors:
|
||||||
print("Load vectors model", vectors)
|
msg.text(Messages.M058.format(model=vectors))
|
||||||
util.load_model(vectors, vocab=nlp.vocab)
|
_load_vectors(nlp, vectors)
|
||||||
for lex in nlp.vocab:
|
|
||||||
values = {}
|
# Multitask objectives
|
||||||
for attr, func in nlp.vocab.lex_attr_getters.items():
|
multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)]
|
||||||
# These attrs are expected to be set by data. Others should
|
for pipe_name, multitasks in multitask_options:
|
||||||
# be set by calling the language functions.
|
if multitasks:
|
||||||
if attr not in (CLUSTER, PROB, IS_OOV, LANG):
|
if pipe_name not in pipeline:
|
||||||
values[lex.vocab.strings[attr]] = func(lex.orth_)
|
msg.fail(Messages.M059.format(pipe=pipe_name))
|
||||||
lex.set_attrs(**values)
|
pipe = nlp.get_pipe(pipe_name)
|
||||||
lex.is_oov = False
|
for objective in multitasks.split(","):
|
||||||
for name in pipeline:
|
pipe.add_multitask_objective(objective)
|
||||||
nlp.add_pipe(nlp.create_pipe(name), name=name)
|
|
||||||
nlp.add_pipe(nlp.create_pipe('merge_subtokens'))
|
# Prepare training corpus
|
||||||
if parser_multitasks:
|
msg.text(Messages.M060.format(limit=n_examples))
|
||||||
for objective in parser_multitasks.split(','):
|
corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
|
||||||
nlp.parser.add_multitask_objective(objective)
|
n_train_words = corpus.count_train()
|
||||||
if entity_multitasks:
|
|
||||||
for objective in entity_multitasks.split(','):
|
if base_model:
|
||||||
nlp.entity.add_multitask_objective(objective)
|
# Start with an existing model, use default optimizer
|
||||||
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
|
optimizer = create_default_optimizer(Model.ops)
|
||||||
if init_tok2vec is not None:
|
else:
|
||||||
loaded = _load_pretrained_tok2vec(nlp, init_tok2vec)
|
# Start with a blank model, call begin_training
|
||||||
print("Loaded pretrained tok2vec for:", loaded)
|
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
|
||||||
nlp._optimizer = None
|
nlp._optimizer = None
|
||||||
|
|
||||||
print("Itn. Dep Loss NER Loss UAS NER P. NER R. NER F. Tag % Token % CPU WPS GPU WPS")
|
# Load in pre-trained weights
|
||||||
|
if init_tok2vec is not None:
|
||||||
|
components = _load_pretrained_tok2vec(nlp, init_tok2vec)
|
||||||
|
msg.text(Messages.M071.format(components=components))
|
||||||
|
|
||||||
|
print(
|
||||||
|
"\nItn. Dep Loss NER Loss UAS NER P. NER R. NER F. Tag % Token % CPU WPS GPU WPS"
|
||||||
|
)
|
||||||
try:
|
try:
|
||||||
for i in range(n_iter):
|
for i in range(n_iter):
|
||||||
train_docs = corpus.train_docs(nlp, noise_level=noise_level,
|
train_docs = corpus.train_docs(
|
||||||
gold_preproc=gold_preproc, max_length=0)
|
nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0
|
||||||
|
)
|
||||||
words_seen = 0
|
words_seen = 0
|
||||||
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
|
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
|
||||||
losses = {}
|
losses = {}
|
||||||
for batch in minibatch_by_words(train_docs, size=batch_sizes):
|
for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
|
||||||
if not batch:
|
if not batch:
|
||||||
continue
|
continue
|
||||||
docs, golds = zip(*batch)
|
docs, golds = zip(*batch)
|
||||||
nlp.update(docs, golds, sgd=optimizer,
|
nlp.update(
|
||||||
drop=next(dropout_rates), losses=losses)
|
docs,
|
||||||
|
golds,
|
||||||
|
sgd=optimizer,
|
||||||
|
drop=next(dropout_rates),
|
||||||
|
losses=losses,
|
||||||
|
)
|
||||||
pbar.update(sum(len(doc) for doc in docs))
|
pbar.update(sum(len(doc) for doc in docs))
|
||||||
words_seen += sum(len(doc) for doc in docs)
|
words_seen += sum(len(doc) for doc in docs)
|
||||||
with nlp.use_params(optimizer.averages):
|
with nlp.use_params(optimizer.averages):
|
||||||
util.set_env_log(False)
|
util.set_env_log(False)
|
||||||
epoch_model_path = output_path / ('model%d' % i)
|
epoch_model_path = output_path / ("model%d" % i)
|
||||||
nlp.to_disk(epoch_model_path)
|
nlp.to_disk(epoch_model_path)
|
||||||
nlp_loaded = util.load_model_from_path(epoch_model_path)
|
nlp_loaded = util.load_model_from_path(epoch_model_path)
|
||||||
dev_docs = list(corpus.dev_docs(
|
dev_docs = list(corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc))
|
||||||
nlp_loaded,
|
|
||||||
gold_preproc=gold_preproc))
|
|
||||||
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
|
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
|
||||||
start_time = timer()
|
start_time = timer()
|
||||||
scorer = nlp_loaded.evaluate(dev_docs, verbose)
|
scorer = nlp_loaded.evaluate(dev_docs, debug)
|
||||||
end_time = timer()
|
end_time = timer()
|
||||||
if use_gpu < 0:
|
if use_gpu < 0:
|
||||||
gpu_wps = None
|
gpu_wps = None
|
||||||
cpu_wps = nwords/(end_time-start_time)
|
cpu_wps = nwords / (end_time - start_time)
|
||||||
else:
|
else:
|
||||||
gpu_wps = nwords/(end_time-start_time)
|
gpu_wps = nwords / (end_time - start_time)
|
||||||
with Model.use_device('cpu'):
|
with Model.use_device("cpu"):
|
||||||
nlp_loaded = util.load_model_from_path(epoch_model_path)
|
nlp_loaded = util.load_model_from_path(epoch_model_path)
|
||||||
dev_docs = list(corpus.dev_docs(
|
dev_docs = list(
|
||||||
nlp_loaded, gold_preproc=gold_preproc))
|
corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)
|
||||||
|
)
|
||||||
start_time = timer()
|
start_time = timer()
|
||||||
scorer = nlp_loaded.evaluate(dev_docs)
|
scorer = nlp_loaded.evaluate(dev_docs)
|
||||||
end_time = timer()
|
end_time = timer()
|
||||||
cpu_wps = nwords/(end_time-start_time)
|
cpu_wps = nwords / (end_time - start_time)
|
||||||
acc_loc = (output_path / ('model%d' % i) / 'accuracy.json')
|
acc_loc = output_path / ("model%d" % i) / "accuracy.json"
|
||||||
with acc_loc.open('w') as file_:
|
util.write_json(acc_loc, scorer.scores)
|
||||||
file_.write(json_dumps(scorer.scores))
|
|
||||||
meta_loc = output_path / ('model%d' % i) / 'meta.json'
|
|
||||||
meta['accuracy'] = scorer.scores
|
|
||||||
meta['speed'] = {'nwords': nwords, 'cpu': cpu_wps,
|
|
||||||
'gpu': gpu_wps}
|
|
||||||
meta['vectors'] = {'width': nlp.vocab.vectors_length,
|
|
||||||
'vectors': len(nlp.vocab.vectors),
|
|
||||||
'keys': nlp.vocab.vectors.n_keys}
|
|
||||||
meta['lang'] = nlp.lang
|
|
||||||
meta['pipeline'] = pipeline
|
|
||||||
meta['spacy_version'] = '>=%s' % about.__version__
|
|
||||||
meta.setdefault('name', 'model%d' % i)
|
|
||||||
meta.setdefault('version', version)
|
|
||||||
|
|
||||||
with meta_loc.open('w') as file_:
|
# Update model meta.json
|
||||||
file_.write(json_dumps(meta))
|
meta["lang"] = nlp.lang
|
||||||
util.set_env_log(True)
|
meta["pipeline"] = nlp.pipe_names
|
||||||
print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps,
|
meta["spacy_version"] = ">=%s" % about.__version__
|
||||||
gpu_wps=gpu_wps)
|
meta["accuracy"] = scorer.scores
|
||||||
|
meta["speed"] = {"nwords": nwords, "cpu": cpu_wps, "gpu": gpu_wps}
|
||||||
|
meta["vectors"] = {
|
||||||
|
"width": nlp.vocab.vectors_length,
|
||||||
|
"vectors": len(nlp.vocab.vectors),
|
||||||
|
"keys": nlp.vocab.vectors.n_keys,
|
||||||
|
}
|
||||||
|
meta.setdefault("name", "model%d" % i)
|
||||||
|
meta.setdefault("version", version)
|
||||||
|
meta_loc = output_path / ("model%d" % i) / "meta.json"
|
||||||
|
util.write_json(meta_loc, meta)
|
||||||
|
|
||||||
|
util.set_env_log(verbose)
|
||||||
|
|
||||||
|
print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps)
|
||||||
finally:
|
finally:
|
||||||
print("Saving model...")
|
with msg.loading(Messages.M061):
|
||||||
with nlp.use_params(optimizer.averages):
|
with nlp.use_params(optimizer.averages):
|
||||||
final_model_path = output_path / 'model-final'
|
final_model_path = output_path / "model-final"
|
||||||
nlp.to_disk(final_model_path)
|
nlp.to_disk(final_model_path)
|
||||||
components = []
|
msg.good(Messages.M066, util.path2str(final_model_path))
|
||||||
if not no_parser:
|
|
||||||
components.append('parser')
|
_collate_best_model(meta, output_path, nlp.pipe_names)
|
||||||
if not no_tagger:
|
|
||||||
components.append('tagger')
|
|
||||||
if not no_entities:
|
def _load_vectors(nlp, vectors):
|
||||||
components.append('ner')
|
util.load_model(vectors, vocab=nlp.vocab)
|
||||||
_collate_best_model(meta, output_path, components)
|
for lex in nlp.vocab:
|
||||||
|
values = {}
|
||||||
|
for attr, func in nlp.vocab.lex_attr_getters.items():
|
||||||
|
# These attrs are expected to be set by data. Others should
|
||||||
|
# be set by calling the language functions.
|
||||||
|
if attr not in (CLUSTER, PROB, IS_OOV, LANG):
|
||||||
|
values[lex.vocab.strings[attr]] = func(lex.orth_)
|
||||||
|
lex.set_attrs(**values)
|
||||||
|
lex.is_oov = False
|
||||||
|
|
||||||
|
|
||||||
def _load_pretrained_tok2vec(nlp, loc):
|
def _load_pretrained_tok2vec(nlp, loc):
|
||||||
"""Load pre-trained weights for the 'token-to-vector' part of the component
|
"""Load pre-trained weights for the 'token-to-vector' part of the component
|
||||||
models, which is typically a CNN. See 'spacy pretrain'. Experimental.
|
models, which is typically a CNN. See 'spacy pretrain'. Experimental.
|
||||||
"""
|
"""
|
||||||
with loc.open('rb') as file_:
|
with loc.open("rb") as file_:
|
||||||
weights_data = file_.read()
|
weights_data = file_.read()
|
||||||
loaded = []
|
loaded = []
|
||||||
for name, component in nlp.pipeline:
|
for name, component in nlp.pipeline:
|
||||||
if hasattr(component, 'model') and hasattr(component.model, 'tok2vec'):
|
if hasattr(component, "model") and hasattr(component.model, "tok2vec"):
|
||||||
component.tok2vec.from_bytes(weights_data)
|
component.tok2vec.from_bytes(weights_data)
|
||||||
loaded.append(name)
|
loaded.append(name)
|
||||||
return loaded
|
return loaded
|
||||||
|
@ -222,24 +305,22 @@ def _collate_best_model(meta, output_path, components):
|
||||||
bests = {}
|
bests = {}
|
||||||
for component in components:
|
for component in components:
|
||||||
bests[component] = _find_best(output_path, component)
|
bests[component] = _find_best(output_path, component)
|
||||||
best_dest = output_path / 'model-best'
|
best_dest = output_path / "model-best"
|
||||||
shutil.copytree(output_path / 'model-final', best_dest)
|
shutil.copytree(output_path / "model-final", best_dest)
|
||||||
for component, best_component_src in bests.items():
|
for component, best_component_src in bests.items():
|
||||||
shutil.rmtree(best_dest / component)
|
shutil.rmtree(best_dest / component)
|
||||||
shutil.copytree(best_component_src / component, best_dest / component)
|
shutil.copytree(best_component_src / component, best_dest / component)
|
||||||
with (best_component_src / 'accuracy.json').open() as file_:
|
accs = util.read_json(best_component_src / "accuracy.json")
|
||||||
accs = json.load(file_)
|
|
||||||
for metric in _get_metrics(component):
|
for metric in _get_metrics(component):
|
||||||
meta['accuracy'][metric] = accs[metric]
|
meta["accuracy"][metric] = accs[metric]
|
||||||
with (best_dest / 'meta.json').open('w') as file_:
|
util.write_json(best_dest / "meta.json", meta)
|
||||||
file_.write(json_dumps(meta))
|
|
||||||
|
|
||||||
|
|
||||||
def _find_best(experiment_dir, component):
|
def _find_best(experiment_dir, component):
|
||||||
accuracies = []
|
accuracies = []
|
||||||
for epoch_model in experiment_dir.iterdir():
|
for epoch_model in experiment_dir.iterdir():
|
||||||
if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final":
|
if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final":
|
||||||
accs = json.load((epoch_model / "accuracy.json").open())
|
accs = util.read_json(epoch_model / "accuracy.json")
|
||||||
scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)]
|
scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)]
|
||||||
accuracies.append((scores, epoch_model))
|
accuracies.append((scores, epoch_model))
|
||||||
if accuracies:
|
if accuracies:
|
||||||
|
@ -247,6 +328,7 @@ def _find_best(experiment_dir, component):
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _get_metrics(component):
|
def _get_metrics(component):
|
||||||
if component == "parser":
|
if component == "parser":
|
||||||
return ("las", "uas", "token_acc")
|
return ("las", "uas", "token_acc")
|
||||||
|
@ -257,50 +339,40 @@ def _get_metrics(component):
|
||||||
return ("token_acc",)
|
return ("token_acc",)
|
||||||
|
|
||||||
|
|
||||||
def _render_parses(i, to_render):
|
|
||||||
to_render[0].user_data['title'] = "Batch %d" % i
|
|
||||||
with Path('/tmp/entities.html').open('w') as file_:
|
|
||||||
html = displacy.render(to_render[:5], style='ent', page=True)
|
|
||||||
file_.write(html)
|
|
||||||
with Path('/tmp/parses.html').open('w') as file_:
|
|
||||||
html = displacy.render(to_render[:5], style='dep', page=True)
|
|
||||||
file_.write(html)
|
|
||||||
|
|
||||||
|
|
||||||
def print_progress(itn, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0):
|
def print_progress(itn, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0):
|
||||||
scores = {}
|
scores = {}
|
||||||
for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
|
for col in [
|
||||||
'ents_p', 'ents_r', 'ents_f', 'cpu_wps', 'gpu_wps']:
|
"dep_loss",
|
||||||
|
"tag_loss",
|
||||||
|
"uas",
|
||||||
|
"tags_acc",
|
||||||
|
"token_acc",
|
||||||
|
"ents_p",
|
||||||
|
"ents_r",
|
||||||
|
"ents_f",
|
||||||
|
"cpu_wps",
|
||||||
|
"gpu_wps",
|
||||||
|
]:
|
||||||
scores[col] = 0.0
|
scores[col] = 0.0
|
||||||
scores['dep_loss'] = losses.get('parser', 0.0)
|
scores["dep_loss"] = losses.get("parser", 0.0)
|
||||||
scores['ner_loss'] = losses.get('ner', 0.0)
|
scores["ner_loss"] = losses.get("ner", 0.0)
|
||||||
scores['tag_loss'] = losses.get('tagger', 0.0)
|
scores["tag_loss"] = losses.get("tagger", 0.0)
|
||||||
scores.update(dev_scores)
|
scores.update(dev_scores)
|
||||||
scores['cpu_wps'] = cpu_wps
|
scores["cpu_wps"] = cpu_wps
|
||||||
scores['gpu_wps'] = gpu_wps or 0.0
|
scores["gpu_wps"] = gpu_wps or 0.0
|
||||||
tpl = ''.join((
|
tpl = "".join(
|
||||||
'{:<6d}',
|
(
|
||||||
'{dep_loss:<10.3f}',
|
"{:<6d}",
|
||||||
'{ner_loss:<10.3f}',
|
"{dep_loss:<10.3f}",
|
||||||
'{uas:<8.3f}',
|
"{ner_loss:<10.3f}",
|
||||||
'{ents_p:<8.3f}',
|
"{uas:<8.3f}",
|
||||||
'{ents_r:<8.3f}',
|
"{ents_p:<8.3f}",
|
||||||
'{ents_f:<8.3f}',
|
"{ents_r:<8.3f}",
|
||||||
'{tags_acc:<8.3f}',
|
"{ents_f:<8.3f}",
|
||||||
'{token_acc:<9.3f}',
|
"{tags_acc:<8.3f}",
|
||||||
'{cpu_wps:<9.1f}',
|
"{token_acc:<9.3f}",
|
||||||
'{gpu_wps:.1f}',
|
"{cpu_wps:<9.1f}",
|
||||||
))
|
"{gpu_wps:.1f}",
|
||||||
|
)
|
||||||
|
)
|
||||||
print(tpl.format(itn, **scores))
|
print(tpl.format(itn, **scores))
|
||||||
|
|
||||||
|
|
||||||
def print_results(scorer):
|
|
||||||
results = {
|
|
||||||
'TOK': '%.2f' % scorer.token_acc,
|
|
||||||
'POS': '%.2f' % scorer.tags_acc,
|
|
||||||
'UAS': '%.2f' % scorer.uas,
|
|
||||||
'LAS': '%.2f' % scorer.las,
|
|
||||||
'NER P': '%.2f' % scorer.ents_p,
|
|
||||||
'NER R': '%.2f' % scorer.ents_r,
|
|
||||||
'NER F': '%.2f' % scorer.ents_f}
|
|
||||||
util.print_table(results, title="Results")
|
|
||||||
|
|
2
spacy/cli/ud/__init__.py
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
from .conll17_ud_eval import main as ud_evaluate # noqa: F401
|
||||||
|
from .ud_train import main as ud_train # noqa: F401
|
|
@ -1,4 +1,5 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
# flake8: noqa
|
||||||
|
|
||||||
# CoNLL 2017 UD Parsing evaluation script.
|
# CoNLL 2017 UD Parsing evaluation script.
|
||||||
#
|
#
|
||||||
|
@ -214,7 +215,7 @@ def load_conllu(file):
|
||||||
start, end = map(int, columns[ID].split("-"))
|
start, end = map(int, columns[ID].split("-"))
|
||||||
except:
|
except:
|
||||||
raise UDError("Cannot parse multi-word token ID '{}'".format(columns[ID]))
|
raise UDError("Cannot parse multi-word token ID '{}'".format(columns[ID]))
|
||||||
|
|
||||||
for _ in range(start, end + 1):
|
for _ in range(start, end + 1):
|
||||||
word_line = file.readline().rstrip("\r\n")
|
word_line = file.readline().rstrip("\r\n")
|
||||||
word_columns = word_line.split("\t")
|
word_columns = word_line.split("\t")
|
|
@ -1,7 +1,9 @@
|
||||||
'''Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
|
# flake8: noqa
|
||||||
|
"""Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
|
||||||
.conllu format for development data, allowing the official scorer to be used.
|
.conllu format for development data, allowing the official scorer to be used.
|
||||||
'''
|
"""
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
import tqdm
|
import tqdm
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
@ -11,15 +13,17 @@ import json
|
||||||
|
|
||||||
import spacy
|
import spacy
|
||||||
import spacy.util
|
import spacy.util
|
||||||
from ..tokens import Token, Doc
|
from ...tokens import Token, Doc
|
||||||
from ..gold import GoldParse
|
from ...gold import GoldParse
|
||||||
from ..util import compounding, minibatch_by_words
|
from ...util import compounding, minibatch_by_words
|
||||||
from ..syntax.nonproj import projectivize
|
from ...syntax.nonproj import projectivize
|
||||||
from ..matcher import Matcher
|
from ...matcher import Matcher
|
||||||
#from ..morphology import Fused_begin, Fused_inside
|
|
||||||
from .. import displacy
|
# from ...morphology import Fused_begin, Fused_inside
|
||||||
|
from ... import displacy
|
||||||
from collections import defaultdict, Counter
|
from collections import defaultdict, Counter
|
||||||
from timeit import default_timer as timer
|
from timeit import default_timer as timer
|
||||||
|
|
||||||
Fused_begin = None
|
Fused_begin = None
|
||||||
Fused_inside = None
|
Fused_inside = None
|
||||||
|
|
||||||
|
@ -30,43 +34,45 @@ import cytoolz
|
||||||
|
|
||||||
from . import conll17_ud_eval
|
from . import conll17_ud_eval
|
||||||
|
|
||||||
from .. import lang
|
from ... import lang
|
||||||
from .. import lang
|
from ...lang import zh
|
||||||
from ..lang import zh
|
from ...lang import ja
|
||||||
from ..lang import ja
|
from ...lang import ru
|
||||||
from ..lang import ru
|
|
||||||
|
|
||||||
|
|
||||||
################
|
################
|
||||||
# Data reading #
|
# Data reading #
|
||||||
################
|
################
|
||||||
|
|
||||||
space_re = re.compile('\s+')
|
space_re = re.compile("\s+")
|
||||||
|
|
||||||
|
|
||||||
def split_text(text):
|
def split_text(text):
|
||||||
return [space_re.sub(' ', par.strip()) for par in text.split('\n\n')]
|
return [space_re.sub(" ", par.strip()) for par in text.split("\n\n")]
|
||||||
|
|
||||||
|
|
||||||
##############
|
##############
|
||||||
# Evaluation #
|
# Evaluation #
|
||||||
##############
|
##############
|
||||||
|
|
||||||
|
|
||||||
def read_conllu(file_):
|
def read_conllu(file_):
|
||||||
docs = []
|
docs = []
|
||||||
sent = []
|
sent = []
|
||||||
doc = []
|
doc = []
|
||||||
for line in file_:
|
for line in file_:
|
||||||
if line.startswith('# newdoc'):
|
if line.startswith("# newdoc"):
|
||||||
if doc:
|
if doc:
|
||||||
docs.append(doc)
|
docs.append(doc)
|
||||||
doc = []
|
doc = []
|
||||||
elif line.startswith('#'):
|
elif line.startswith("#"):
|
||||||
continue
|
continue
|
||||||
elif not line.strip():
|
elif not line.strip():
|
||||||
if sent:
|
if sent:
|
||||||
doc.append(sent)
|
doc.append(sent)
|
||||||
sent = []
|
sent = []
|
||||||
else:
|
else:
|
||||||
sent.append(list(line.strip().split('\t')))
|
sent.append(list(line.strip().split("\t")))
|
||||||
if len(sent[-1]) != 10:
|
if len(sent[-1]) != 10:
|
||||||
print(repr(line))
|
print(repr(line))
|
||||||
raise ValueError
|
raise ValueError
|
||||||
|
@ -78,7 +84,7 @@ def read_conllu(file_):
|
||||||
|
|
||||||
|
|
||||||
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
||||||
if text_loc.parts[-1].endswith('.conllu'):
|
if text_loc.parts[-1].endswith(".conllu"):
|
||||||
docs = []
|
docs = []
|
||||||
with text_loc.open() as file_:
|
with text_loc.open() as file_:
|
||||||
for conllu_doc in read_conllu(file_):
|
for conllu_doc in read_conllu(file_):
|
||||||
|
@ -88,14 +94,14 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
||||||
for name, component in nlp.pipeline:
|
for name, component in nlp.pipeline:
|
||||||
docs = list(component.pipe(docs))
|
docs = list(component.pipe(docs))
|
||||||
else:
|
else:
|
||||||
with text_loc.open('r', encoding='utf8') as text_file:
|
with text_loc.open("r", encoding="utf8") as text_file:
|
||||||
texts = split_text(text_file.read())
|
texts = split_text(text_file.read())
|
||||||
docs = list(nlp.pipe(texts))
|
docs = list(nlp.pipe(texts))
|
||||||
with sys_loc.open('w', encoding='utf8') as out_file:
|
with sys_loc.open("w", encoding="utf8") as out_file:
|
||||||
write_conllu(docs, out_file)
|
write_conllu(docs, out_file)
|
||||||
with gold_loc.open('r', encoding='utf8') as gold_file:
|
with gold_loc.open("r", encoding="utf8") as gold_file:
|
||||||
gold_ud = conll17_ud_eval.load_conllu(gold_file)
|
gold_ud = conll17_ud_eval.load_conllu(gold_file)
|
||||||
with sys_loc.open('r', encoding='utf8') as sys_file:
|
with sys_loc.open("r", encoding="utf8") as sys_file:
|
||||||
sys_ud = conll17_ud_eval.load_conllu(sys_file)
|
sys_ud = conll17_ud_eval.load_conllu(sys_file)
|
||||||
scores = conll17_ud_eval.evaluate(gold_ud, sys_ud)
|
scores = conll17_ud_eval.evaluate(gold_ud, sys_ud)
|
||||||
return docs, scores
|
return docs, scores
|
||||||
|
@ -103,26 +109,26 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
||||||
|
|
||||||
def write_conllu(docs, file_):
|
def write_conllu(docs, file_):
|
||||||
merger = Matcher(docs[0].vocab)
|
merger = Matcher(docs[0].vocab)
|
||||||
merger.add('SUBTOK', None, [{'DEP': 'subtok', 'op': '+'}])
|
merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
matches = merger(doc)
|
matches = merger(doc)
|
||||||
spans = [doc[start:end+1] for _, start, end in matches]
|
spans = [doc[start : end + 1] for _, start, end in matches]
|
||||||
offsets = [(span.start_char, span.end_char) for span in spans]
|
offsets = [(span.start_char, span.end_char) for span in spans]
|
||||||
for start_char, end_char in offsets:
|
for start_char, end_char in offsets:
|
||||||
doc.merge(start_char, end_char)
|
doc.merge(start_char, end_char)
|
||||||
# TODO: This shouldn't be necessary? Should be handled in merge
|
# TODO: This shouldn't be necessary? Should be handled in merge
|
||||||
for word in doc:
|
for word in doc:
|
||||||
if word.i == word.head.i:
|
if word.i == word.head.i:
|
||||||
word.dep_ = 'ROOT'
|
word.dep_ = "ROOT"
|
||||||
file_.write("# newdoc id = {i}\n".format(i=i))
|
file_.write("# newdoc id = {i}\n".format(i=i))
|
||||||
for j, sent in enumerate(doc.sents):
|
for j, sent in enumerate(doc.sents):
|
||||||
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
|
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
|
||||||
file_.write("# text = {text}\n".format(text=sent.text))
|
file_.write("# text = {text}\n".format(text=sent.text))
|
||||||
for k, token in enumerate(sent):
|
for k, token in enumerate(sent):
|
||||||
file_.write(_get_token_conllu(token, k, len(sent)) + '\n')
|
file_.write(_get_token_conllu(token, k, len(sent)) + "\n")
|
||||||
file_.write('\n')
|
file_.write("\n")
|
||||||
for word in sent:
|
for word in sent:
|
||||||
if word.head.i == word.i and word.dep_ == 'ROOT':
|
if word.head.i == word.i and word.dep_ == "ROOT":
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
print("Rootless sentence!")
|
print("Rootless sentence!")
|
||||||
@@ -134,24 +140,34 @@ def write_conllu(docs, file_):

def _get_token_conllu(token, k, sent_len):
-   if token.check_morph(Fused_begin) and (k+1 < sent_len):
+   if token.check_morph(Fused_begin) and (k + 1 < sent_len):
        n = 1
        text = [token.text]
        while token.nbor(n).check_morph(Fused_inside):
            text.append(token.nbor(n).text)
            n += 1
-       id_ = '%d-%d' % (k+1, (k+n))
-       fields = [id_, ''.join(text)] + ['_'] * 8
-       lines = ['\t'.join(fields)]
+       id_ = "%d-%d" % (k + 1, (k + n))
+       fields = [id_, "".join(text)] + ["_"] * 8
+       lines = ["\t".join(fields)]
    else:
        lines = []
    if token.head.i == token.i:
        head = 0
    else:
        head = k + (token.head.i - token.i) + 1
-   fields = [str(k+1), token.text, token.lemma_, token.pos_, token.tag_, '_',
-             str(head), token.dep_.lower(), '_', '_']
-   if token.check_morph(Fused_begin) and (k+1 < sent_len):
+   fields = [
+       str(k + 1),
+       token.text,
+       token.lemma_,
+       token.pos_,
+       token.tag_,
+       "_",
+       str(head),
+       token.dep_.lower(),
+       "_",
+       "_",
+   ]
+   if token.check_morph(Fused_begin) and (k + 1 < sent_len):
        if k == 0:
            fields[1] = token.norm_[0].upper() + token.norm_[1:]
        else:
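For reference, each call to _get_token_conllu() produces one or two 10-column CoNLL-U lines (ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC), with unused columns filled with "_". A hand-made example of the output for a fused token spanning two syntactic words (values invented, columns tab-separated in the real output):

1-2   zum   _    _    _    _  _  _     _  _
1     zu    zu   ADP  APPR _  3  case  _  _
2     dem   der  DET  ART  _  3  det   _  _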
@@ -163,18 +179,18 @@ def _get_token_conllu(token, k, sent_len):

        split_end = token._.split_end
        split_len = (split_end.i - split_start.i) + 1
        n_in_split = token.i - split_start.i
-       subtokens = guess_fused_orths(split_start.text, [''] * split_len)
+       subtokens = guess_fused_orths(split_start.text, [""] * split_len)
        fields[1] = subtokens[n_in_split]

-   lines.append('\t'.join(fields))
-   return '\n'.join(lines)
+   lines.append("\t".join(fields))
+   return "\n".join(lines)


def guess_fused_orths(word, ud_forms):
-   '''The UD data 'fused tokens' don't necessarily expand to keys that match
+   """The UD data 'fused tokens' don't necessarily expand to keys that match
    the form. We need orths that exact match the string. Here we make a best
-   effort to divide up the word.'''
-   if word == ''.join(ud_forms):
+   effort to divide up the word."""
+   if word == "".join(ud_forms):
        # Happy case: we get a perfect split, with each letter accounted for.
        return ud_forms
    elif len(word) == sum(len(subtoken) for subtoken in ud_forms):

@@ -183,16 +199,16 @@ def guess_fused_orths(word, ud_forms):

        remain = word
        for subtoken in ud_forms:
            assert len(subtoken) >= 1
-           output.append(remain[:len(subtoken)])
-           remain = remain[len(subtoken):]
+           output.append(remain[: len(subtoken)])
+           remain = remain[len(subtoken) :]
        assert len(remain) == 0, (word, ud_forms, remain)
        return output
    else:
        # Let's say word is 6 long, and there are three subtokens. The orths
        # *must* equal the original string. Arbitrarily, split [4, 1, 1]
-       first = word[:len(word)-(len(ud_forms)-1)]
+       first = word[: len(word) - (len(ud_forms) - 1)]
        output = [first]
-       remain = word[len(first):]
+       remain = word[len(first) :]
        for i in range(1, len(ud_forms)):
            assert remain
            output.append(remain[:1])
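The splitting heuristic above has three branches; a quick illustration with hand-picked inputs (not taken from the diff):

# 1) The UD forms already spell the surface string exactly:
guess_fused_orths("dont", ["do", "nt"])     # -> ["do", "nt"]
# 2) Total lengths match, so the surface string is sliced into pieces of those lengths:
guess_fused_orths("ain't", ["am", "not"])   # -> ["ai", "n't"]
# 3) Lengths don't add up: the first orth takes most characters, the rest get one each:
guess_fused_orths("zum", ["zu", "dem"])     # -> ["zu", "m"]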
@@ -201,60 +217,50 @@ def guess_fused_orths(word, ud_forms):

    return output


def print_results(name, ud_scores):
    fields = {}
    if ud_scores is not None:
-       fields.update({
-           'words': ud_scores['Words'].f1 * 100,
-           'sents': ud_scores['Sentences'].f1 * 100,
-           'tags': ud_scores['XPOS'].f1 * 100,
-           'uas': ud_scores['UAS'].f1 * 100,
-           'las': ud_scores['LAS'].f1 * 100,
-       })
+       fields.update(
+           {
+               "words": ud_scores["Words"].f1 * 100,
+               "sents": ud_scores["Sentences"].f1 * 100,
+               "tags": ud_scores["XPOS"].f1 * 100,
+               "uas": ud_scores["UAS"].f1 * 100,
+               "las": ud_scores["LAS"].f1 * 100,
+           }
+       )
    else:
-       fields.update({
-           'words': 0.0,
-           'sents': 0.0,
-           'tags': 0.0,
-           'uas': 0.0,
-           'las': 0.0
-       })
-   tpl = '\t'.join((
-       name,
-       '{las:.1f}',
-       '{uas:.1f}',
-       '{tags:.1f}',
-       '{sents:.1f}',
-       '{words:.1f}',
-   ))
+       fields.update({"words": 0.0, "sents": 0.0, "tags": 0.0, "uas": 0.0, "las": 0.0})
+   tpl = "\t".join(
+       (name, "{las:.1f}", "{uas:.1f}", "{tags:.1f}", "{sents:.1f}", "{words:.1f}")
+   )
    print(tpl.format(**fields))
    return fields


def get_token_split_start(token):
-   if token.text == '':
+   if token.text == "":
        assert token.i != 0
        i = -1
-       while token.nbor(i).text == '':
+       while token.nbor(i).text == "":
            i -= 1
        return token.nbor(i)
-   elif (token.i+1) < len(token.doc) and token.nbor(1).text == '':
+   elif (token.i + 1) < len(token.doc) and token.nbor(1).text == "":
        return token
    else:
        return None


def get_token_split_end(token):
-   if (token.i+1) == len(token.doc):
-       return token if token.text == '' else None
-   elif token.text != '' and token.nbor(1).text != '':
+   if (token.i + 1) == len(token.doc):
+       return token if token.text == "" else None
+   elif token.text != "" and token.nbor(1).text != "":
        return None
    i = 1
-   while (token.i+i) < len(token.doc) and token.nbor(i).text == '':
+   while (token.i + i) < len(token.doc) and token.nbor(i).text == "":
        i += 1
-   return token.nbor(i-1)
+   return token.nbor(i - 1)
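print_results() writes one tab-separated row per input type, matching the header printed by main() further down (LAS, UAS, TAG, SENT and WORD are F1 scores in percent). An invented example of two such rows under their header:

test  LAS   UAS   TAG   SENT  WORD
udp   79.6  83.2  92.1  88.4  98.7
raw   77.9  81.8  91.5  84.0  98.2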


##################
# Initialization #
@ -262,54 +268,73 @@ def get_token_split_end(token):
|
||||||
|
|
||||||
|
|
||||||
def load_nlp(experiments_dir, corpus):
|
def load_nlp(experiments_dir, corpus):
|
||||||
nlp = spacy.load(experiments_dir / corpus / 'best-model')
|
nlp = spacy.load(experiments_dir / corpus / "best-model")
|
||||||
return nlp
|
return nlp
|
||||||
|
|
||||||
|
|
||||||
def initialize_pipeline(nlp, docs, golds, config, device):
|
def initialize_pipeline(nlp, docs, golds, config, device):
|
||||||
nlp.add_pipe(nlp.create_pipe('parser'))
|
nlp.add_pipe(nlp.create_pipe("parser"))
|
||||||
return nlp
|
return nlp
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
test_data_dir=("Path to Universal Dependencies test data", "positional", None, Path),
|
test_data_dir=(
|
||||||
|
"Path to Universal Dependencies test data",
|
||||||
|
"positional",
|
||||||
|
None,
|
||||||
|
Path,
|
||||||
|
),
|
||||||
experiment_dir=("Parent directory with output model", "positional", None, Path),
|
experiment_dir=("Parent directory with output model", "positional", None, Path),
|
||||||
corpus=("UD corpus to evaluate, e.g. UD_English, UD_Spanish, etc", "positional", None, str),
|
corpus=(
|
||||||
|
"UD corpus to evaluate, e.g. UD_English, UD_Spanish, etc",
|
||||||
|
"positional",
|
||||||
|
None,
|
||||||
|
str,
|
||||||
|
),
|
||||||
)
|
)
|
||||||
def main(test_data_dir, experiment_dir, corpus):
|
def main(test_data_dir, experiment_dir, corpus):
|
||||||
Token.set_extension('split_start', getter=get_token_split_start)
|
Token.set_extension("split_start", getter=get_token_split_start)
|
||||||
Token.set_extension('split_end', getter=get_token_split_end)
|
Token.set_extension("split_end", getter=get_token_split_end)
|
||||||
Token.set_extension('begins_fused', default=False)
|
Token.set_extension("begins_fused", default=False)
|
||||||
Token.set_extension('inside_fused', default=False)
|
Token.set_extension("inside_fused", default=False)
|
||||||
lang.zh.Chinese.Defaults.use_jieba = False
|
lang.zh.Chinese.Defaults.use_jieba = False
|
||||||
lang.ja.Japanese.Defaults.use_janome = False
|
lang.ja.Japanese.Defaults.use_janome = False
|
||||||
lang.ru.Russian.Defaults.use_pymorphy2 = False
|
lang.ru.Russian.Defaults.use_pymorphy2 = False
|
||||||
|
|
||||||
nlp = load_nlp(experiment_dir, corpus)
|
nlp = load_nlp(experiment_dir, corpus)
|
||||||
|
|
||||||
treebank_code = nlp.meta['treebank']
|
|
||||||
for section in ('test', 'dev'):
|
|
||||||
if section == 'dev':
|
|
||||||
section_dir = 'conll17-ud-development-2017-03-19'
|
|
||||||
else:
|
|
||||||
section_dir = 'conll17-ud-test-2017-05-09'
|
|
||||||
text_path = test_data_dir / 'input' / section_dir / (treebank_code+'.txt')
|
|
||||||
udpipe_path = test_data_dir / 'input' / section_dir / (treebank_code+'-udpipe.conllu')
|
|
||||||
gold_path = test_data_dir / 'gold' / section_dir / (treebank_code+'.conllu')
|
|
||||||
|
|
||||||
header = [section, 'LAS', 'UAS', 'TAG', 'SENT', 'WORD']
|
treebank_code = nlp.meta["treebank"]
|
||||||
print('\t'.join(header))
|
for section in ("test", "dev"):
|
||||||
inputs = {'gold': gold_path, 'udp': udpipe_path, 'raw': text_path}
|
if section == "dev":
|
||||||
for input_type in ('udp', 'raw'):
|
section_dir = "conll17-ud-development-2017-03-19"
|
||||||
|
else:
|
||||||
|
section_dir = "conll17-ud-test-2017-05-09"
|
||||||
|
text_path = test_data_dir / "input" / section_dir / (treebank_code + ".txt")
|
||||||
|
udpipe_path = (
|
||||||
|
test_data_dir / "input" / section_dir / (treebank_code + "-udpipe.conllu")
|
||||||
|
)
|
||||||
|
gold_path = test_data_dir / "gold" / section_dir / (treebank_code + ".conllu")
|
||||||
|
|
||||||
|
header = [section, "LAS", "UAS", "TAG", "SENT", "WORD"]
|
||||||
|
print("\t".join(header))
|
||||||
|
inputs = {"gold": gold_path, "udp": udpipe_path, "raw": text_path}
|
||||||
|
for input_type in ("udp", "raw"):
|
||||||
input_path = inputs[input_type]
|
input_path = inputs[input_type]
|
||||||
output_path = experiment_dir / corpus / '{section}.conllu'.format(section=section)
|
output_path = (
|
||||||
|
experiment_dir / corpus / "{section}.conllu".format(section=section)
|
||||||
|
)
|
||||||
|
|
||||||
parsed_docs, test_scores = evaluate(nlp, input_path, gold_path, output_path)
|
parsed_docs, test_scores = evaluate(nlp, input_path, gold_path, output_path)
|
||||||
|
|
||||||
accuracy = print_results(input_type, test_scores)
|
accuracy = print_results(input_type, test_scores)
|
||||||
acc_path = experiment_dir / corpus / '{section}-accuracy.json'.format(section=section)
|
acc_path = (
|
||||||
with open(acc_path, 'w') as file_:
|
experiment_dir
|
||||||
|
/ corpus
|
||||||
|
/ "{section}-accuracy.json".format(section=section)
|
||||||
|
)
|
||||||
|
with open(acc_path, "w") as file_:
|
||||||
file_.write(json.dumps(accuracy, indent=2))
|
file_.write(json.dumps(accuracy, indent=2))
|
||||||
|
|
||||||
|
|
||||||
-if __name__ == '__main__':
+if __name__ == "__main__":
    plac.call(main)

@@ -1,7 +1,9 @@

-'''Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
+# flake8: noqa
+"""Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
.conllu format for development data, allowing the official scorer to be used.
-'''
+"""
from __future__ import unicode_literals

import plac
import tqdm
from pathlib import Path

@@ -11,12 +13,12 @@ import json

import spacy
import spacy.util
-from ..tokens import Token, Doc
-from ..gold import GoldParse
-from ..util import compounding, minibatch, minibatch_by_words
-from ..syntax.nonproj import projectivize
-from ..matcher import Matcher
-from .. import displacy
+from ...tokens import Token, Doc
+from ...gold import GoldParse
+from ...util import compounding, minibatch, minibatch_by_words
+from ...syntax.nonproj import projectivize
+from ...matcher import Matcher
+from ... import displacy
from collections import defaultdict, Counter
from timeit import default_timer as timer

@@ -27,10 +29,9 @@ import cytoolz

from . import conll17_ud_eval

-from .. import lang
-from .. import lang
-from ..lang import zh
-from ..lang import ja
+from ... import lang
+from ...lang import zh
+from ...lang import ja

try:
    import torch
@@ -42,17 +43,26 @@ except ImportError:

# Data reading #
################

-space_re = re.compile('\s+')
-def split_text(text):
-    return [space_re.sub(' ', par.strip()) for par in text.split('\n\n')]
-
-
-def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False,
-              max_doc_length=None, limit=None):
-    '''Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
+space_re = re.compile("\s+")
+
+
+def split_text(text):
+    return [space_re.sub(" ", par.strip()) for par in text.split("\n\n")]
+
+
+def read_data(
+    nlp,
+    conllu_file,
+    text_file,
+    raw_text=True,
+    oracle_segments=False,
+    max_doc_length=None,
+    limit=None,
+):
+    """Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
    include Doc objects created using nlp.make_doc and then aligned against
    the gold-standard sequences. If oracle_segments=True, include Doc objects
-   created from the gold-standard segments. At least one must be True.'''
+   created from the gold-standard segments. At least one must be True."""
    if not raw_text and not oracle_segments:
        raise ValueError("At least one of raw_text or oracle_segments must be True")
    paragraphs = split_text(text_file.read())
@@ -66,22 +76,21 @@ def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False,

        for cs in cd:
            sent = defaultdict(list)
            for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs:
-               if '.' in id_:
+               if "." in id_:
                    continue
-               if '-' in id_:
+               if "-" in id_:
                    continue
-               id_ = int(id_)-1
-               head = int(head)-1 if head != '0' else id_
-               sent['words'].append(word)
-               sent['tags'].append(tag)
-               sent['heads'].append(head)
-               sent['deps'].append('ROOT' if dep == 'root' else dep)
-               sent['spaces'].append(space_after == '_')
-           sent['entities'] = ['-'] * len(sent['words'])
-           sent['heads'], sent['deps'] = projectivize(sent['heads'],
-                                                      sent['deps'])
+               id_ = int(id_) - 1
+               head = int(head) - 1 if head != "0" else id_
+               sent["words"].append(word)
+               sent["tags"].append(tag)
+               sent["heads"].append(head)
+               sent["deps"].append("ROOT" if dep == "root" else dep)
+               sent["spaces"].append(space_after == "_")
+           sent["entities"] = ["-"] * len(sent["words"])
+           sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"])
            if oracle_segments:
-               docs.append(Doc(nlp.vocab, words=sent['words'], spaces=sent['spaces']))
+               docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
                golds.append(GoldParse(docs[-1], **sent))

            sent_annots.append(sent)
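Each CoNLL-U sentence is accumulated into a plain dict before being merged into a paragraph-level GoldParse; a minimal hand-written example of its shape after the loop above (values invented):

sent = {
    "words": ["I", "like", "London", "."],
    "tags": ["PRP", "VBP", "NNP", "."],
    "heads": [1, 1, 1, 1],          # 0-based indices within the sentence
    "deps": ["nsubj", "ROOT", "dobj", "punct"],
    "spaces": [True, True, False, False],
    "entities": ["-", "-", "-", "-"],
}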
@ -107,18 +116,18 @@ def read_conllu(file_):
|
||||||
sent = []
|
sent = []
|
||||||
doc = []
|
doc = []
|
||||||
for line in file_:
|
for line in file_:
|
||||||
if line.startswith('# newdoc'):
|
if line.startswith("# newdoc"):
|
||||||
if doc:
|
if doc:
|
||||||
docs.append(doc)
|
docs.append(doc)
|
||||||
doc = []
|
doc = []
|
||||||
elif line.startswith('#'):
|
elif line.startswith("#"):
|
||||||
continue
|
continue
|
||||||
elif not line.strip():
|
elif not line.strip():
|
||||||
if sent:
|
if sent:
|
||||||
doc.append(sent)
|
doc.append(sent)
|
||||||
sent = []
|
sent = []
|
||||||
else:
|
else:
|
||||||
sent.append(list(line.strip().split('\t')))
|
sent.append(list(line.strip().split("\t")))
|
||||||
if len(sent[-1]) != 10:
|
if len(sent[-1]) != 10:
|
||||||
print(repr(line))
|
print(repr(line))
|
||||||
raise ValueError
|
raise ValueError
|
||||||
|
@ -134,17 +143,19 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
|
||||||
flat = defaultdict(list)
|
flat = defaultdict(list)
|
||||||
sent_starts = []
|
sent_starts = []
|
||||||
for sent in sent_annots:
|
for sent in sent_annots:
|
||||||
flat['heads'].extend(len(flat['words'])+head for head in sent['heads'])
|
flat["heads"].extend(len(flat["words"]) + head for head in sent["heads"])
|
||||||
for field in ['words', 'tags', 'deps', 'entities', 'spaces']:
|
for field in ["words", "tags", "deps", "entities", "spaces"]:
|
||||||
flat[field].extend(sent[field])
|
flat[field].extend(sent[field])
|
||||||
sent_starts.append(True)
|
sent_starts.append(True)
|
||||||
sent_starts.extend([False] * (len(sent['words'])-1))
|
sent_starts.extend([False] * (len(sent["words"]) - 1))
|
||||||
# Construct text if necessary
|
# Construct text if necessary
|
||||||
assert len(flat['words']) == len(flat['spaces'])
|
assert len(flat["words"]) == len(flat["spaces"])
|
||||||
if text is None:
|
if text is None:
|
||||||
text = ''.join(word+' '*space for word, space in zip(flat['words'], flat['spaces']))
|
text = "".join(
|
||||||
|
word + " " * space for word, space in zip(flat["words"], flat["spaces"])
|
||||||
|
)
|
||||||
doc = nlp.make_doc(text)
|
doc = nlp.make_doc(text)
|
||||||
flat.pop('spaces')
|
flat.pop("spaces")
|
||||||
gold = GoldParse(doc, **flat)
|
gold = GoldParse(doc, **flat)
|
||||||
gold.sent_starts = sent_starts
|
gold.sent_starts = sent_starts
|
||||||
for i in range(len(gold.heads)):
|
for i in range(len(gold.heads)):
|
||||||
|
@ -154,13 +165,15 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
|
||||||
|
|
||||||
return doc, gold
|
return doc, gold
|
||||||
|
|
||||||
|
|
||||||
#############################
|
#############################
|
||||||
# Data transforms for spaCy #
|
# Data transforms for spaCy #
|
||||||
#############################
|
#############################
|
||||||
|
|
||||||
|
|
||||||
def golds_to_gold_tuples(docs, golds):
|
def golds_to_gold_tuples(docs, golds):
|
||||||
'''Get out the annoying 'tuples' format used by begin_training, given the
|
"""Get out the annoying 'tuples' format used by begin_training, given the
|
||||||
GoldParse objects.'''
|
GoldParse objects."""
|
||||||
tuples = []
|
tuples = []
|
||||||
for doc, gold in zip(docs, golds):
|
for doc, gold in zip(docs, golds):
|
||||||
text = doc.text
|
text = doc.text
|
||||||
|
@ -174,8 +187,9 @@ def golds_to_gold_tuples(docs, golds):
|
||||||
# Evaluation #
|
# Evaluation #
|
||||||
##############
|
##############
|
||||||
|
|
||||||
|
|
||||||
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
||||||
if text_loc.parts[-1].endswith('.conllu'):
|
if text_loc.parts[-1].endswith(".conllu"):
|
||||||
docs = []
|
docs = []
|
||||||
with text_loc.open() as file_:
|
with text_loc.open() as file_:
|
||||||
for conllu_doc in read_conllu(file_):
|
for conllu_doc in read_conllu(file_):
|
||||||
|
@ -185,14 +199,14 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
||||||
for name, component in nlp.pipeline:
|
for name, component in nlp.pipeline:
|
||||||
docs = list(component.pipe(docs))
|
docs = list(component.pipe(docs))
|
||||||
else:
|
else:
|
||||||
with text_loc.open('r', encoding='utf8') as text_file:
|
with text_loc.open("r", encoding="utf8") as text_file:
|
||||||
texts = split_text(text_file.read())
|
texts = split_text(text_file.read())
|
||||||
docs = list(nlp.pipe(texts))
|
docs = list(nlp.pipe(texts))
|
||||||
with sys_loc.open('w', encoding='utf8') as out_file:
|
with sys_loc.open("w", encoding="utf8") as out_file:
|
||||||
write_conllu(docs, out_file)
|
write_conllu(docs, out_file)
|
||||||
with gold_loc.open('r', encoding='utf8') as gold_file:
|
with gold_loc.open("r", encoding="utf8") as gold_file:
|
||||||
gold_ud = conll17_ud_eval.load_conllu(gold_file)
|
gold_ud = conll17_ud_eval.load_conllu(gold_file)
|
||||||
with sys_loc.open('r', encoding='utf8') as sys_file:
|
with sys_loc.open("r", encoding="utf8") as sys_file:
|
||||||
sys_ud = conll17_ud_eval.load_conllu(sys_file)
|
sys_ud = conll17_ud_eval.load_conllu(sys_file)
|
||||||
scores = conll17_ud_eval.evaluate(gold_ud, sys_ud)
|
scores = conll17_ud_eval.evaluate(gold_ud, sys_ud)
|
||||||
return docs, scores
|
return docs, scores
|
||||||
|
@ -200,10 +214,10 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
||||||
|
|
||||||
def write_conllu(docs, file_):
|
def write_conllu(docs, file_):
|
||||||
merger = Matcher(docs[0].vocab)
|
merger = Matcher(docs[0].vocab)
|
||||||
merger.add('SUBTOK', None, [{'DEP': 'subtok', 'op': '+'}])
|
merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
matches = merger(doc)
|
matches = merger(doc)
|
||||||
spans = [doc[start:end+1] for _, start, end in matches]
|
spans = [doc[start : end + 1] for _, start, end in matches]
|
||||||
offsets = [(span.start_char, span.end_char) for span in spans]
|
offsets = [(span.start_char, span.end_char) for span in spans]
|
||||||
for start_char, end_char in offsets:
|
for start_char, end_char in offsets:
|
||||||
doc.merge(start_char, end_char)
|
doc.merge(start_char, end_char)
|
||||||
|
@ -213,65 +227,82 @@ def write_conllu(docs, file_):
|
||||||
file_.write("# text = {text}\n".format(text=sent.text))
|
file_.write("# text = {text}\n".format(text=sent.text))
|
||||||
for k, token in enumerate(sent):
|
for k, token in enumerate(sent):
|
||||||
if token.head.i > sent[-1].i or token.head.i < sent[0].i:
|
if token.head.i > sent[-1].i or token.head.i < sent[0].i:
|
||||||
for word in doc[sent[0].i-10 : sent[0].i]:
|
for word in doc[sent[0].i - 10 : sent[0].i]:
|
||||||
print(word.i, word.head.i, word.text, word.dep_)
|
print(word.i, word.head.i, word.text, word.dep_)
|
||||||
for word in sent:
|
for word in sent:
|
||||||
print(word.i, word.head.i, word.text, word.dep_)
|
print(word.i, word.head.i, word.text, word.dep_)
|
||||||
for word in doc[sent[-1].i : sent[-1].i+10]:
|
for word in doc[sent[-1].i : sent[-1].i + 10]:
|
||||||
print(word.i, word.head.i, word.text, word.dep_)
|
print(word.i, word.head.i, word.text, word.dep_)
|
||||||
raise ValueError("Invalid parse: head outside sentence (%s)" % token.text)
|
raise ValueError(
|
||||||
file_.write(token._.get_conllu_lines(k) + '\n')
|
"Invalid parse: head outside sentence (%s)" % token.text
|
||||||
file_.write('\n')
|
)
|
||||||
|
file_.write(token._.get_conllu_lines(k) + "\n")
|
||||||
|
file_.write("\n")
|
||||||
|
|
||||||
|
|
||||||
def print_progress(itn, losses, ud_scores):
|
def print_progress(itn, losses, ud_scores):
|
||||||
fields = {
|
fields = {
|
||||||
'dep_loss': losses.get('parser', 0.0),
|
"dep_loss": losses.get("parser", 0.0),
|
||||||
'tag_loss': losses.get('tagger', 0.0),
|
"tag_loss": losses.get("tagger", 0.0),
|
||||||
'words': ud_scores['Words'].f1 * 100,
|
"words": ud_scores["Words"].f1 * 100,
|
||||||
'sents': ud_scores['Sentences'].f1 * 100,
|
"sents": ud_scores["Sentences"].f1 * 100,
|
||||||
'tags': ud_scores['XPOS'].f1 * 100,
|
"tags": ud_scores["XPOS"].f1 * 100,
|
||||||
'uas': ud_scores['UAS'].f1 * 100,
|
"uas": ud_scores["UAS"].f1 * 100,
|
||||||
'las': ud_scores['LAS'].f1 * 100,
|
"las": ud_scores["LAS"].f1 * 100,
|
||||||
}
|
}
|
||||||
header = ['Epoch', 'Loss', 'LAS', 'UAS', 'TAG', 'SENT', 'WORD']
|
header = ["Epoch", "Loss", "LAS", "UAS", "TAG", "SENT", "WORD"]
|
||||||
if itn == 0:
|
if itn == 0:
|
||||||
print('\t'.join(header))
|
print("\t".join(header))
|
||||||
tpl = '\t'.join((
|
tpl = "\t".join(
|
||||||
'{:d}',
|
(
|
||||||
'{dep_loss:.1f}',
|
"{:d}",
|
||||||
'{las:.1f}',
|
"{dep_loss:.1f}",
|
||||||
'{uas:.1f}',
|
"{las:.1f}",
|
||||||
'{tags:.1f}',
|
"{uas:.1f}",
|
||||||
'{sents:.1f}',
|
"{tags:.1f}",
|
||||||
'{words:.1f}',
|
"{sents:.1f}",
|
||||||
))
|
"{words:.1f}",
|
||||||
|
)
|
||||||
|
)
|
||||||
print(tpl.format(itn, **fields))
|
print(tpl.format(itn, **fields))
|
||||||
|
|
||||||
#def get_sent_conllu(sent, sent_id):
|
|
||||||
|
# def get_sent_conllu(sent, sent_id):
|
||||||
# lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)]
|
# lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)]
|
||||||
|
|
||||||
|
|
||||||
def get_token_conllu(token, i):
|
def get_token_conllu(token, i):
|
||||||
if token._.begins_fused:
|
if token._.begins_fused:
|
||||||
n = 1
|
n = 1
|
||||||
while token.nbor(n)._.inside_fused:
|
while token.nbor(n)._.inside_fused:
|
||||||
n += 1
|
n += 1
|
||||||
id_ = '%d-%d' % (i, i+n)
|
id_ = "%d-%d" % (i, i + n)
|
||||||
lines = [id_, token.text, '_', '_', '_', '_', '_', '_', '_', '_']
|
lines = [id_, token.text, "_", "_", "_", "_", "_", "_", "_", "_"]
|
||||||
else:
|
else:
|
||||||
lines = []
|
lines = []
|
||||||
if token.head.i == token.i:
|
if token.head.i == token.i:
|
||||||
head = 0
|
head = 0
|
||||||
else:
|
else:
|
||||||
head = i + (token.head.i - token.i) + 1
|
head = i + (token.head.i - token.i) + 1
|
||||||
fields = [str(i+1), token.text, token.lemma_, token.pos_, token.tag_, '_',
|
fields = [
|
||||||
str(head), token.dep_.lower(), '_', '_']
|
str(i + 1),
|
||||||
lines.append('\t'.join(fields))
|
token.text,
|
||||||
return '\n'.join(lines)
|
token.lemma_,
|
||||||
|
token.pos_,
|
||||||
|
token.tag_,
|
||||||
|
"_",
|
||||||
|
str(head),
|
||||||
|
token.dep_.lower(),
|
||||||
|
"_",
|
||||||
|
"_",
|
||||||
|
]
|
||||||
|
lines.append("\t".join(fields))
|
||||||
|
return "\n".join(lines)
|
||||||
|
|
||||||
Token.set_extension('get_conllu_lines', method=get_token_conllu)
|
|
||||||
Token.set_extension('begins_fused', default=False)
|
Token.set_extension("get_conllu_lines", method=get_token_conllu)
|
||||||
Token.set_extension('inside_fused', default=False)
|
Token.set_extension("begins_fused", default=False)
|
||||||
|
Token.set_extension("inside_fused", default=False)
|
||||||
|
|
||||||
|
|
||||||
##################
|
##################
|
||||||
|
@ -280,35 +311,40 @@ Token.set_extension('inside_fused', default=False)
|
||||||
|
|
||||||
|
|
||||||
def load_nlp(corpus, config, vectors=None):
|
def load_nlp(corpus, config, vectors=None):
|
||||||
lang = corpus.split('_')[0]
|
lang = corpus.split("_")[0]
|
||||||
nlp = spacy.blank(lang)
|
nlp = spacy.blank(lang)
|
||||||
if config.vectors:
|
if config.vectors:
|
||||||
if not vectors:
|
if not vectors:
|
||||||
raise ValueError("config asks for vectors, but no vectors "
|
raise ValueError(
|
||||||
"directory set on command line (use -v)")
|
"config asks for vectors, but no vectors "
|
||||||
|
"directory set on command line (use -v)"
|
||||||
|
)
|
||||||
if (Path(vectors) / corpus).exists():
|
if (Path(vectors) / corpus).exists():
|
||||||
nlp.vocab.from_disk(Path(vectors) / corpus / 'vocab')
|
nlp.vocab.from_disk(Path(vectors) / corpus / "vocab")
|
||||||
nlp.meta['treebank'] = corpus
|
nlp.meta["treebank"] = corpus
|
||||||
return nlp
|
return nlp
|
||||||
|
|
||||||
|
|
||||||
def initialize_pipeline(nlp, docs, golds, config, device):
|
def initialize_pipeline(nlp, docs, golds, config, device):
|
||||||
nlp.add_pipe(nlp.create_pipe('tagger'))
|
nlp.add_pipe(nlp.create_pipe("tagger"))
|
||||||
nlp.add_pipe(nlp.create_pipe('parser'))
|
nlp.add_pipe(nlp.create_pipe("parser"))
|
||||||
if config.multitask_tag:
|
if config.multitask_tag:
|
||||||
nlp.parser.add_multitask_objective('tag')
|
nlp.parser.add_multitask_objective("tag")
|
||||||
if config.multitask_sent:
|
if config.multitask_sent:
|
||||||
nlp.parser.add_multitask_objective('sent_start')
|
nlp.parser.add_multitask_objective("sent_start")
|
||||||
for gold in golds:
|
for gold in golds:
|
||||||
for tag in gold.tags:
|
for tag in gold.tags:
|
||||||
if tag is not None:
|
if tag is not None:
|
||||||
nlp.tagger.add_label(tag)
|
nlp.tagger.add_label(tag)
|
||||||
if torch is not None and device != -1:
|
if torch is not None and device != -1:
|
||||||
torch.set_default_tensor_type('torch.cuda.FloatTensor')
|
torch.set_default_tensor_type("torch.cuda.FloatTensor")
|
||||||
optimizer = nlp.begin_training(
|
optimizer = nlp.begin_training(
|
||||||
lambda: golds_to_gold_tuples(docs, golds), device=device,
|
lambda: golds_to_gold_tuples(docs, golds),
|
||||||
subword_features=config.subword_features, conv_depth=config.conv_depth,
|
device=device,
|
||||||
bilstm_depth=config.bilstm_depth)
|
subword_features=config.subword_features,
|
||||||
|
conv_depth=config.conv_depth,
|
||||||
|
bilstm_depth=config.bilstm_depth,
|
||||||
|
)
|
||||||
if config.pretrained_tok2vec:
|
if config.pretrained_tok2vec:
|
||||||
_load_pretrained_tok2vec(nlp, config.pretrained_tok2vec)
|
_load_pretrained_tok2vec(nlp, config.pretrained_tok2vec)
|
||||||
return optimizer
|
return optimizer
|
||||||
|
@ -318,27 +354,41 @@ def _load_pretrained_tok2vec(nlp, loc):
|
||||||
"""Load pre-trained weights for the 'token-to-vector' part of the component
|
"""Load pre-trained weights for the 'token-to-vector' part of the component
|
||||||
models, which is typically a CNN. See 'spacy pretrain'. Experimental.
|
models, which is typically a CNN. See 'spacy pretrain'. Experimental.
|
||||||
"""
|
"""
|
||||||
with Path(loc).open('rb') as file_:
|
with Path(loc).open("rb") as file_:
|
||||||
weights_data = file_.read()
|
weights_data = file_.read()
|
||||||
loaded = []
|
loaded = []
|
||||||
for name, component in nlp.pipeline:
|
for name, component in nlp.pipeline:
|
||||||
if hasattr(component, 'model') and hasattr(component.model, 'tok2vec'):
|
if hasattr(component, "model") and hasattr(component.model, "tok2vec"):
|
||||||
component.tok2vec.from_bytes(weights_data)
|
component.tok2vec.from_bytes(weights_data)
|
||||||
loaded.append(name)
|
loaded.append(name)
|
||||||
return loaded
|
return loaded
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
########################
|
########################
|
||||||
# Command line helpers #
|
# Command line helpers #
|
||||||
########################
|
########################
|
||||||
|
|
||||||
|
|
||||||
class Config(object):
|
class Config(object):
|
||||||
def __init__(self, vectors=None, max_doc_length=10, multitask_tag=False,
|
def __init__(
|
||||||
multitask_sent=False, multitask_dep=False, multitask_vectors=None,
|
self,
|
||||||
bilstm_depth=0, nr_epoch=30, min_batch_size=750, max_batch_size=750,
|
vectors=None,
|
||||||
batch_by_words=True, dropout=0.1, conv_depth=4, subword_features=True,
|
max_doc_length=10,
|
||||||
vectors_dir=None, pretrained_tok2vec=None):
|
multitask_tag=False,
|
||||||
|
multitask_sent=False,
|
||||||
|
multitask_dep=False,
|
||||||
|
multitask_vectors=None,
|
||||||
|
bilstm_depth=0,
|
||||||
|
nr_epoch=30,
|
||||||
|
min_batch_size=100,
|
||||||
|
max_batch_size=1000,
|
||||||
|
batch_by_words=True,
|
||||||
|
dropout=0.2,
|
||||||
|
conv_depth=4,
|
||||||
|
subword_features=True,
|
||||||
|
vectors_dir=None,
|
||||||
|
pretrained_tok2vec=None,
|
||||||
|
):
|
||||||
if vectors_dir is not None:
|
if vectors_dir is not None:
|
||||||
if vectors is None:
|
if vectors is None:
|
||||||
vectors = True
|
vectors = True
|
||||||
|
@ -346,13 +396,13 @@ class Config(object):
|
||||||
multitask_vectors = True
|
multitask_vectors = True
|
||||||
for key, value in locals().items():
|
for key, value in locals().items():
|
||||||
setattr(self, key, value)
|
setattr(self, key, value)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def load(cls, loc, vectors_dir=None):
|
def load(cls, loc, vectors_dir=None):
|
||||||
with Path(loc).open('r', encoding='utf8') as file_:
|
with Path(loc).open("r", encoding="utf8") as file_:
|
||||||
cfg = json.load(file_)
|
cfg = json.load(file_)
|
||||||
if vectors_dir is not None:
|
if vectors_dir is not None:
|
||||||
cfg['vectors_dir'] = vectors_dir
|
cfg["vectors_dir"] = vectors_dir
|
||||||
return cls(**cfg)
|
return cls(**cfg)
|
||||||
|
|
||||||
|
|
||||||
|
@ -364,43 +414,59 @@ class Dataset(object):
|
||||||
self.text = None
|
self.text = None
|
||||||
for file_path in self.path.iterdir():
|
for file_path in self.path.iterdir():
|
||||||
name = file_path.parts[-1]
|
name = file_path.parts[-1]
|
||||||
if section in name and name.endswith('conllu'):
|
if section in name and name.endswith("conllu"):
|
||||||
self.conllu = file_path
|
self.conllu = file_path
|
||||||
elif section in name and name.endswith('txt'):
|
elif section in name and name.endswith("txt"):
|
||||||
self.text = file_path
|
self.text = file_path
|
||||||
if self.conllu is None:
|
if self.conllu is None:
|
||||||
msg = "Could not find .txt file in {path} for {section}"
|
msg = "Could not find .txt file in {path} for {section}"
|
||||||
raise IOError(msg.format(section=section, path=path))
|
raise IOError(msg.format(section=section, path=path))
|
||||||
if self.text is None:
|
if self.text is None:
|
||||||
msg = "Could not find .txt file in {path} for {section}"
|
msg = "Could not find .txt file in {path} for {section}"
|
||||||
self.lang = self.conllu.parts[-1].split('-')[0].split('_')[0]
|
self.lang = self.conllu.parts[-1].split("-")[0].split("_")[0]
|
||||||
|
|
||||||
|
|
||||||
class TreebankPaths(object):
|
class TreebankPaths(object):
|
||||||
def __init__(self, ud_path, treebank, **cfg):
|
def __init__(self, ud_path, treebank, **cfg):
|
||||||
self.train = Dataset(ud_path / treebank, 'train')
|
self.train = Dataset(ud_path / treebank, "train")
|
||||||
self.dev = Dataset(ud_path / treebank, 'dev')
|
self.dev = Dataset(ud_path / treebank, "dev")
|
||||||
self.lang = self.train.lang
|
self.lang = self.train.lang
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
|
ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
|
||||||
corpus=("UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
|
corpus=(
|
||||||
"positional", None, str),
|
"UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
|
||||||
|
"positional",
|
||||||
|
None,
|
||||||
|
str,
|
||||||
|
),
|
||||||
parses_dir=("Directory to write the development parses", "positional", None, Path),
|
parses_dir=("Directory to write the development parses", "positional", None, Path),
|
||||||
config=("Path to json formatted config file", "option", "C", Path),
|
config=("Path to json formatted config file", "option", "C", Path),
|
||||||
limit=("Size limit", "option", "n", int),
|
limit=("Size limit", "option", "n", int),
|
||||||
gpu_device=("Use GPU", "option", "g", int),
|
gpu_device=("Use GPU", "option", "g", int),
|
||||||
use_oracle_segments=("Use oracle segments", "flag", "G", int),
|
use_oracle_segments=("Use oracle segments", "flag", "G", int),
|
||||||
vectors_dir=("Path to directory with pre-trained vectors, named e.g. en/",
|
vectors_dir=(
|
||||||
"option", "v", Path),
|
"Path to directory with pre-trained vectors, named e.g. en/",
|
||||||
|
"option",
|
||||||
|
"v",
|
||||||
|
Path,
|
||||||
|
),
|
||||||
)
|
)
|
||||||
def main(ud_dir, parses_dir, corpus, config=None, limit=0, gpu_device=-1, vectors_dir=None,
|
def main(
|
||||||
use_oracle_segments=False):
|
ud_dir,
|
||||||
|
parses_dir,
|
||||||
|
corpus,
|
||||||
|
config=None,
|
||||||
|
limit=0,
|
||||||
|
gpu_device=-1,
|
||||||
|
vectors_dir=None,
|
||||||
|
use_oracle_segments=False,
|
||||||
|
):
|
||||||
spacy.util.fix_random_seed()
|
spacy.util.fix_random_seed()
|
||||||
lang.zh.Chinese.Defaults.use_jieba = False
|
lang.zh.Chinese.Defaults.use_jieba = False
|
||||||
lang.ja.Japanese.Defaults.use_janome = False
|
lang.ja.Japanese.Defaults.use_janome = False
|
||||||
|
|
||||||
if config is not None:
|
if config is not None:
|
||||||
config = Config.load(config, vectors_dir=vectors_dir)
|
config = Config.load(config, vectors_dir=vectors_dir)
|
||||||
else:
|
else:
|
||||||
|
@ -411,19 +477,28 @@ def main(ud_dir, parses_dir, corpus, config=None, limit=0, gpu_device=-1, vector
|
||||||
print("Train and evaluate", corpus, "using lang", paths.lang)
|
print("Train and evaluate", corpus, "using lang", paths.lang)
|
||||||
nlp = load_nlp(paths.lang, config, vectors=vectors_dir)
|
nlp = load_nlp(paths.lang, config, vectors=vectors_dir)
|
||||||
|
|
||||||
docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(),
|
docs, golds = read_data(
|
||||||
max_doc_length=config.max_doc_length,
|
nlp,
|
||||||
limit=limit)
|
paths.train.conllu.open(),
|
||||||
|
paths.train.text.open(),
|
||||||
|
max_doc_length=config.max_doc_length,
|
||||||
|
limit=limit,
|
||||||
|
)
|
||||||
|
|
||||||
optimizer = initialize_pipeline(nlp, docs, golds, config, gpu_device)
|
optimizer = initialize_pipeline(nlp, docs, golds, config, gpu_device)
|
||||||
|
|
||||||
batch_sizes = compounding(config.min_batch_size, config.max_batch_size, 1.001)
|
batch_sizes = compounding(config.min_batch_size, config.max_batch_size, 1.001)
|
||||||
beam_prob = compounding(0.2, 0.8, 1.001)
|
beam_prob = compounding(0.2, 0.8, 1.001)
|
||||||
for i in range(config.nr_epoch):
|
for i in range(config.nr_epoch):
|
||||||
docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(),
|
docs, golds = read_data(
|
||||||
max_doc_length=config.max_doc_length, limit=limit,
|
nlp,
|
||||||
oracle_segments=use_oracle_segments,
|
paths.train.conllu.open(),
|
||||||
raw_text=not use_oracle_segments)
|
paths.train.text.open(),
|
||||||
|
max_doc_length=config.max_doc_length,
|
||||||
|
limit=limit,
|
||||||
|
oracle_segments=use_oracle_segments,
|
||||||
|
raw_text=not use_oracle_segments,
|
||||||
|
)
|
||||||
Xs = list(zip(docs, golds))
|
Xs = list(zip(docs, golds))
|
||||||
random.shuffle(Xs)
|
random.shuffle(Xs)
|
||||||
if config.batch_by_words:
|
if config.batch_by_words:
|
||||||
|
@ -436,27 +511,34 @@ def main(ud_dir, parses_dir, corpus, config=None, limit=0, gpu_device=-1, vector
|
||||||
for batch in batches:
|
for batch in batches:
|
||||||
batch_docs, batch_gold = zip(*batch)
|
batch_docs, batch_gold = zip(*batch)
|
||||||
pbar.update(sum(len(doc) for doc in batch_docs))
|
pbar.update(sum(len(doc) for doc in batch_docs))
|
||||||
nlp.parser.cfg['beam_update_prob'] = next(beam_prob)
|
nlp.parser.cfg["beam_update_prob"] = next(beam_prob)
|
||||||
nlp.update(batch_docs, batch_gold, sgd=optimizer,
|
nlp.update(
|
||||||
drop=config.dropout, losses=losses)
|
batch_docs,
|
||||||
|
batch_gold,
|
||||||
out_path = parses_dir / corpus / 'epoch-{i}.conllu'.format(i=i)
|
sgd=optimizer,
|
||||||
|
drop=config.dropout,
|
||||||
|
losses=losses,
|
||||||
|
)
|
||||||
|
|
||||||
|
out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i)
|
||||||
with nlp.use_params(optimizer.averages):
|
with nlp.use_params(optimizer.averages):
|
||||||
if use_oracle_segments:
|
if use_oracle_segments:
|
||||||
parsed_docs, scores = evaluate(nlp, paths.dev.conllu,
|
parsed_docs, scores = evaluate(
|
||||||
paths.dev.conllu, out_path)
|
nlp, paths.dev.conllu, paths.dev.conllu, out_path
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
parsed_docs, scores = evaluate(nlp, paths.dev.text,
|
parsed_docs, scores = evaluate(
|
||||||
paths.dev.conllu, out_path)
|
nlp, paths.dev.text, paths.dev.conllu, out_path
|
||||||
|
)
|
||||||
print_progress(i, losses, scores)
|
print_progress(i, losses, scores)
|
||||||
|
|
||||||
|
|
||||||
def _render_parses(i, to_render):
|
def _render_parses(i, to_render):
|
||||||
to_render[0].user_data['title'] = "Batch %d" % i
|
to_render[0].user_data["title"] = "Batch %d" % i
|
||||||
with Path('/tmp/parses.html').open('w') as file_:
|
with Path("/tmp/parses.html").open("w") as file_:
|
||||||
html = displacy.render(to_render[:5], style='dep', page=True)
|
html = displacy.render(to_render[:5], style="dep", page=True)
|
||||||
file_.write(html)
|
file_.write(html)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == "__main__":
|
||||||
plac.call(main)
|
plac.call(main)
|
|
@ -4,28 +4,34 @@ from __future__ import unicode_literals, print_function
|
||||||
import pkg_resources
|
import pkg_resources
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import sys
|
import sys
|
||||||
import ujson
|
|
||||||
import requests
|
import requests
|
||||||
|
from wasabi import Printer
|
||||||
|
|
||||||
from ._messages import Messages
|
from ._messages import Messages
|
||||||
from ..compat import path2str, locale_escape
|
from ..compat import path2str
|
||||||
from ..util import prints, get_data_path, read_json
|
from ..util import get_data_path, read_json
|
||||||
from .. import about
|
from .. import about
|
||||||
|
|
||||||
|
|
||||||
def validate():
|
def validate():
|
||||||
"""Validate that the currently installed version of spaCy is compatible
|
"""
|
||||||
|
Validate that the currently installed version of spaCy is compatible
|
||||||
with the installed models. Should be run after `pip install -U spacy`.
|
with the installed models. Should be run after `pip install -U spacy`.
|
||||||
"""
|
"""
|
||||||
r = requests.get(about.__compatibility__)
|
msg = Printer()
|
||||||
if r.status_code != 200:
|
with msg.loading("Loading compatibility table..."):
|
||||||
prints(Messages.M021, title=Messages.M003.format(code=r.status_code),
|
r = requests.get(about.__compatibility__)
|
||||||
exits=1)
|
if r.status_code != 200:
|
||||||
compat = r.json()['spacy']
|
msg.fail(Messages.M003.format(code=r.status_code), Messages.M021, exits=1)
|
||||||
|
msg.good("Loaded compatibility table")
|
||||||
|
compat = r.json()["spacy"]
|
||||||
current_compat = compat.get(about.__version__)
|
current_compat = compat.get(about.__version__)
|
||||||
if not current_compat:
|
if not current_compat:
|
||||||
prints(about.__compatibility__, exits=1,
|
msg.fail(
|
||||||
title=Messages.M022.format(version=about.__version__))
|
Messages.M022.format(version=about.__version__),
|
||||||
|
about.__compatibility__,
|
||||||
|
exits=1,
|
||||||
|
)
|
||||||
all_models = set()
|
all_models = set()
|
||||||
for spacy_v, models in dict(compat).items():
|
for spacy_v, models in dict(compat).items():
|
||||||
all_models.update(models.keys())
|
all_models.update(models.keys())
|
||||||
|
@ -33,33 +39,38 @@ def validate():
|
||||||
compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
|
compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
|
||||||
model_links = get_model_links(current_compat)
|
model_links = get_model_links(current_compat)
|
||||||
model_pkgs = get_model_pkgs(current_compat, all_models)
|
model_pkgs = get_model_pkgs(current_compat, all_models)
|
||||||
incompat_links = {l for l, d in model_links.items() if not d['compat']}
|
incompat_links = {l for l, d in model_links.items() if not d["compat"]}
|
||||||
incompat_models = {d['name'] for _, d in model_pkgs.items()
|
incompat_models = {d["name"] for _, d in model_pkgs.items() if not d["compat"]}
|
||||||
if not d['compat']}
|
incompat_models.update(
|
||||||
incompat_models.update([d['name'] for _, d in model_links.items()
|
[d["name"] for _, d in model_links.items() if not d["compat"]]
|
||||||
if not d['compat']])
|
)
|
||||||
na_models = [m for m in incompat_models if m not in current_compat]
|
na_models = [m for m in incompat_models if m not in current_compat]
|
||||||
update_models = [m for m in incompat_models if m in current_compat]
|
update_models = [m for m in incompat_models if m in current_compat]
|
||||||
|
spacy_dir = Path(__file__).parent.parent
|
||||||
|
|
||||||
|
msg.divider(Messages.M023.format(version=about.__version__))
|
||||||
|
msg.info("spaCy installation: {}".format(path2str(spacy_dir)))
|
||||||
|
|
||||||
prints(path2str(Path(__file__).parent.parent),
|
|
||||||
title=Messages.M023.format(version=about.__version__))
|
|
||||||
if model_links or model_pkgs:
|
if model_links or model_pkgs:
|
||||||
print(get_row('TYPE', 'NAME', 'MODEL', 'VERSION', ''))
|
header = ("TYPE", "NAME", "MODEL", "VERSION", "")
|
||||||
|
rows = []
|
||||||
for name, data in model_pkgs.items():
|
for name, data in model_pkgs.items():
|
||||||
print(get_model_row(current_compat, name, data, 'package'))
|
rows.append(get_model_row(current_compat, name, data, msg))
|
||||||
for name, data in model_links.items():
|
for name, data in model_links.items():
|
||||||
print(get_model_row(current_compat, name, data, 'link'))
|
rows.append(get_model_row(current_compat, name, data, msg, "link"))
|
||||||
|
msg.table(rows, header=header)
|
||||||
else:
|
else:
|
||||||
prints(Messages.M024, exits=0)
|
msg.text(Messages.M024, exits=0)
|
||||||
if update_models:
|
if update_models:
|
||||||
cmd = ' python -m spacy download {}'
|
msg.divider("Install updates")
|
||||||
print("\n " + Messages.M025)
|
cmd = "python -m spacy download {}"
|
||||||
print('\n'.join([cmd.format(pkg) for pkg in update_models]))
|
print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
|
||||||
if na_models:
|
if na_models:
|
||||||
prints(Messages.M025.format(version=about.__version__,
|
msg.text(
|
||||||
models=', '.join(na_models)))
|
Messages.M025.format(version=about.__version__, models=", ".join(na_models))
|
||||||
|
)
|
||||||
if incompat_links:
|
if incompat_links:
|
||||||
prints(Messages.M027.format(path=path2str(get_data_path())))
|
msg.text(Messages.M027.format(path=path2str(get_data_path())))
|
||||||
if incompat_models or incompat_links:
|
if incompat_models or incompat_links:
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
|
@ -70,50 +81,48 @@ def get_model_links(compat):
|
||||||
if data_path:
|
if data_path:
|
||||||
models = [p for p in data_path.iterdir() if is_model_path(p)]
|
models = [p for p in data_path.iterdir() if is_model_path(p)]
|
||||||
for model in models:
|
for model in models:
|
||||||
meta_path = Path(model) / 'meta.json'
|
meta_path = Path(model) / "meta.json"
|
||||||
if not meta_path.exists():
|
if not meta_path.exists():
|
||||||
continue
|
continue
|
||||||
meta = read_json(meta_path)
|
meta = read_json(meta_path)
|
||||||
link = model.parts[-1]
|
link = model.parts[-1]
|
||||||
name = meta['lang'] + '_' + meta['name']
|
name = meta["lang"] + "_" + meta["name"]
|
||||||
links[link] = {'name': name, 'version': meta['version'],
|
links[link] = {
|
||||||
'compat': is_compat(compat, name, meta['version'])}
|
"name": name,
|
||||||
|
"version": meta["version"],
|
||||||
|
"compat": is_compat(compat, name, meta["version"]),
|
||||||
|
}
|
||||||
return links
|
return links
|
||||||
|
|
||||||
|
|
||||||
def get_model_pkgs(compat, all_models):
|
def get_model_pkgs(compat, all_models):
|
||||||
pkgs = {}
|
pkgs = {}
|
||||||
for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
|
for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
|
||||||
package = pkg_name.replace('-', '_')
|
package = pkg_name.replace("-", "_")
|
||||||
if package in all_models:
|
if package in all_models:
|
||||||
version = pkg_data.version
|
version = pkg_data.version
|
||||||
pkgs[pkg_name] = {'name': package, 'version': version,
|
pkgs[pkg_name] = {
|
||||||
'compat': is_compat(compat, package, version)}
|
"name": package,
|
||||||
|
"version": version,
|
||||||
|
"compat": is_compat(compat, package, version),
|
||||||
|
}
|
||||||
return pkgs
|
return pkgs
|
||||||
|
|
||||||
|
|
||||||
def get_model_row(compat, name, data, type='package'):
|
def get_model_row(compat, name, data, msg, model_type="package"):
|
||||||
tpl_red = '\x1b[38;5;1m{}\x1b[0m'
|
if data["compat"]:
|
||||||
tpl_green = '\x1b[38;5;2m{}\x1b[0m'
|
comp = msg.text("", color="green", icon="good", no_print=True)
|
||||||
if data['compat']:
|
version = msg.text(data["version"], color="green", no_print=True)
|
||||||
comp = tpl_green.format(locale_escape('✔', errors='ignore'))
|
|
||||||
version = tpl_green.format(data['version'])
|
|
||||||
else:
|
else:
|
||||||
comp = '--> {}'.format(compat.get(data['name'], ['n/a'])[0])
|
version = msg.text(data["version"], color="red", no_print=True)
|
||||||
version = tpl_red.format(data['version'])
|
comp = "--> {}".format(compat.get(data["name"], ["n/a"])[0])
|
||||||
return get_row(type, name, data['name'], version, comp)
|
return (model_type, name, data["name"], version, comp)
|
||||||
|
|
||||||
|
|
||||||
def get_row(*args):
|
|
||||||
tpl_row = ' {:<10}' + (' {:<20}' * 4)
|
|
||||||
return tpl_row.format(*args)
|
|
||||||
|
|
||||||
|
|
||||||
def is_model_path(model_path):
|
def is_model_path(model_path):
|
||||||
exclude = ['cache', 'pycache', '__pycache__']
|
exclude = ["cache", "pycache", "__pycache__"]
|
||||||
name = model_path.parts[-1]
|
name = model_path.parts[-1]
|
||||||
return (model_path.is_dir() and name not in exclude
|
return model_path.is_dir() and name not in exclude and not name.startswith(".")
|
||||||
and not name.startswith('.'))
|
|
||||||
|
|
||||||
|
|
||||||
def is_compat(compat, name, version):
|
def is_compat(compat, name, version):
|
||||||
|
@ -122,6 +131,6 @@ def is_compat(compat, name, version):
|
||||||
|
|
||||||
def reformat_version(version):
|
def reformat_version(version):
|
||||||
"""Hack to reformat old versions ending on '-alpha' to match pip format."""
|
"""Hack to reformat old versions ending on '-alpha' to match pip format."""
|
||||||
if version.endswith('-alpha'):
|
if version.endswith("-alpha"):
|
||||||
return version.replace('-alpha', 'a0')
|
return version.replace("-alpha", "a0")
|
||||||
return version.replace('-alpha', 'a')
|
return version.replace("-alpha", "a")
|
||||||
|
|
|
@ -1,59 +0,0 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import plac
|
|
||||||
import json
|
|
||||||
import spacy
|
|
||||||
import numpy
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from ..vectors import Vectors
|
|
||||||
from ..util import prints, ensure_path
|
|
||||||
|
|
||||||
|
|
||||||
@plac.annotations(
|
|
||||||
lang=("model language", "positional", None, str),
|
|
||||||
output_dir=("model output directory", "positional", None, Path),
|
|
||||||
lexemes_loc=("location of JSONL-formatted lexical data", "positional",
|
|
||||||
None, Path),
|
|
||||||
vectors_loc=("optional: location of vectors data, as numpy .npz",
|
|
||||||
"positional", None, str),
|
|
||||||
prune_vectors=("optional: number of vectors to prune to.",
|
|
||||||
"option", "V", int)
|
|
||||||
)
|
|
||||||
def make_vocab(lang, output_dir, lexemes_loc, vectors_loc=None, prune_vectors=-1):
|
|
||||||
"""Compile a vocabulary from a lexicon jsonl file and word vectors."""
|
|
||||||
if not lexemes_loc.exists():
|
|
||||||
prints(lexemes_loc, title="Can't find lexical data", exits=1)
|
|
||||||
vectors_loc = ensure_path(vectors_loc)
|
|
||||||
nlp = spacy.blank(lang)
|
|
||||||
for word in nlp.vocab:
|
|
||||||
word.rank = 0
|
|
||||||
lex_added = 0
|
|
||||||
with lexemes_loc.open() as file_:
|
|
||||||
for line in file_:
|
|
||||||
if line.strip():
|
|
||||||
attrs = json.loads(line)
|
|
||||||
if 'settings' in attrs:
|
|
||||||
nlp.vocab.cfg.update(attrs['settings'])
|
|
||||||
else:
|
|
||||||
lex = nlp.vocab[attrs['orth']]
|
|
||||||
lex.set_attrs(**attrs)
|
|
||||||
assert lex.rank == attrs['id']
|
|
||||||
lex_added += 1
|
|
||||||
if vectors_loc is not None:
|
|
||||||
vector_data = numpy.load(vectors_loc.open('rb'))
|
|
||||||
nlp.vocab.vectors = Vectors(data=vector_data)
|
|
||||||
for word in nlp.vocab:
|
|
||||||
if word.rank:
|
|
||||||
nlp.vocab.vectors.add(word.orth, row=word.rank)
|
|
||||||
|
|
||||||
if prune_vectors >= 1:
|
|
||||||
remap = nlp.vocab.prune_vectors(prune_vectors)
|
|
||||||
if not output_dir.exists():
|
|
||||||
output_dir.mkdir()
|
|
||||||
nlp.to_disk(output_dir)
|
|
||||||
vec_added = len(nlp.vocab.vectors)
|
|
||||||
prints("{} entries, {} vectors".format(lex_added, vec_added), output_dir,
|
|
||||||
title="Sucessfully compiled vocab and vectors, and saved model")
|
|
||||||
return nlp
|
|
|
@@ -5,7 +5,6 @@ import os

import sys
import ujson
import itertools
-import locale

from thinc.neural.util import copy_array

@@ -136,12 +135,3 @@ def import_file(name, loc):

    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
-
-
-def locale_escape(string, errors="replace"):
-    """
-    Mangle non-supported characters, for savages with ascii terminals.
-    """
-    encoding = locale.getpreferredencoding()
-    string = string.encode(encoding, errors).decode("utf8")
-    return string
@@ -5,7 +5,7 @@ from .render import DependencyRenderer, EntityRenderer

from ..tokens import Doc, Span
from ..compat import b_to_str
from ..errors import Errors, Warnings, user_warning
-from ..util import prints, is_in_jupyter
+from ..util import is_in_jupyter


_html = {}

@@ -72,14 +72,12 @@ def serve(

    render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
    httpd = simple_server.make_server("0.0.0.0", port, app)
-   prints(
-       "Using the '{}' visualizer".format(style),
-       title="Serving on port {}...".format(port),
-   )
+   print("\nUsing the '{}' visualizer".format(style))
+   print("Serving on port {}...\n".format(port))
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
-       prints("Shutting down server on port {}.".format(port))
+       print("Shutting down server on port {}.".format(port))
    finally:
        httpd.server_close()
@@ -278,6 +278,12 @@ class Errors(object):

    E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A token"
            " can only be part of one entity, so make sure the entities you're "
            "setting don't overlap.")
+   E104 = ("Can't find JSON schema for '{name}'.")
+   E105 = ("The Doc.print_tree() method is now deprecated. Please use "
+           "Doc.json() instead.")
+   E106 = ("Can't find doc._.{attr} attribute specified in the underscore "
+           "settings: {opts}")
+   E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}")


@add_codes

113  spacy/gold.pyx
@@ -15,7 +15,7 @@ import json
 
 import ujson
 
 from . import _align
 from .syntax import nonproj
 from .tokens import Doc
 from .errors import Errors
@@ -172,7 +172,7 @@ class GoldCorpus(object):
     def dev_tuples(self):
         locs = (self.tmp_dir / 'dev').iterdir()
         yield from self.read_tuples(locs, limit=self.limit)
 
     @property
     def train_tuples(self):
         locs = (self.tmp_dir / 'train').iterdir()
@@ -271,6 +271,53 @@ def _corrupt(c, noise_level):
         return c.lower()
 
 
+def read_json_object(json_corpus_section):
+    """Take a list of JSON-formatted documents (e.g. from an already loaded
+    training data file) and yield tuples in the GoldParse format.
+
+    json_corpus_section (list): The data.
+    YIELDS (tuple): The reformatted data.
+    """
+    for json_doc in json_corpus_section:
+        tuple_doc = json_to_tuple(json_doc)
+        for tuple_paragraph in tuple_doc:
+            yield tuple_paragraph
+
+
+def json_to_tuple(doc):
+    """Convert an item in the JSON-formatted training data to the tuple format
+    used by GoldParse.
+
+    doc (dict): One entry in the training data.
+    YIELDS (tuple): The reformatted data.
+    """
+    paragraphs = []
+    for paragraph in doc['paragraphs']:
+        sents = []
+        for sent in paragraph['sentences']:
+            words = []
+            ids = []
+            tags = []
+            heads = []
+            labels = []
+            ner = []
+            for i, token in enumerate(sent['tokens']):
+                words.append(token['orth'])
+                ids.append(i)
+                tags.append(token.get('tag', '-'))
+                heads.append(token.get('head', 0) + i)
+                labels.append(token.get('dep', ''))
+                # Ensure ROOT label is case-insensitive
+                if labels[-1].lower() == 'root':
+                    labels[-1] = 'ROOT'
+                ner.append(token.get('ner', '-'))
+            sents.append([
+                [ids, words, tags, heads, labels, ner],
+                sent.get('brackets', [])])
+        if sents:
+            yield [paragraph.get('raw', None), sents]
+
+
 def read_json_file(loc, docs_filter=None, limit=None):
     loc = util.ensure_path(loc)
     if loc.is_dir():
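The two helpers above make the conversion from the JSON training format to GoldParse tuples reusable outside of file reading. A minimal sketch of what they consume and produce follows; the example document and every value in it are made up for illustration, and it assumes this branch is installed so that `json_to_tuple()` is importable from `spacy.gold`:

```python
# Sketch only: assumes this branch of spaCy, so json_to_tuple() exists in spacy.gold.
from spacy.gold import json_to_tuple

# A minimal, made-up document in the JSON training format.
json_doc = {
    "id": 0,
    "paragraphs": [
        {
            "raw": "I like cats",
            "sentences": [
                {
                    "tokens": [
                        {"orth": "I", "tag": "PRP", "head": 1, "dep": "nsubj", "ner": "O"},
                        {"orth": "like", "tag": "VBP", "head": 0, "dep": "root", "ner": "O"},
                        {"orth": "cats", "tag": "NNS", "head": -1, "dep": "dobj", "ner": "O"},
                    ],
                    "brackets": [],
                }
            ],
        }
    ],
}

for raw, sents in json_to_tuple(json_doc):
    (ids, words, tags, heads, labels, ner), brackets = sents[0]
    print(words)  # ['I', 'like', 'cats']
    print(heads)  # [1, 1, 1] - relative head offsets become absolute token indices
```

Note that the per-token `head` offsets in the JSON format are relative, and the helper converts them to absolute token indices for GoldParse.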
@@ -280,31 +327,8 @@ def read_json_file(loc, docs_filter=None, limit=None):
         for doc in _json_iterate(loc):
             if docs_filter is not None and not docs_filter(doc):
                 continue
-            paragraphs = []
-            for paragraph in doc['paragraphs']:
-                sents = []
-                for sent in paragraph['sentences']:
-                    words = []
-                    ids = []
-                    tags = []
-                    heads = []
-                    labels = []
-                    ner = []
-                    for i, token in enumerate(sent['tokens']):
-                        words.append(token['orth'])
-                        ids.append(i)
-                        tags.append(token.get('tag', '-'))
-                        heads.append(token.get('head', 0) + i)
-                        labels.append(token.get('dep', ''))
-                        # Ensure ROOT label is case-insensitive
-                        if labels[-1].lower() == 'root':
-                            labels[-1] = 'ROOT'
-                        ner.append(token.get('ner', '-'))
-                    sents.append([
-                        [ids, words, tags, heads, labels, ner],
-                        sent.get('brackets', [])])
-            if sents:
-                yield [paragraph.get('raw', None), sents]
+            for json_tuple in json_to_tuple(doc):
+                yield json_tuple
 
 
 def _json_iterate(loc):
@@ -573,32 +597,19 @@ cdef class GoldParse:
                 self.c.sent_start[i] = 0
 
 
-def docs_to_json(id, docs):
-    '''Convert a list of Doc objects into the JSON-serializable format used by
-    the spacy train command. Each Doc in the list will be interpreted as a
-    paragraph.
-    '''
+def docs_to_json(docs, underscore=None):
+    """Convert a list of Doc objects into the JSON-serializable format used by
+    the spacy train command.
+
+    docs (iterable / Doc): The Doc object(s) to convert.
+    underscore (list): Optional list of string names of custom doc._.
+        attributes. Attribute values need to be JSON-serializable. Values will
+        be added to an "_" key in the data, e.g. "_": {"foo": "bar"}.
+    RETURNS (list): The data in spaCy's JSON format.
+    """
     if isinstance(docs, Doc):
         docs = [docs]
-    json_doc = {'id': id, 'paragraphs': []}
-    for i, doc in enumerate(docs):
-        json_para = {'raw': doc.text, 'sentences': []}
-        ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
-        biluo_tags = biluo_tags_from_offsets(doc, ent_offsets)
-        for j, sent in enumerate(doc.sents):
-            json_sent = {'tokens': [], 'brackets': []}
-            for token in sent:
-                json_token = {"id": token.i, "orth": token.text}
-                if doc.is_tagged:
-                    json_token['tag'] = token.tag_
-                if doc.is_parsed:
-                    json_token['head'] = token.head.i-token.i
-                    json_token['dep'] = token.dep_
-                json_token['ner'] = biluo_tags[token.i]
-                json_sent['tokens'].append(json_token)
-            json_para['sentences'].append(json_sent)
-        json_doc['paragraphs'].append(json_para)
-    return json_doc
+    return [doc.to_json(underscore=underscore) for doc in docs]
 
 
 def biluo_tags_from_offsets(doc, entities, missing='O'):
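With the new signature, `docs_to_json()` is a thin wrapper over `Doc.to_json()` and returns a list of per-doc dicts instead of a single `{"id": ..., "paragraphs": ...}` object. A usage sketch, where the model name and the custom attribute are illustrative and not part of this diff:

```python
# Sketch: serializing Doc objects with the new docs_to_json(docs, underscore=None).
# Assumes this branch of spaCy plus an English model; "source" is a made-up attribute.
import spacy
from spacy.gold import docs_to_json
from spacy.tokens import Doc

Doc.set_extension("source", default=None)

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
doc._.source = "example.txt"

data = docs_to_json([doc], underscore=["source"])
print(data[0]["ents"])         # entity character offsets and labels
print(data[0]["_"]["source"])  # custom doc._. values end up under the "_" key
```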
@@ -341,21 +341,3 @@ def test_lowest_common_ancestor(en_tokenizer):
     assert lca[1, 1] == 1
     assert lca[0, 1] == 2
     assert lca[1, 2] == 2
-
-
-def test_parse_tree(en_tokenizer):
-    """Tests doc.print_tree() method."""
-    text = "I like New York in Autumn."
-    heads = [1, 0, 1, -2, -3, -1, -5]
-    tags = ["PRP", "IN", "NNP", "NNP", "IN", "NNP", "."]
-    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags)
-    # full method parse_tree(text) is a trivial composition
-    trees = doc.print_tree()
-    assert len(trees) > 0
-    tree = trees[0]
-    assert all(
-        k in list(tree.keys())
-        for k in ["word", "lemma", "NE", "POS_fine", "POS_coarse", "arc", "modifiers"]
-    )
-    assert tree["word"] == "like"  # check root is correct
spacy/tests/doc/test_to_json.py (new file, 65 lines)
@@ -0,0 +1,65 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+from spacy.cli.schemas import get_schema, validate_json
+from spacy.tokens import Doc
+from ..util import get_doc
+
+
+@pytest.fixture()
+def doc(en_vocab):
+    words = ["c", "d", "e"]
+    pos = ["VERB", "NOUN", "NOUN"]
+    tags = ["VBP", "NN", "NN"]
+    heads = [0, -1, -2]
+    deps = ["ROOT", "dobj", "dobj"]
+    ents = [(1, 2, "ORG")]
+    return get_doc(
+        en_vocab, words=words, pos=pos, tags=tags, heads=heads, deps=deps, ents=ents
+    )
+
+
+def test_doc_to_json(doc):
+    json_doc = doc.to_json()
+    assert json_doc["text"] == "c d e "
+    assert len(json_doc["tokens"]) == 3
+    assert json_doc["tokens"][0]["pos"] == "VERB"
+    assert json_doc["tokens"][0]["tag"] == "VBP"
+    assert json_doc["tokens"][0]["dep"] == "ROOT"
+    assert len(json_doc["ents"]) == 1
+    assert json_doc["ents"][0]["start"] == 2  # character offset!
+    assert json_doc["ents"][0]["end"] == 3  # character offset!
+    assert json_doc["ents"][0]["label"] == "ORG"
+
+
+def test_doc_to_json_underscore(doc):
+    Doc.set_extension("json_test1", default=False)
+    Doc.set_extension("json_test2", default=False)
+    doc._.json_test1 = "hello world"
+    doc._.json_test2 = [1, 2, 3]
+    json_doc = doc.to_json(underscore=["json_test1", "json_test2"])
+    assert "_" in json_doc
+    assert json_doc["_"]["json_test1"] == "hello world"
+    assert json_doc["_"]["json_test2"] == [1, 2, 3]
+
+
+def test_doc_to_json_underscore_error_attr(doc):
+    """Test that Doc.to_json() raises an error if a custom attribute doesn't
+    exist in the ._ space."""
+    with pytest.raises(ValueError):
+        doc.to_json(underscore=["json_test3"])
+
+
+def test_doc_to_json_underscore_error_serialize(doc):
+    """Test that Doc.to_json() raises an error if a custom attribute value
+    isn't JSON-serializable."""
+    Doc.set_extension("json_test4", method=lambda doc: doc.text)
+    with pytest.raises(ValueError):
+        doc.to_json(underscore=["json_test4"])
+
+
+def test_doc_to_json_valid_training(doc):
+    json_doc = doc.to_json()
+    errors = validate_json([json_doc], get_schema("training"))
+    assert not errors
@@ -3,7 +3,6 @@ from __future__ import unicode_literals
 
 from spacy.matcher import PhraseMatcher
 from spacy.tokens import Doc
-
 from ..util import get_doc
 
 
@@ -2,9 +2,7 @@
 from __future__ import unicode_literals
 
 from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
-from spacy.gold import docs_to_json
 from spacy.tokens import Doc
-from .util import get_doc
 
 
 def test_gold_biluo_U(en_vocab):
@@ -52,34 +50,3 @@ def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
     assert biluo_tags_converted == biluo_tags
     offsets_converted = offsets_from_biluo_tags(doc, biluo_tags)
     assert offsets_converted == offsets
-
-
-def test_docs_to_json(en_vocab):
-    """Test we can convert a list of Doc objects into the JSON-serializable
-    format we use for training.
-    """
-    docs = [
-        get_doc(
-            en_vocab,
-            words=["a", "b"],
-            pos=["VBP", "NN"],
-            heads=[0, -1],
-            deps=["ROOT", "dobj"],
-            ents=[],
-        ),
-        get_doc(
-            en_vocab,
-            words=["c", "d", "e"],
-            pos=["VBP", "NN", "NN"],
-            heads=[0, -1, -2],
-            deps=["ROOT", "dobj", "dobj"],
-            ents=[(1, 2, "ORG")],
-        ),
-    ]
-    json_doc = docs_to_json(0, docs)
-    assert json_doc["id"] == 0
-    assert len(json_doc["paragraphs"]) == 2
-    assert len(json_doc["paragraphs"][0]["sentences"]) == 1
-    assert len(json_doc["paragraphs"][1]["sentences"]) == 1
-    assert len(json_doc["paragraphs"][0]["sentences"][0]["tokens"]) == 2
-    assert len(json_doc["paragraphs"][1]["sentences"][0]["tokens"]) == 3
spacy/tests/test_json_schemas.py (new file, 44 lines)
@@ -0,0 +1,44 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from spacy.cli.schemas import validate_json, get_schema
+import pytest
+
+
+@pytest.fixture(scope="session")
+def training_schema():
+    return get_schema("training")
+
+
+def test_json_schema_get():
+    schema = get_schema("training")
+    assert schema
+    with pytest.raises(ValueError):
+        schema = get_schema("xxx")
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        {"text": "Hello world"},
+        {"text": "Hello", "ents": [{"start": 0, "end": 5, "label": "TEST"}]},
+    ],
+)
+def test_json_schema_training_valid(data, training_schema):
+    errors = validate_json([data], training_schema)
+    assert not errors
+
+
+@pytest.mark.parametrize(
+    "data,n_errors",
+    [
+        ({"spans": []}, 1),
+        ({"text": "Hello", "ents": [{"start": "0", "end": "5", "label": "TEST"}]}, 2),
+        ({"text": "Hello", "ents": [{"start": 0, "end": 5}]}, 1),
+        ({"text": "Hello", "ents": [{"start": 0, "end": 5, "label": "test"}]}, 1),
+        ({"text": "spaCy", "tokens": [{"pos": "PROPN"}]}, 2),
+    ],
+)
+def test_json_schema_training_invalid(data, n_errors, training_schema):
+    errors = validate_json([data], training_schema)
+    assert len(errors) == n_errors
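The `validate_json()` and `get_schema()` helpers exercised here live in `spacy.cli.schemas` on this branch and are backed by the `jsonschema` package pinned in the requirements. For orientation, roughly equivalent standalone validation with `jsonschema` directly looks like the sketch below; the inline schema is a simplified stand-in for the real training schema, not a copy of it:

```python
# Sketch: validating training-format dicts with jsonschema directly.
# The schema here is a simplified stand-in, not spaCy's actual training schema.
from jsonschema import Draft4Validator

schema = {
    "type": "object",
    "required": ["text"],
    "properties": {
        "text": {"type": "string"},
        "ents": {
            "type": "array",
            "items": {
                "type": "object",
                "required": ["start", "end", "label"],
                "properties": {
                    "start": {"type": "integer"},
                    "end": {"type": "integer"},
                    # uppercase labels only, similar to the invalid "test" case above
                    "label": {"type": "string", "pattern": "^[A-Z0-9]*$"},
                },
            },
        },
    },
}

validator = Draft4Validator(schema)
data = {"text": "Hello", "ents": [{"start": 0, "end": 5, "label": "TEST"}]}
errors = list(validator.iter_errors(data))
assert not errors
```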
@@ -1,7 +1,6 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-
 import pytest
 from pathlib import Path
 from spacy import util
@@ -20,7 +20,6 @@ from .span cimport Span
 from .token cimport Token
 from .span cimport Span
 from .token cimport Token
-from .printers import parse_tree
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..typedefs cimport attr_t, flags_t
 from ..attrs import intify_attrs, IDS
@@ -29,7 +28,7 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
 from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
 from ..attrs cimport ENT_TYPE, SENT_START
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
-from ..util import normalize_slice
+from ..util import normalize_slice, is_json_serializable
 from ..compat import is_config, copy_reg, pickle, basestring_
 from ..errors import deprecation_warning, models_warning, user_warning
 from ..errors import Errors, Warnings
@@ -959,31 +958,48 @@ cdef class Doc:
         return self[start]
 
     def print_tree(self, light=False, flat=False):
-        """Returns the parse trees in JSON (dict) format.
+        raise ValueError(Errors.E105)
 
-        light (bool): Don't include lemmas or entities.
-        flat (bool): Don't include arcs or modifiers.
-        RETURNS (dict): Parse tree as dict.
+    def to_json(self, underscore=None):
+        """Convert a Doc to JSON. Produces the same format used by the spacy
+        train command.
 
-        EXAMPLE:
-            >>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
-            >>> trees = doc.print_tree()
-            >>> trees[1]
-            {'modifiers': [
-                {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice',
-                 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP',
-                 'lemma': 'Alice'},
-                {'modifiers': [
-                    {'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
-                     'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
-                 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN',
-                 'POS_fine': 'NN', 'lemma': 'pizza'},
-                {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct',
-                 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}],
-             'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
-             'POS_fine': 'VBD', 'lemma': 'eat'}
+        underscore (list): Optional list of string names of custom doc._.
+        attributes. Attribute values need to be JSON-serializable. Values will
+        be added to an "_" key in the data, e.g. "_": {"foo": "bar"}.
+        RETURNS (dict): The data in spaCy's JSON format.
         """
-        return parse_tree(self, light=light, flat=flat)
+        data = {'text': self.text}
+        data['ents'] = [{'start': ent.start_char, 'end': ent.end_char,
+                         'label': ent.label_} for ent in self.ents]
+        sents = list(self.sents)
+        if sents:
+            data['sents'] = [{'start': sent.start_char, 'end': sent.end_char}
+                             for sent in sents]
+        if self.cats:
+            data['cats'] = self.cats
+        data['tokens'] = []
+        for token in self:
+            token_data = {'id': token.i, 'start': token.idx, 'end': token.idx + len(token)}
+            if token.pos_:
+                token_data['pos'] = token.pos_
+            if token.tag_:
+                token_data['tag'] = token.tag_
+            if token.dep_:
+                token_data['dep'] = token.dep_
+            if token.head:
+                token_data['head'] = token.head.i
+            data['tokens'].append(token_data)
+        if underscore:
+            data['_'] = {}
+            for attr in underscore:
+                if not self.has_extension(attr):
+                    raise ValueError(Errors.E106.format(attr=attr, opts=underscore))
+                value = self._.get(attr)
+                if not is_json_serializable(value):
+                    raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
+                data['_'][attr] = value
+        return data
 
 
 cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2:
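For orientation, a sketch of the structure `Doc.to_json()` produces, based on the implementation above. The model name and the concrete values are illustrative; note that the token `"head"` here is the absolute index of the head token, unlike the relative offsets used in the training JSON:

```python
# Sketch: the shape of Doc.to_json() output (token values are illustrative).
# Requires a model with tagger and parser so pos/tag/dep/head are populated.
import spacy

nlp = spacy.load("en_core_web_sm")   # model name is an assumption
doc = nlp("London is big.")
data = doc.to_json()

# data looks roughly like:
# {
#     "text": "London is big.",
#     "ents": [{"start": 0, "end": 6, "label": "GPE"}],
#     "sents": [{"start": 0, "end": 14}],
#     "tokens": [
#         {"id": 0, "start": 0, "end": 6, "pos": "PROPN", "tag": "NNP",
#          "dep": "nsubj", "head": 1},   # "head" is an absolute token index
#         ...
#     ],
# }
assert set(data) >= {"text", "ents", "tokens"}
```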
@@ -1,74 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from .doc import Doc
-from ..symbols import HEAD, TAG, DEP, ENT_IOB, ENT_TYPE
-
-
-def merge_ents(doc):
-    """Helper: merge adjacent entities into single tokens; modifies the doc."""
-    for ent in doc.ents:
-        ent.merge(tag=ent.root.tag_, lemma=ent.text, ent_type=ent.label_)
-    return doc
-
-
-def format_POS(token, light, flat):
-    """Helper: form the POS output for a token."""
-    subtree = dict([
-        ("word", token.text),
-        ("lemma", token.lemma_),  # trigger
-        ("NE", token.ent_type_),  # trigger
-        ("POS_fine", token.tag_),
-        ("POS_coarse", token.pos_),
-        ("arc", token.dep_),
-        ("modifiers", [])
-    ])
-    if light:
-        subtree.pop("lemma")
-        subtree.pop("NE")
-    if flat:
-        subtree.pop("arc")
-        subtree.pop("modifiers")
-    return subtree
-
-
-def POS_tree(root, light=False, flat=False):
-    """Helper: generate a POS tree for a root token. The doc must have
-    `merge_ents(doc)` ran on it.
-    """
-    subtree = format_POS(root, light=light, flat=flat)
-    for c in root.children:
-        subtree["modifiers"].append(POS_tree(c))
-    return subtree
-
-
-def parse_tree(doc, light=False, flat=False):
-    """Make a copy of the doc and construct a syntactic parse tree similar to
-    displaCy. Generates the POS tree for all sentences in a doc.
-
-    doc (Doc): The doc for parsing.
-    RETURNS (dict): The parse tree.
-
-    EXAMPLE:
-        >>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
-        >>> trees = doc.print_tree()
-        >>> trees[1]
-        {'modifiers': [
-            {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
-             'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
-            {'modifiers': [
-                {'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
-                 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
-             'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN',
-             'POS_fine': 'NN', 'lemma': 'pizza'},
-            {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct',
-             'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}],
-         'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
-         'POS_fine': 'VBD', 'lemma': 'eat'}
-    """
-    doc_clone = Doc(doc.vocab, words=[w.text for w in doc])
-    doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE],
-                         doc.to_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE]))
-    merge_ents(doc_clone)  # merge the entities into single tokens first
-    return [POS_tree(sent.root, light=light, flat=flat)
-            for sent in doc_clone.sents]
@@ -7,8 +7,6 @@ import pkg_resources
 import importlib
 import regex as re
 from pathlib import Path
-import sys
-import textwrap
 import random
 from collections import OrderedDict
 from thinc.neural._classes.model import Model
@@ -18,9 +16,10 @@ import cytoolz
 import itertools
 import numpy.random
 
+
 from .symbols import ORTH
 from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_
-from .compat import import_file
+from .compat import import_file, json_dumps
 from .errors import Errors
 
 # Import these directly from Thinc, so that we're sure we always have the
@@ -541,6 +540,16 @@ def read_json(location):
         return ujson.load(f)
 
 
+def write_json(file_path, contents):
+    """Create a .json file and dump contents.
+
+    file_path (unicode / Path): The path to the output file.
+    contents: The JSON-serializable contents to output.
+    """
+    with Path(file_path).open("w", encoding="utf8") as f:
+        f.write(json_dumps(contents))
+
+
 def read_jsonl(file_path):
     """Read a .jsonl file and yield its contents line by line.
 
@@ -555,6 +564,29 @@ def read_jsonl(file_path):
             continue
 
 
+def write_jsonl(file_path, lines):
+    """Create a .jsonl file and dump contents.
+
+    file_path (unicode / Path): The path to the output file.
+    lines (list): The JSON-serializable contents of each line.
+    """
+    data = [json_dumps(line) for line in lines]
+    with Path(file_path).open("w", encoding="utf-8") as f:
+        f.write("\n".join(data))
+
+
+def is_json_serializable(obj):
+    """Check if a Python object is JSON-serializable."""
+    if hasattr(obj, "__call__"):
+        # Check this separately here to prevent infinite recursions
+        return False
+    try:
+        ujson.dumps(obj)
+        return True
+    except TypeError:
+        return False
+
+
 def get_raw_input(description, default=False):
     """Get user input from the command line via raw_input / input.
 
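A small usage sketch for the new I/O helpers added to `spacy.util`; the file paths below are placeholders, and it assumes this branch so the helpers exist:

```python
# Sketch: using the new JSON helpers in spacy.util. Paths are placeholders.
from spacy import util

util.write_json("/tmp/meta.json", {"lang": "en", "pipeline": ["tagger"]})
util.write_jsonl("/tmp/data.jsonl", [{"text": "Hello"}, {"text": "World"}])

for line in util.read_jsonl("/tmp/data.jsonl"):
    print(line["text"])

# is_json_serializable() is what Doc.to_json() uses to vet custom
# doc._. attribute values before writing them out.
assert util.is_json_serializable({"foo": "bar"})
assert not util.is_json_serializable(lambda doc: doc.text)
```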
@@ -602,21 +634,6 @@ def from_disk(path, readers, exclude):
     return path
 
 
-def print_table(data, title=None):
-    """Print data in table format.
-
-    data (dict or list of tuples): Label/value pairs.
-    title (unicode or None): Title, will be printed above.
-    """
-    if isinstance(data, dict):
-        data = list(data.items())
-    tpl_row = " {:<15}" * len(data[0])
-    table = "\n".join([tpl_row.format(l, unicode_(v)) for l, v in data])
-    if title:
-        print("\n \033[93m{}\033[0m".format(title))
-    print("\n{}\n".format(table))
-
-
 def print_markdown(data, title=None):
     """Print data in GitHub-flavoured Markdown format for issues etc.
 
@@ -638,44 +655,6 @@ def print_markdown(data, title=None):
     print("\n{}\n".format("\n".join(markdown)))
 
 
-def prints(*texts, **kwargs):
-    """Print formatted message (manual ANSI escape sequences to avoid
-    dependency)
-
-    *texts (unicode): Texts to print. Each argument is rendered as paragraph.
-    **kwargs: 'title' becomes coloured headline. exits=True performs sys exit.
-    """
-    exits = kwargs.get("exits", None)
-    title = kwargs.get("title", None)
-    title = "\033[93m{}\033[0m\n".format(_wrap(title)) if title else ""
-    message = "\n\n".join([_wrap(text) for text in texts])
-    print("\n{}{}\n".format(title, message))
-    if exits is not None:
-        sys.exit(exits)
-
-
-def _wrap(text, wrap_max=80, indent=4):
-    """Wrap text at given width using textwrap module.
-
-    text (unicode): Text to wrap. If it's a Path, it's converted to string.
-    wrap_max (int): Maximum line length (indent is deducted).
-    indent (int): Number of spaces for indentation.
-    RETURNS (unicode): Wrapped text.
-    """
-    indent = indent * " "
-    wrap_width = wrap_max - len(indent)
-    if isinstance(text, Path):
-        text = path2str(text)
-    return textwrap.fill(
-        text,
-        width=wrap_width,
-        initial_indent=indent,
-        subsequent_indent=indent,
-        break_long_words=False,
-        break_on_hyphens=False,
-    )
-
-
 def minify_html(html):
     """Perform a template-specific, rudimentary HTML minification for displaCy.
     Disclaimer: NOT a general-purpose solution, only removes indentation and
@@ -320,37 +320,6 @@ p
         +cell dict
         +cell Combined tokenizer exceptions.
 
-+h(3, "util.prints") util.prints
-    +tag function
-    +tag-new(2)
-
-p
-    | Print a formatted, text-wrapped message with optional title. If a text
-    | argument is a #[code Path], it's converted to a string. Should only
-    | be used for interactive components like the command-line interface.
-
-+aside-code("Example").
-    data_path = Path('/some/path')
-    if not path.exists():
-        util.prints("Can't find the path.", data_path,
-                    title="Error", exits=1)
-
-+table(["Name", "Type", "Description"])
-    +row
-        +cell #[code *texts]
-        +cell unicode
-        +cell Texts to print. Each argument is rendered as paragraph.
-
-    +row
-        +cell #[code **kwargs]
-        +cell -
-        +cell
-            | #[code title] is rendered as coloured headline. #[code exits]
-            | performs system exit after printing, using the value of the
-            | argument as the exit code, e.g. #[code exits=1].
-
 +h(3, "util.minibatch") util.minibatch
     +tag function
     +tag-new(2)
@@ -257,10 +257,19 @@ p
     | to allow packaging the model using the
     | #[+api("cli#package") #[code package]] command.
 
++infobox("Changed in v2.1", "⚠️")
+    | As of spaCy 2.1, the #[code --no-tagger], #[code --no-parser] and
+    | #[code --no-entities] flags have been replaced by a #[code --pipeline]
+    | option, which lets you define comma-separated names of pipeline
+    | components to train. For example, #[code --pipeline tagger,parser] will
+    | only train the tagger and parser.
+
 +code(false, "bash", "$", false, false, true).
-    python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter]
-    [--n-sents] [--use-gpu] [--meta-path] [--vectors] [--no-tagger] [--no-parser]
-    [--no-entities] [--gold-preproc] [--verbose]
+    python -m spacy train [lang] [output_path] [train_path] [dev_path]
+    [--base-model] [--pipeline] [--vectors] [--n-iter] [--n-examples] [--use-gpu]
+    [--version] [--meta-path] [--init-tok2vec] [--parser-multitasks]
+    [--entity-multitasks] [--gold-preproc] [--noise-level] [--learn-tokens]
+    [--verbose]
 
 +table(["Argument", "Type", "Description"])
     +row
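For completeness, a sketch of invoking the updated command with the new options from Python; the paths, language and pipeline components below are placeholders:

```python
# Sketch: calling the updated train CLI from Python. Paths, language and
# pipeline components are placeholders.
import subprocess

subprocess.run([
    "python", "-m", "spacy", "train",
    "en", "/tmp/model", "train.json", "dev.json",
    # --pipeline replaces the removed --no-tagger/--no-parser/--no-entities flags
    "--pipeline", "tagger,parser",
    "--n-iter", "10",
])
```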
@@ -269,34 +278,34 @@ p
         +cell Model language.
 
     +row
-        +cell #[code output_dir]
+        +cell #[code output_path]
         +cell positional
-        +cell Directory to store model in.
+        +cell Directory to store model in. Will be created if it doesn't exist.
 
     +row
-        +cell #[code train_data]
+        +cell #[code train_path]
         +cell positional
         +cell Location of JSON-formatted training data.
 
     +row
-        +cell #[code dev_data]
+        +cell #[code dev_path]
         +cell positional
         +cell Location of JSON-formatted development data for evaluation.
 
     +row
-        +cell #[code --n-iter], #[code -n]
+        +cell #[code --base-model], #[code -b]
         +cell option
-        +cell Number of iterations (default: #[code 30]).
+        +cell
+            | Optional name of base model to update. Can be any loadable
+            | spaCy model.
 
     +row
-        +cell #[code --n-sents], #[code -ns]
+        +cell #[code --pipeline], #[code -p]
+        +tag-new("2.1.0")
         +cell option
-        +cell Number of sentences (default: #[code 0]).
-
-    +row
-        +cell #[code --use-gpu], #[code -g]
-        +cell option
-        +cell Use GPU.
+        +cell
+            | Comma-separated names of pipeline components to train. Defaults
+            | to #[code 'tagger,parser,ner'].
 
     +row
         +cell #[code --vectors], #[code -v]
|
||||||
+cell Model to load vectors from.
|
+cell Model to load vectors from.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code --meta-path], #[code -m]
|
+cell #[code --n-iter], #[code -n]
|
||||||
|
+cell option
|
||||||
|
+cell Number of iterations (default: #[code 30]).
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code --n-examples], #[code -ns]
|
||||||
|
+cell option
|
||||||
|
+cell Number of examples to use (defaults to #[code 0] for all examples).
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code --use-gpu], #[code -g]
|
||||||
+cell option
|
+cell option
|
||||||
+cell
|
+cell
|
||||||
| #[+tag-new(2)] Optional path to model
|
| Whether to use GPU. Can be either #[code 0], #[code 1] or
|
||||||
| #[+a("/usage/training#models-generating") #[code meta.json]].
|
| #[code -1].
|
||||||
| All relevant properties like #[code lang], #[code pipeline] and
|
|
||||||
| #[code spacy_version] will be overwritten.
|
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code --version], #[code -V]
|
+cell #[code --version], #[code -V]
|
||||||
|
@ -320,40 +337,69 @@ p
|
||||||
| #[code meta.json] after training.
|
| #[code meta.json] after training.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code --no-tagger], #[code -T]
|
+cell #[code --meta-path], #[code -m]
|
||||||
+cell flag
|
+tag-new(2)
|
||||||
+cell Don't train tagger.
|
+cell option
|
||||||
|
+cell
|
||||||
|
| Optional path to model
|
||||||
|
| #[+a("/usage/training#models-generating") #[code meta.json]].
|
||||||
|
| All relevant properties like #[code lang], #[code pipeline] and
|
||||||
|
| #[code spacy_version] will be overwritten.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code --no-parser], #[code -P]
|
+cell #[code --init-tok2vec], #[code -t2v]
|
||||||
+cell flag
|
+tag-new("2.1.0")
|
||||||
+cell Don't train parser.
|
+cell option
|
||||||
|
+cell
|
||||||
|
| Path to pretrained weights for the token-to-vector parts of the
|
||||||
|
| models. See #[code spacy pretrain]. Experimental.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code --no-entities], #[code -N]
|
+cell #[code --parser-multitasks], #[code -pt]
|
||||||
+cell flag
|
+cell option
|
||||||
+cell Don't train NER.
|
+cell
|
||||||
|
| Side objectives for parser CNN, e.g. #[code 'dep'] or
|
||||||
|
| #[code 'dep,tag']
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code --entity-multitasks], #[code -et]
|
||||||
|
+cell option
|
||||||
|
+cell
|
||||||
|
| Side objectives for NER CNN, e.g. #[code 'dep'] or
|
||||||
|
| #[code 'dep,tag']
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code --noise-level], #[code -nl]
|
||||||
|
+cell option
|
||||||
|
+cell Float indicating the amount of corruption for data agumentation.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code --gold-preproc], #[code -G]
|
+cell #[code --gold-preproc], #[code -G]
|
||||||
+cell flag
|
+cell flag
|
||||||
+cell Use gold preprocessing.
|
+cell Use gold preprocessing.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code --learn-tokens], #[code -T]
|
||||||
|
+cell flag
|
||||||
|
+cell
|
||||||
|
| Make parser learn gold-standard tokenization by merging
|
||||||
|
] subtokens. Typically used for languages like Chinese.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code --verbose], #[code -VV]
|
||||||
|
+tag-new("2.0.13")
|
||||||
|
+cell flag
|
||||||
|
+cell Show more detailed messages during training.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code --help], #[code -h]
|
+cell #[code --help], #[code -h]
|
||||||
+cell flag
|
+cell flag
|
||||||
+cell Show help message and available arguments.
|
+cell Show help message and available arguments.
|
||||||
|
|
||||||
+row
|
|
||||||
+cell #[code --verbose]
|
|
||||||
+tag-new("2.0.13")
|
|
||||||
+cell flag
|
|
||||||
+cell Show more detail message during training.
|
|
||||||
|
|
||||||
+row("foot")
|
+row("foot")
|
||||||
+cell creates
|
+cell creates
|
||||||
+cell model, pickle
|
+cell model, pickle
|
||||||
+cell A spaCy model on each epoch, and a final #[code .pickle] file.
|
+cell A spaCy model on each epoch.
|
||||||
|
|
||||||
+h(4, "train-hyperparams") Environment variables for hyperparameters
|
+h(4, "train-hyperparams") Environment variables for hyperparameters
|
||||||
+tag-new(2)
|
+tag-new(2)
|
||||||
|