💫 New JSON helpers, training data internals & CLI rewrite (#2932)

* Support nowrap setting in util.prints

* Tidy up and fix whitespace

* Simplify script and use read_jsonl helper

* Add JSON schemas (see #2928)
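
  The schemas are consumed via the new `jsonschema` dependency (added to the requirements below). A generic, hypothetical sketch of what validating training data against such a schema can look like with that library; the schema here is illustrative, not spaCy's actual one:

  ```python
  # Illustrative only: a toy schema in the spirit of the JSON training format.
  # spaCy's real schemas are defined in the package and may differ.
  from jsonschema import Draft4Validator

  schema = {
      "type": "array",
      "items": {
          "type": "object",
          "required": ["id", "paragraphs"],
          "properties": {
              "id": {"type": "integer"},
              "paragraphs": {"type": "array"},
          },
      },
  }

  data = [{"id": 0, "paragraphs": [{"sentences": []}]}]
  errors = [err.message for err in Draft4Validator(schema).iter_errors(data)]
  print(errors or "valid")
  ```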

* Deprecate Doc.print_tree

Will be replaced with Doc.to_json, which will produce a unified format

* Add Doc.to_json() method (see #2928)

Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space.
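
  A minimal usage sketch; the keyword argument for selecting custom attributes is assumed here (see #2928 and the API docs for the final signature):

  ```python
  import spacy
  from spacy.tokens import Doc

  # Register a custom extension attribute in the doc._. space
  Doc.set_extension("my_attr", default=None)

  nlp = spacy.blank("en")
  doc = nlp("Hello world")
  doc._.my_attr = "some value"

  # Serialize to the unified JSON format; "underscore" (assumed parameter name)
  # selects which doc._. attributes to include in the output.
  data = doc.to_json(underscore=["my_attr"])
  print(data["text"], data.get("_"))
  ```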

* Remove outdated test

* Add write_json and write_jsonl helpers
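
  A short sketch of how the helpers fit together, based on how `spacy convert` and `debug-data` use them in this PR (file names are placeholders; exact signatures may differ slightly):

  ```python
  from spacy.util import read_jsonl, write_json, write_jsonl

  # Read newline-delimited JSON (one record per line)
  records = list(read_jsonl("training_data.jsonl"))

  # Write back out, either as JSONL or as a single JSON file
  write_jsonl("filtered.jsonl", [r for r in records if r.get("text")])
  write_json("combined.json", records)
  ```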

* WIP: Update spacy train

* Tidy up spacy train

* WIP: Use wasabi for formatting

* Add GoldParse helpers for JSON format

* WIP: add debug-data command
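
  For reference, the command added in `spacy/cli/debug_data.py` (see the new file below) takes the language plus JSON-formatted training and development data; a hedged example invocation with placeholder file names:

  ```bash
  python -m spacy debug-data en train.json dev.json --pipeline tagger,parser,ner --verbose
  ```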

* Fix typo

* Add missing import

* Update wasabi pin

* Add missing import

* 💫 Refactor CLI (#2943)

To be merged into #2932.

## Description
- [x] refactor the CLI to use [`wasabi`](https://github.com/ines/wasabi)
- [x] use [`black`](https://github.com/ambv/black) for auto-formatting
- [x] add `flake8` config
- [x] move all messy UD-related scripts to `cli.ud`
- [x] make converters plain functions that take the opened file's contents and return the converted data, instead of having them handle the IO (see the sketch below)
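
A sketch of the new converter contract (file name and keyword values are placeholders, not the exact internal API): the CLI reads the file and hands the raw contents to the converter, which returns a list of document dicts in the JSON training format:

```python
from pathlib import Path
from spacy.cli.converters import conll_ner2json

# The CLI owns the IO; the converter only transforms data.
input_data = Path("data.conll").read_text(encoding="utf-8")
docs = conll_ner2json(input_data, n_sents=10)
print(len(docs), "converted docs")
```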

### Types of change
enhancement

## Checklist
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

* Update wasabi pin

* Delete old test

* Update errors

* Fix typo

* Tidy up and format remaining code

* Fix formatting

* Improve formatting of messages

* Auto-format remaining code

* Add tok2vec stuff to spacy.train

* Fix typo

* Update wasabi pin

* Fix path checks for when train() is called as function

* Reformat and tidy up pretrain script

* Update argument annotations

* Raise error if model language doesn't match lang

* Document new train command
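
A hedged example of the documented invocation (positional arguments: language, output directory, training and development data; check `python -m spacy train --help` for the exact options in your version):

```bash
python -m spacy train en /output train.json dev.json --n-iter 10
```
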
Authored by Ines Montani on 2018-11-30 20:16:14 +01:00, committed by Matthew Honnibal
Parent: 0369db75c1
Commit: 37c7c85a86
46 changed files with 2476 additions and 1539 deletions


@ -11,6 +11,8 @@ ujson>=1.35
dill>=0.2,<0.3 dill>=0.2,<0.3
regex==2018.01.10 regex==2018.01.10
requests>=2.13.0,<3.0.0 requests>=2.13.0,<3.0.0
jsonschema>=2.6.0,<3.0.0
wasabi>=0.0.8,<1.1.0
pathlib==1.0.1; python_version < "3.4" pathlib==1.0.1; python_version < "3.4"
# Development dependencies # Development dependencies
pytest>=4.0.0,<5.0.0 pytest>=4.0.0,<5.0.0


@@ -207,6 +207,8 @@ def setup_package():
            "regex==2018.01.10",
            "dill>=0.2,<0.3",
            "requests>=2.13.0,<3.0.0",
+            "jsonschema>=2.6.0,<3.0.0",
+            "wasabi>=0.0.8,<1.1.0",
            'pathlib==1.0.1; python_version < "3.4"',
        ],
        setup_requires=["wheel"],


@@ -1,40 +1,41 @@
 # coding: utf8
 from __future__ import print_function
 # NB! This breaks in plac on Python 2!!
 # from __future__ import unicode_literals

-if __name__ == '__main__':
+if __name__ == "__main__":
     import plac
     import sys
+    from wasabi import Printer
     from spacy.cli import download, link, info, package, train, pretrain, convert
-    from spacy.cli import vocab, init_model, profile, evaluate, validate
-    from spacy.cli import ud_train, ud_evaluate
-    from spacy.util import prints
+    from spacy.cli import init_model, profile, evaluate, validate
+    from spacy.cli import ud_train, ud_evaluate, debug_data

+    msg = Printer()
     commands = {
-        'download': download,
-        'link': link,
-        'info': info,
-        'train': train,
-        'pretrain': pretrain,
-        'ud-train': ud_train,
-        'evaluate': evaluate,
-        'ud-evaluate': ud_evaluate,
-        'convert': convert,
-        'package': package,
-        'vocab': vocab,
-        'init-model': init_model,
-        'profile': profile,
-        'validate': validate
+        "download": download,
+        "link": link,
+        "info": info,
+        "train": train,
+        "pretrain": pretrain,
+        "debug-data": debug_data,
+        "ud-train": ud_train,
+        "evaluate": evaluate,
+        "ud-evaluate": ud_evaluate,
+        "convert": convert,
+        "package": package,
+        "init-model": init_model,
+        "profile": profile,
+        "validate": validate,
     }
     if len(sys.argv) == 1:
-        prints(', '.join(commands), title="Available commands", exits=1)
+        msg.info("Available commands", ", ".join(commands), exits=1)
     command = sys.argv.pop(1)
-    sys.argv[0] = 'spacy %s' % command
+    sys.argv[0] = "spacy %s" % command
     if command in commands:
         plac.call(commands[command], sys.argv[1:])
     else:
-        prints(
-            "Available: %s" % ', '.join(commands),
-            title="Unknown command: %s" % command,
-            exits=1)
+        available = "Available: {}".format(", ".join(commands))
+        msg.fail("Unknown command: {}".format(command), available, exits=1)


@@ -1,14 +1,13 @@
-from .download import download
-from .info import info
-from .link import link
-from .package import package
-from .profile import profile
-from .train import train
-from .pretrain import pretrain
-from .evaluate import evaluate
-from .convert import convert
-from .vocab import make_vocab as vocab
-from .init_model import init_model
-from .validate import validate
-from .ud_train import main as ud_train
-from .conll17_ud_eval import main as ud_evaluate
+from .download import download  # noqa: F401
+from .info import info  # noqa: F401
+from .link import link  # noqa: F401
+from .package import package  # noqa: F401
+from .profile import profile  # noqa: F401
+from .train import train  # noqa: F401
+from .pretrain import pretrain  # noqa: F401
+from .debug_data import debug_data  # noqa: F401
+from .evaluate import evaluate  # noqa: F401
+from .convert import convert  # noqa: F401
+from .init_model import init_model  # noqa: F401
+from .validate import validate  # noqa: F401
+from .ud import ud_train, ud_evaluate  # noqa: F401


@@ -2,6 +2,8 @@
 from __future__ import unicode_literals

+# fmt: off
+
 class Messages(object):
     M001 = ("Download successful but linking failed")
     M002 = ("Creating a shortcut link for 'en' didn't work (maybe you "
@@ -73,3 +75,31 @@ class Messages(object):
     M052 = ("Not a valid meta.json format")
     M053 = ("Expected dict but got: {meta_type}")
     M054 = ("No --lang specified, but tokenization required.")
+    M055 = ("Training pipeline: {pipeline}")
+    M056 = ("Starting with base model '{model}'")
+    M057 = ("Starting with blank model '{model}'")
+    M058 = ("Loading vector from model '{model}'")
+    M059 = ("Can't use multitask objective without '{pipe}' in the pipeline")
+    M060 = ("Counting training words (limit={limit})")
+    M061 = ("\nSaving model...")
+    M062 = ("Output directory is not empty.")
+    M063 = ("Incompatible arguments")
+    M064 = ("The -f and -c arguments are deprecated, and not compatible with "
+            "the -j argument, which should specify the same information. "
+            "Either merge the frequencies and clusters data into the "
+            "JSONL-formatted file (recommended), or use only the -f and -c "
+            "files, without the other lexical attributes.")
+    M065 = ("This can lead to unintended side effects when saving the model. "
+            "Please use an empty directory or a different path instead. If "
+            "the specified output path doesn't exist, the directory will be "
+            "created for you.")
+    M066 = ("Saved model to output directory")
+    M067 = ("Can't find lexical data")
+    M068 = ("Sucessfully compiled vocab and vectors, and saved model")
+    M069 = ("Unknown file type: '{name}'")
+    M070 = ("Supported file types: '{options}'")
+    M071 = ("Loaded pretrained tok2vec for: {components}")
+    M072 = ("Model language ('{model_lang}') doesn't match language specified "
+            "as `lang` argument ('{lang}') ")
+# fmt: on


@ -3,49 +3,91 @@ from __future__ import unicode_literals
import plac import plac
from pathlib import Path from pathlib import Path
from wasabi import Printer
from ..util import write_jsonl, write_json
from ..compat import json_dumps, path2str
from .converters import conllu2json, conllubio2json, iob2json, conll_ner2json from .converters import conllu2json, conllubio2json, iob2json, conll_ner2json
from .converters import ner_jsonl2json from .converters import ner_jsonl2json
from ._messages import Messages from ._messages import Messages
from ..util import prints
# Converters are matched by file extension. To add a converter, add a new # Converters are matched by file extension. To add a converter, add a new
# entry to this dict with the file extension mapped to the converter function # entry to this dict with the file extension mapped to the converter function
# imported from /converters. # imported from /converters.
CONVERTERS = { CONVERTERS = {
'conllubio': conllubio2json, "conllubio": conllubio2json,
'conllu': conllu2json, "conllu": conllu2json,
'conll': conllu2json, "conll": conllu2json,
'ner': conll_ner2json, "ner": conll_ner2json,
'iob': iob2json, "iob": iob2json,
'jsonl': ner_jsonl2json "jsonl": ner_jsonl2json,
} }
# File types
FILE_TYPES = ("json", "jsonl")
@plac.annotations( @plac.annotations(
input_file=("input file", "positional", None, str), input_file=("Input file", "positional", None, str),
output_dir=("output directory for converted file", "positional", None, str), output_dir=("Output directory for converted file", "positional", None, str),
file_type=("Type of data to produce: 'jsonl' or 'json'", "option", "t", str),
n_sents=("Number of sentences per doc", "option", "n", int), n_sents=("Number of sentences per doc", "option", "n", int),
converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str), converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
lang=("Language (if tokenizer required)", "option", "l", str), lang=("Language (if tokenizer required)", "option", "l", str),
morphology=("Enable appending morphology to tags", "flag", "m", bool)) morphology=("Enable appending morphology to tags", "flag", "m", bool),
def convert(input_file, output_dir, n_sents=1, morphology=False, converter='auto', )
lang=None): def convert(
input_file,
output_dir="-",
file_type="jsonl",
n_sents=1,
morphology=False,
converter="auto",
lang=None,
):
""" """
Convert files into JSON format for use with train command and other Convert files into JSON format for use with train command and other
experiment management functions. experiment management functions. If no output_dir is specified, the data
is written to stdout, so you can pipe them forward to a JSONL file:
$ spacy convert some_file.conllu > some_file.jsonl
""" """
msg = Printer()
input_path = Path(input_file) input_path = Path(input_file)
output_path = Path(output_dir) if file_type not in FILE_TYPES:
msg.fail(
Messages.M069.format(name=file_type),
Messages.M070.format(options=", ".join(FILE_TYPES)),
exits=1,
)
if not input_path.exists(): if not input_path.exists():
prints(input_path, title=Messages.M028, exits=1) msg.fail(Messages.M028, input_path, exits=1)
if not output_path.exists(): if output_dir != "-" and not Path(output_dir).exists():
prints(output_path, title=Messages.M029, exits=1) msg.fail(Messages.M029, output_dir, exits=1)
if converter == 'auto': if converter == "auto":
converter = input_path.suffix[1:] converter = input_path.suffix[1:]
if converter not in CONVERTERS: if converter not in CONVERTERS:
prints(Messages.M031.format(converter=converter), msg.fail(Messages.M030, Messages.M031.format(converter=converter), exits=1)
title=Messages.M030, exits=1) # Use converter function to convert data
func = CONVERTERS[converter] func = CONVERTERS[converter]
func(input_path, output_path, input_data = input_path.open("r", encoding="utf-8").read()
n_sents=n_sents, use_morphology=morphology, lang=lang) data = func(input_data, nsents=n_sents, use_morphology=morphology, lang=lang)
if output_dir != "-":
# Export data to a file
suffix = ".{}".format(file_type)
output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
if file_type == "json":
write_json(output_file, data)
elif file_type == "jsonl":
write_jsonl(output_file, data)
msg.good(
Messages.M032.format(name=path2str(output_file)),
Messages.M033.format(n_docs=len(data)),
)
else:
# Print to stdout
if file_type == "json":
print(json_dumps(data))
elif file_type == "jsonl":
for line in data:
print(json_dumps(line))


@@ -1,5 +1,5 @@
-from .conllu2json import conllu2json
-from .conllubio2json import conllubio2json
-from .iob2json import iob2json
-from .conll_ner2json import conll_ner2json
-from .jsonl2json import ner_jsonl2json
+from .conllu2json import conllu2json  # noqa: F401
+from .conllubio2json import conllubio2json  # noqa: F401
+from .iob2json import iob2json  # noqa: F401
+from .conll_ner2json import conll_ner2json  # noqa: F401
+from .jsonl2json import ner_jsonl2json  # noqa: F401


@ -1,52 +1,38 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from .._messages import Messages
from ...compat import json_dumps, path2str
from ...util import prints
from ...gold import iob_to_biluo from ...gold import iob_to_biluo
def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None): def conll_ner2json(input_data, **kwargs):
""" """
Convert files in the CoNLL-2003 NER format into JSON format for use with Convert files in the CoNLL-2003 NER format into JSON format for use with
train cli. train cli.
""" """
docs = read_conll_ner(input_path) delimit_docs = "-DOCSTART- -X- O O"
output_filename = input_path.parts[-1].replace(".conll", "") + ".json"
output_filename = input_path.parts[-1].replace(".conll", "") + ".json"
output_file = output_path / output_filename
with output_file.open('w', encoding='utf-8') as f:
f.write(json_dumps(docs))
prints(Messages.M033.format(n_docs=len(docs)),
title=Messages.M032.format(name=path2str(output_file)))
def read_conll_ner(input_path):
text = input_path.open('r', encoding='utf-8').read()
i = 0
delimit_docs = '-DOCSTART- -X- O O'
output_docs = [] output_docs = []
for doc in text.strip().split(delimit_docs): for doc in input_data.strip().split(delimit_docs):
doc = doc.strip() doc = doc.strip()
if not doc: if not doc:
continue continue
output_doc = [] output_doc = []
for sent in doc.split('\n\n'): for sent in doc.split("\n\n"):
sent = sent.strip() sent = sent.strip()
if not sent: if not sent:
continue continue
lines = [line.strip() for line in sent.split('\n') if line.strip()] lines = [line.strip() for line in sent.split("\n") if line.strip()]
words, tags, chunks, iob_ents = zip(*[line.split() for line in lines]) words, tags, chunks, iob_ents = zip(*[line.split() for line in lines])
biluo_ents = iob_to_biluo(iob_ents) biluo_ents = iob_to_biluo(iob_ents)
output_doc.append({'tokens': [ output_doc.append(
{'orth': w, 'tag': tag, 'ner': ent} for (w, tag, ent) in {
zip(words, tags, biluo_ents) "tokens": [
]}) {"orth": w, "tag": tag, "ner": ent}
output_docs.append({ for (w, tag, ent) in zip(words, tags, biluo_ents)
'id': len(output_docs), ]
'paragraphs': [{'sentences': output_doc}] }
}) )
output_docs.append(
{"id": len(output_docs), "paragraphs": [{"sentences": output_doc}]}
)
output_doc = [] output_doc = []
return output_docs return output_docs


@ -1,34 +1,27 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from .._messages import Messages
from ...compat import json_dumps, path2str
from ...util import prints
from ...gold import iob_to_biluo
import re import re
from ...gold import iob_to_biluo
def conllu2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None):
def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None):
""" """
Convert conllu files into JSON format for use with train cli. Convert conllu files into JSON format for use with train cli.
use_morphology parameter enables appending morphology to tags, which is use_morphology parameter enables appending morphology to tags, which is
useful for languages such as Spanish, where UD tags are not so rich. useful for languages such as Spanish, where UD tags are not so rich.
"""
# by @dvsrepo, via #11 explosion/spacy-dev-resources
"""
Extract NER tags if available and convert them so that they follow Extract NER tags if available and convert them so that they follow
BILUO and the Wikipedia scheme BILUO and the Wikipedia scheme
""" """
# by @dvsrepo, via #11 explosion/spacy-dev-resources
# by @katarkor # by @katarkor
docs = [] docs = []
sentences = [] sentences = []
conll_tuples = read_conllx(input_path, use_morphology=use_morphology) conll_tuples = read_conllx(input_data, use_morphology=use_morphology)
checked_for_ner = False checked_for_ner = False
has_ner_tags = False has_ner_tags = False
for i, (raw_text, tokens) in enumerate(conll_tuples): for i, (raw_text, tokens) in enumerate(conll_tuples):
sentence, brackets = tokens[0] sentence, brackets = tokens[0]
if not checked_for_ner: if not checked_for_ner:
@ -37,29 +30,19 @@ def conllu2json(input_path, output_path, n_sents=10, use_morphology=False, lang=
sentences.append(generate_sentence(sentence, has_ner_tags)) sentences.append(generate_sentence(sentence, has_ner_tags))
# Real-sized documents could be extracted using the comments on the # Real-sized documents could be extracted using the comments on the
# conluu document # conluu document
if len(sentences) % n_sents == 0:
if(len(sentences) % n_sents == 0):
doc = create_doc(sentences, i) doc = create_doc(sentences, i)
docs.append(doc) docs.append(doc)
sentences = [] sentences = []
return docs
output_filename = input_path.parts[-1].replace(".conll", ".json")
output_filename = input_path.parts[-1].replace(".conllu", ".json")
output_file = output_path / output_filename
with output_file.open('w', encoding='utf-8') as f:
f.write(json_dumps(docs))
prints(Messages.M033.format(n_docs=len(docs)),
title=Messages.M032.format(name=path2str(output_file)))
def is_ner(tag): def is_ner(tag):
""" """
Check the 10th column of the first token to determine if the file contains Check the 10th column of the first token to determine if the file contains
NER tags NER tags
""" """
tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
tag_match = re.match('([A-Z_]+)-([A-Z_]+)', tag)
if tag_match: if tag_match:
return True return True
elif tag == "O": elif tag == "O":
@ -67,29 +50,29 @@ def is_ner(tag):
else: else:
return False return False
def read_conllx(input_path, use_morphology=False, n=0):
text = input_path.open('r', encoding='utf-8').read() def read_conllx(input_data, use_morphology=False, n=0):
i = 0 i = 0
for sent in text.strip().split('\n\n'): for sent in input_data.strip().split("\n\n"):
lines = sent.strip().split('\n') lines = sent.strip().split("\n")
if lines: if lines:
while lines[0].startswith('#'): while lines[0].startswith("#"):
lines.pop(0) lines.pop(0)
tokens = [] tokens = []
for line in lines: for line in lines:
parts = line.split('\t') parts = line.split("\t")
id_, word, lemma, pos, tag, morph, head, dep, _1, iob = parts id_, word, lemma, pos, tag, morph, head, dep, _1, iob = parts
if '-' in id_ or '.' in id_: if "-" in id_ or "." in id_:
continue continue
try: try:
id_ = int(id_) - 1 id_ = int(id_) - 1
head = (int(head) - 1) if head != '0' else id_ head = (int(head) - 1) if head != "0" else id_
dep = 'ROOT' if dep == 'root' else dep dep = "ROOT" if dep == "root" else dep
tag = pos if tag == '_' else tag tag = pos if tag == "_" else tag
tag = tag+'__'+morph if use_morphology else tag tag = tag + "__" + morph if use_morphology else tag
tokens.append((id_, word, tag, head, dep, iob)) tokens.append((id_, word, tag, head, dep, iob))
except: except: # noqa: E722
print(line) print(line)
raise raise
tuples = [list(t) for t in zip(*tokens)] tuples = [list(t) for t in zip(*tokens)]
@ -98,31 +81,31 @@ def read_conllx(input_path, use_morphology=False, n=0):
if n >= 1 and i >= n: if n >= 1 and i >= n:
break break
def simplify_tags(iob):
def simplify_tags(iob):
""" """
Simplify tags obtained from the dataset in order to follow Wikipedia Simplify tags obtained from the dataset in order to follow Wikipedia
scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while
'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags to 'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags to
'MISC'. 'MISC'.
""" """
new_iob = [] new_iob = []
for tag in iob: for tag in iob:
tag_match = re.match('([A-Z_]+)-([A-Z_]+)', tag) tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
if tag_match: if tag_match:
prefix = tag_match.group(1) prefix = tag_match.group(1)
suffix = tag_match.group(2) suffix = tag_match.group(2)
if suffix == 'GPE_LOC': if suffix == "GPE_LOC":
suffix = 'LOC' suffix = "LOC"
elif suffix == 'GPE_ORG': elif suffix == "GPE_ORG":
suffix = 'ORG' suffix = "ORG"
elif suffix != 'PER' and suffix != 'LOC' and suffix != 'ORG': elif suffix != "PER" and suffix != "LOC" and suffix != "ORG":
suffix = 'MISC' suffix = "MISC"
tag = prefix + '-' + suffix tag = prefix + "-" + suffix
new_iob.append(tag) new_iob.append(tag)
return new_iob return new_iob
def generate_sentence(sent, has_ner_tags): def generate_sentence(sent, has_ner_tags):
(id_, word, tag, head, dep, iob) = sent (id_, word, tag, head, dep, iob) = sent
sentence = {} sentence = {}
@ -144,7 +127,7 @@ def generate_sentence(sent, has_ner_tags):
return sentence return sentence
def create_doc(sentences,id): def create_doc(sentences, id):
doc = {} doc = {}
paragraph = {} paragraph = {}
doc["id"] = id doc["id"] = id


@ -1,65 +1,54 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from ...compat import json_dumps, path2str
from ...util import prints
from ...gold import iob_to_biluo from ...gold import iob_to_biluo
def conllubio2json(input_path, output_path, n_sents=10, use_morphology=False, lang=None):
def conllubio2json(input_data, n_sents=10, use_morphology=False, lang=None):
""" """
Convert conllu files into JSON format for use with train cli. Convert conllu files into JSON format for use with train cli.
use_morphology parameter enables appending morphology to tags, which is use_morphology parameter enables appending morphology to tags, which is
useful for languages such as Spanish, where UD tags are not so rich. useful for languages such as Spanish, where UD tags are not so rich.
""" """
# by @dvsrepo, via #11 explosion/spacy-dev-resources # by @dvsrepo, via #11 explosion/spacy-dev-resources
docs = [] docs = []
sentences = [] sentences = []
conll_tuples = read_conllx(input_path, use_morphology=use_morphology) conll_tuples = read_conllx(input_data, use_morphology=use_morphology)
for i, (raw_text, tokens) in enumerate(conll_tuples): for i, (raw_text, tokens) in enumerate(conll_tuples):
sentence, brackets = tokens[0] sentence, brackets = tokens[0]
sentences.append(generate_sentence(sentence)) sentences.append(generate_sentence(sentence))
# Real-sized documents could be extracted using the comments on the # Real-sized documents could be extracted using the comments on the
# conluu document # conluu document
if(len(sentences) % n_sents == 0): if len(sentences) % n_sents == 0:
doc = create_doc(sentences, i) doc = create_doc(sentences, i)
docs.append(doc) docs.append(doc)
sentences = [] sentences = []
return docs
output_filename = input_path.parts[-1].replace(".conll", ".json")
output_filename = input_path.parts[-1].replace(".conllu", ".json")
output_file = output_path / output_filename
with output_file.open('w', encoding='utf-8') as f:
f.write(json_dumps(docs))
prints("Created %d documents" % len(docs),
title="Generated output file %s" % path2str(output_file))
def read_conllx(input_path, use_morphology=False, n=0): def read_conllx(input_data, use_morphology=False, n=0):
text = input_path.open('r', encoding='utf-8').read()
i = 0 i = 0
for sent in text.strip().split('\n\n'): for sent in input_data.strip().split("\n\n"):
lines = sent.strip().split('\n') lines = sent.strip().split("\n")
if lines: if lines:
while lines[0].startswith('#'): while lines[0].startswith("#"):
lines.pop(0) lines.pop(0)
tokens = [] tokens = []
for line in lines: for line in lines:
parts = line.split('\t') parts = line.split("\t")
id_, word, lemma, pos, tag, morph, head, dep, _1, ner = parts id_, word, lemma, pos, tag, morph, head, dep, _1, ner = parts
if '-' in id_ or '.' in id_: if "-" in id_ or "." in id_:
continue continue
try: try:
id_ = int(id_) - 1 id_ = int(id_) - 1
head = (int(head) - 1) if head != '0' else id_ head = (int(head) - 1) if head != "0" else id_
dep = 'ROOT' if dep == 'root' else dep dep = "ROOT" if dep == "root" else dep
tag = pos if tag == '_' else tag tag = pos if tag == "_" else tag
tag = tag+'__'+morph if use_morphology else tag tag = tag + "__" + morph if use_morphology else tag
ner = ner if ner else 'O' ner = ner if ner else "O"
tokens.append((id_, word, tag, head, dep, ner)) tokens.append((id_, word, tag, head, dep, ner))
except: except: # noqa: E722
print(line) print(line)
raise raise
tuples = [list(t) for t in zip(*tokens)] tuples = [list(t) for t in zip(*tokens)]
@ -68,6 +57,7 @@ def read_conllx(input_path, use_morphology=False, n=0):
if n >= 1 and i >= n: if n >= 1 and i >= n:
break break
def generate_sentence(sent): def generate_sentence(sent):
(id_, word, tag, head, dep, ner) = sent (id_, word, tag, head, dep, ner) = sent
sentence = {} sentence = {}
@ -85,7 +75,7 @@ def generate_sentence(sent):
return sentence return sentence
def create_doc(sentences,id): def create_doc(sentences, id):
doc = {} doc = {}
paragraph = {} paragraph = {}
doc["id"] = id doc["id"] = id


@ -1,26 +1,24 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from cytoolz import partition_all, concat
from .._messages import Messages from cytoolz import partition_all
from ...compat import json_dumps, path2str
from ...util import prints
from ...gold import iob_to_biluo from ...gold import iob_to_biluo
def iob2json(input_path, output_path, n_sents=10, *a, **k): def iob2json(input_data, n_sents=10, *args, **kwargs):
""" """
Convert IOB files into JSON format for use with train cli. Convert IOB files into JSON format for use with train cli.
""" """
with input_path.open('r', encoding='utf8') as file_: docs = []
sentences = read_iob(file_) for group in partition_all(n_sents, docs):
docs = merge_sentences(sentences, n_sents) group = list(group)
output_filename = input_path.parts[-1].replace(".iob", ".json") first = group.pop(0)
output_file = output_path / output_filename to_extend = first["paragraphs"][0]["sentences"]
with output_file.open('w', encoding='utf-8') as f: for sent in group[1:]:
f.write(json_dumps(docs)) to_extend.extend(sent["paragraphs"][0]["sentences"])
prints(Messages.M033.format(n_docs=len(docs)), docs.append(first)
title=Messages.M032.format(name=path2str(output_file))) return docs
def read_iob(raw_sents): def read_iob(raw_sents):
@ -28,30 +26,20 @@ def read_iob(raw_sents):
for line in raw_sents: for line in raw_sents:
if not line.strip(): if not line.strip():
continue continue
tokens = [t.split('|') for t in line.split()] tokens = [t.split("|") for t in line.split()]
if len(tokens[0]) == 3: if len(tokens[0]) == 3:
words, pos, iob = zip(*tokens) words, pos, iob = zip(*tokens)
else: else:
words, iob = zip(*tokens) words, iob = zip(*tokens)
pos = ['-'] * len(words) pos = ["-"] * len(words)
biluo = iob_to_biluo(iob) biluo = iob_to_biluo(iob)
sentences.append([ sentences.append(
{'orth': w, 'tag': p, 'ner': ent} [
{"orth": w, "tag": p, "ner": ent}
for (w, p, ent) in zip(words, pos, biluo) for (w, p, ent) in zip(words, pos, biluo)
]) ]
sentences = [{'tokens': sent} for sent in sentences] )
paragraphs = [{'sentences': [sent]} for sent in sentences] sentences = [{"tokens": sent} for sent in sentences]
docs = [{'id': 0, 'paragraphs': [para]} for para in paragraphs] paragraphs = [{"sentences": [sent]} for sent in sentences]
docs = [{"id": 0, "paragraphs": [para]} for para in paragraphs]
return docs return docs
def merge_sentences(docs, n_sents):
counter = 0
merged = []
for group in partition_all(n_sents, docs):
group = list(group)
first = group.pop(0)
to_extend = first['paragraphs'][0]['sentences']
for sent in group[1:]:
to_extend.extend(sent['paragraphs'][0]['sentences'])
merged.append(first)
return merged


@@ -1,33 +1,21 @@
 # coding: utf8
 from __future__ import unicode_literals

-import ujson as json
+import ujson

+from ...util import get_lang_class
 from .._messages import Messages
-from ...compat import json_dumps, path2str
-from ...util import prints, get_lang_class
-from ...gold import docs_to_json


-def ner_jsonl2json(input_path, output_path, lang=None, n_sents=10, use_morphology=False):
+def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False):
     if lang is None:
-        prints(Messages.M054, exits=True)
+        raise ValueError(Messages.M054)
     json_docs = []
-    input_tuples = list(read_jsonl(input_path))
+    input_tuples = [ujson.loads(line) for line in input_data]
     nlp = get_lang_class(lang)()
     for i, (raw_text, ents) in enumerate(input_tuples):
         doc = nlp.make_doc(raw_text)
         doc[0].is_sent_start = True
-        doc.ents = [doc.char_span(s, e, label=L) for s, e, L in ents['entities']]
-        json_docs.append(docs_to_json(i, [doc]))
-
-    output_filename = input_path.parts[-1].replace(".jsonl", ".json")
-    output_loc = output_path / output_filename
-    with (output_loc).open('w', encoding='utf8') as file_:
-        file_.write(json_dumps(json_docs))
-    prints(Messages.M033.format(n_docs=len(json_docs)),
-           title=Messages.M032.format(name=path2str(output_loc)))
-
-
-def read_jsonl(input_path):
-    with input_path.open('r', encoding='utf8') as file_:
-        for line in file_:
-            yield json.loads(line)
+        doc.ents = [doc.char_span(s, e, label=L) for s, e, L in ents["entities"]]
+        json_docs.append(doc.to_json())
+    return json_docs

spacy/cli/debug_data.py (new file, 398 lines)

@@ -0,0 +1,398 @@
# coding: utf8
from __future__ import unicode_literals, print_function
from pathlib import Path
from collections import Counter
import plac
import sys
from wasabi import Printer, MESSAGES
from ..gold import GoldCorpus, read_json_object
from ..util import load_model, get_lang_class, read_json, read_jsonl
# from .schemas import get_schema, validate_json
from ._messages import Messages
# Minimum number of expected occurrences of label in data to train new label
NEW_LABEL_THRESHOLD = 50
# Minimum number of expected examples to train a blank model
BLANK_MODEL_MIN_THRESHOLD = 100
BLANK_MODEL_THRESHOLD = 2000
@plac.annotations(
lang=("model language", "positional", None, str),
train_path=("location of JSON-formatted training data", "positional", None, Path),
dev_path=("location of JSON-formatted development data", "positional", None, Path),
base_model=("name of model to update (optional)", "option", "b", str),
pipeline=(
"Comma-separated names of pipeline components to train",
"option",
"p",
str,
),
ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool),
ignore_validation=(
"Don't exit if JSON format validation fails",
"flag",
"IV",
bool,
),
verbose=("Print additional information and explanations", "flag", "V", bool),
no_format=("Don't pretty-print the results", "flag", "NF", bool),
)
def debug_data(
lang,
train_path,
dev_path,
base_model=None,
pipeline="tagger,parser,ner",
ignore_warnings=False,
ignore_validation=False,
verbose=False,
no_format=False,
):
msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings)
# Make sure all files and paths exists if they are needed
if not train_path.exists():
msg.fail(Messages.M050, train_path, exits=1)
if not dev_path.exists():
msg.fail(Messages.M051, dev_path, exits=1)
# Initialize the model and pipeline
pipeline = [p.strip() for p in pipeline.split(",")]
if base_model:
nlp = load_model(base_model)
else:
lang_cls = get_lang_class(lang)
nlp = lang_cls()
msg.divider("Data format validation")
# Load the data in one go (might take a while, but okay in this case)
with msg.loading("Loading {}...".format(train_path.parts[-1])):
train_data = _load_file(train_path, msg)
with msg.loading("Loading {}...".format(dev_path.parts[-1])):
dev_data = _load_file(dev_path, msg)
# Validate data format using the JSON schema
# TODO: update once the new format is ready
# schema = get_schema("training")
train_data_errors = [] # TODO: validate_json(train_data, schema)
dev_data_errors = [] # TODO: validate_json(dev_data, schema)
if not train_data_errors:
msg.good("Training data JSON format is valid")
if not dev_data_errors:
msg.good("Development data JSON format is valid")
for error in train_data_errors:
msg.fail("Training data: {}".format(error))
for error in dev_data_errors:
msg.fail("Development data: {}".format(error))
if (train_data_errors or dev_data_errors) and not ignore_validation:
sys.exit(1)
# Create the gold corpus to be able to better analyze data
with msg.loading("Analyzing corpus..."):
train_data = read_json_object(train_data)
dev_data = read_json_object(dev_data)
corpus = GoldCorpus(train_data, dev_data)
train_docs = list(corpus.train_docs(nlp))
dev_docs = list(corpus.dev_docs(nlp))
msg.good("Corpus is loadable")
# Create all gold data here to avoid iterating over the train_docs constantly
gold_data = _compile_gold(train_docs, pipeline)
train_texts = gold_data["texts"]
dev_texts = set([doc.text for doc, gold in dev_docs])
msg.divider("Training stats")
msg.text("Training pipeline: {}".format(", ".join(pipeline)))
for pipe in [p for p in pipeline if p not in nlp.factories]:
msg.fail("Pipeline component '{}' not available in factories".format(pipe))
if base_model:
msg.text("Starting with base model '{}'".format(base_model))
else:
msg.text("Starting with blank model '{}'".format(lang))
msg.text("{} training docs".format(len(train_docs)))
msg.text("{} evaluation docs".format(len(dev_docs)))
overlap = len(train_texts.intersection(dev_texts))
if overlap:
msg.warn("{} training examples also in evaluation data".format(overlap))
else:
msg.good("No overlap between training and evaluation data")
if not base_model and len(train_docs) < BLANK_MODEL_THRESHOLD:
text = "Low number of examples to train from a blank model ({})".format(
len(train_docs)
)
if len(train_docs) < BLANK_MODEL_MIN_THRESHOLD:
msg.fail(text)
else:
msg.warn(text)
msg.text(
"It's recommended to use at least {} examples (minimum {})".format(
BLANK_MODEL_THRESHOLD, BLANK_MODEL_MIN_THRESHOLD
),
show=verbose,
)
msg.divider("Vocab & Vectors")
n_words = gold_data["n_words"]
msg.info(
"{} total {} in the data ({} unique)".format(
n_words, "word" if n_words == 1 else "words", len(gold_data["words"])
)
)
most_common_words = gold_data["words"].most_common(10)
msg.text(
"10 most common words: {}".format(
_format_labels(most_common_words, counts=True)
),
show=verbose,
)
if len(nlp.vocab.vectors):
msg.info(
"{} vectors ({} unique keys, {} dimensions)".format(
len(nlp.vocab.vectors),
nlp.vocab.vectors.n_keys,
nlp.vocab.vectors_length,
)
)
else:
msg.info("No word vectors present in the model")
if "ner" in pipeline:
# Get all unique NER labels present in the data
labels = set(label for label in gold_data["ner"] if label not in ("O", "-"))
label_counts = gold_data["ner"]
model_labels = _get_labels_from_model(nlp, "ner")
new_labels = [l for l in labels if l not in model_labels]
existing_labels = [l for l in labels if l in model_labels]
has_low_data_warning = False
has_no_neg_warning = False
msg.divider("Named Entity Recognition")
msg.info(
"{} new {}, {} existing {}".format(
len(new_labels),
"label" if len(new_labels) == 1 else "labels",
len(existing_labels),
"label" if len(existing_labels) == 1 else "labels",
)
)
missing_values = label_counts["-"]
msg.text(
"{} missing {} (tokens with '-' label)".format(
missing_values, "value" if missing_values == 1 else "values"
)
)
if new_labels:
labels_with_counts = [
(label, count)
for label, count in label_counts.most_common()
if label != "-"
]
labels_with_counts = _format_labels(labels_with_counts, counts=True)
msg.text("New: {}".format(labels_with_counts), show=verbose)
if existing_labels:
msg.text(
"Existing: {}".format(_format_labels(existing_labels)), show=verbose
)
for label in new_labels:
if label_counts[label] <= NEW_LABEL_THRESHOLD:
msg.warn(
"Low number of examples for new label '{}' ({})".format(
label, label_counts[label]
)
)
has_low_data_warning = True
with msg.loading("Analyzing label distribution..."):
neg_docs = _get_examples_without_label(train_docs, label)
if neg_docs == 0:
msg.warn(
"No examples for texts WITHOUT new label '{}'".format(label)
)
has_no_neg_warning = True
if not has_low_data_warning:
msg.good("Good amount of examples for all labels")
if not has_no_neg_warning:
msg.good("Examples without occurrences available for all labels")
if has_low_data_warning:
msg.text(
"To train a new entity type, your data should include at "
"least {} instances of the new label".format(NEW_LABEL_THRESHOLD),
show=verbose,
)
if has_no_neg_warning:
msg.text(
"Training data should always include examples of entities "
"in context, as well as examples without a given entity "
"type.",
show=verbose,
)
if "textcat" in pipeline:
msg.divider("Text Classification")
labels = [label for label in gold_data["textcat"]]
model_labels = _get_labels_from_model(nlp, "textcat")
new_labels = [l for l in labels if l not in model_labels]
existing_labels = [l for l in labels if l in model_labels]
msg.info(
"Text Classification: {} new label(s), {} existing label(s)".format(
len(new_labels), len(existing_labels)
)
)
if new_labels:
labels_with_counts = _format_labels(
gold_data["textcat"].most_common(), counts=True
)
msg.text("New: {}".format(labels_with_counts), show=verbose)
if existing_labels:
msg.text(
"Existing: {}".format(_format_labels(existing_labels)), show=verbose
)
if "tagger" in pipeline:
msg.divider("Part-of-speech Tagging")
labels = [label for label in gold_data["tags"]]
tag_map = nlp.Defaults.tag_map
msg.info(
"{} {} in data ({} {} in tag map)".format(
len(labels),
"label" if len(labels) == 1 else "labels",
len(tag_map),
"label" if len(tag_map) == 1 else "labels",
)
)
labels_with_counts = _format_labels(
gold_data["tags"].most_common(), counts=True
)
msg.text(labels_with_counts, show=verbose)
non_tagmap = [l for l in labels if l not in tag_map]
if not non_tagmap:
msg.good("All labels present in tag map for language '{}'".format(nlp.lang))
for label in non_tagmap:
msg.fail(
"Label '{}' not found in tag map for language '{}'".format(
label, nlp.lang
)
)
if "parser" in pipeline:
msg.divider("Dependency Parsing")
labels = [label for label in gold_data["deps"]]
msg.info(
"{} {} in data".format(
len(labels), "label" if len(labels) == 1 else "labels"
)
)
labels_with_counts = _format_labels(
gold_data["deps"].most_common(), counts=True
)
msg.text(labels_with_counts, show=verbose)
msg.divider("Summary")
good_counts = msg.counts[MESSAGES.GOOD]
warn_counts = msg.counts[MESSAGES.WARN]
fail_counts = msg.counts[MESSAGES.FAIL]
if good_counts:
msg.good(
"{} {} passed".format(
good_counts, "check" if good_counts == 1 else "checks"
)
)
if warn_counts:
msg.warn(
"{} {}".format(warn_counts, "warning" if warn_counts == 1 else "warnings")
)
if fail_counts:
msg.fail("{} {}".format(fail_counts, "error" if fail_counts == 1 else "errors"))
if fail_counts:
sys.exit(1)
def _load_file(file_path, msg):
file_name = file_path.parts[-1]
if file_path.suffix == ".json":
data = read_json(file_path)
msg.good("Loaded {}".format(file_name))
return data
elif file_path.suffix == ".jsonl":
data = read_jsonl(file_path)
msg.good("Loaded {}".format(file_name))
return data
msg.fail(
"Can't load file extension {}".format(file_path.suffix),
"Expected .json or .jsonl",
exits=1,
)
def _compile_gold(train_docs, pipeline):
data = {
"ner": Counter(),
"cats": Counter(),
"tags": Counter(),
"deps": Counter(),
"words": Counter(),
"n_words": 0,
"texts": set(),
}
for doc, gold in train_docs:
data["words"].update(gold.words)
data["n_words"] += len(gold.words)
data["texts"].add(doc.text)
if "ner" in pipeline:
for label in gold.ner:
if label.startswith(("B-", "U-")):
combined_label = label.split("-")[1]
data["ner"][combined_label] += 1
elif label == "-":
data["ner"]["-"] += 1
if "textcat" in pipeline:
data["cats"].update(gold.cats)
if "tagger" in pipeline:
data["tags"].update(gold.tags)
if "parser" in pipeline:
data["deps"].update(gold.labels)
return data
def _format_labels(labels, counts=False):
if counts:
return ", ".join(["'{}' ({})".format(l, c) for l, c in labels])
return ", ".join(["'{}'".format(l) for l in labels])
def _get_ner_counts(data):
counter = Counter()
for doc, gold in data:
for label in gold.ner:
if label.startswith(("B-", "U-")):
combined_label = label.split("-")[1]
counter[combined_label] += 1
elif label == "-":
counter["-"] += 1
return counter
def _get_examples_without_label(data, label):
count = 0
for doc, gold in data:
labels = [label.split("-")[1] for label in gold.ner if label not in ("O", "-")]
if label not in labels:
count += 1
return count
def _get_labels_from_model(nlp, pipe_name):
if pipe_name not in nlp.pipe_names:
return set()
pipe = nlp.get_pipe(pipe_name)
return pipe.labels


@ -6,34 +6,37 @@ import requests
import os import os
import subprocess import subprocess
import sys import sys
from wasabi import Printer
from ._messages import Messages from ._messages import Messages
from .link import link from .link import link
from ..util import prints, get_package_path from ..util import get_package_path
from .. import about from .. import about
msg = Printer()
@plac.annotations( @plac.annotations(
model=("model to download, shortcut or name", "positional", None, str), model=("Model to download (shortcut or name)", "positional", None, str),
direct=("force direct download. Needs model name with version and won't " direct=("Force direct download of name + version", "flag", "d", bool),
"perform compatibility check", "flag", "d", bool), pip_args=("additional arguments to be passed to `pip install` on model install"),
pip_args=("additional arguments to be passed to `pip install` when " )
"installing the model"))
def download(model, direct=False, *pip_args): def download(model, direct=False, *pip_args):
""" """
Download compatible model from default download path using pip. Model Download compatible model from default download path using pip. Model
can be shortcut, model name or, if --direct flag is set, full model name can be shortcut, model name or, if --direct flag is set, full model name
with version. with version. For direct downloads, the compatibility check will be skipped.
""" """
if direct: if direct:
dl = download_model('{m}/{m}.tar.gz#egg={m}'.format(m=model), pip_args) dl = download_model("{m}/{m}.tar.gz#egg={m}".format(m=model), pip_args)
else: else:
shortcuts = get_json(about.__shortcuts__, "available shortcuts") shortcuts = get_json(about.__shortcuts__, "available shortcuts")
model_name = shortcuts.get(model, model) model_name = shortcuts.get(model, model)
compatibility = get_compatibility() compatibility = get_compatibility()
version = get_version(model_name, compatibility) version = get_version(model_name, compatibility)
dl = download_model('{m}-{v}/{m}-{v}.tar.gz#egg={m}=={v}' dl_tpl = "{m}-{v}/{m}-{v}.tar.gz#egg={m}=={v}"
.format(m=model_name, v=version), pip_args) dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args)
if dl != 0: # if download subprocess doesn't return 0, exit if dl != 0: # if download subprocess doesn't return 0, exit
sys.exit(dl) sys.exit(dl)
try: try:
@ -43,44 +46,49 @@ def download(model, direct=False, *pip_args):
# subprocess # subprocess
package_path = get_package_path(model_name) package_path = get_package_path(model_name)
link(model_name, model, force=True, model_path=package_path) link(model_name, model, force=True, model_path=package_path)
except: except: # noqa: E722
# Dirty, but since spacy.download and the auto-linking is # Dirty, but since spacy.download and the auto-linking is
# mostly a convenience wrapper, it's best to show a success # mostly a convenience wrapper, it's best to show a success
# message and loading instructions, even if linking fails. # message and loading instructions, even if linking fails.
prints(Messages.M001, title=Messages.M002.format(name=model_name)) msg.warn(Messages.M002.format(name=model_name), Messages.M001)
def get_json(url, desc): def get_json(url, desc):
r = requests.get(url) r = requests.get(url)
if r.status_code != 200: if r.status_code != 200:
prints(Messages.M004.format(desc=desc, version=about.__version__), msg.fail(
title=Messages.M003.format(code=r.status_code), exits=1) Messages.M003.format(code=r.status_code),
Messages.M004.format(desc=desc, version=about.__version__),
exits=1,
)
return r.json() return r.json()
def get_compatibility(): def get_compatibility():
version = about.__version__ version = about.__version__
version = version.rsplit('.dev', 1)[0] version = version.rsplit(".dev", 1)[0]
comp_table = get_json(about.__compatibility__, "compatibility table") comp_table = get_json(about.__compatibility__, "compatibility table")
comp = comp_table['spacy'] comp = comp_table["spacy"]
if version not in comp: if version not in comp:
prints(Messages.M006.format(version=version), title=Messages.M005, msg.fail(Messages.M005, Messages.M006.format(version=version), exits=1)
exits=1)
return comp[version] return comp[version]
def get_version(model, comp): def get_version(model, comp):
model = model.rsplit('.dev', 1)[0] model = model.rsplit(".dev", 1)[0]
if model not in comp: if model not in comp:
prints(Messages.M007.format(name=model, version=about.__version__), msg.fail(
title=Messages.M005, exits=1) Messages.M005,
Messages.M007.format(name=model, version=about.__version__),
exits=1,
)
return comp[model][0] return comp[model][0]
def download_model(filename, user_pip_args=None): def download_model(filename, user_pip_args=None):
download_url = about.__download_url__ + '/' + filename download_url = about.__download_url__ + "/" + filename
pip_args = ['--no-cache-dir', '--no-deps'] pip_args = ["--no-cache-dir", "--no-deps"]
if user_pip_args: if user_pip_args:
pip_args.extend(user_pip_args) pip_args.extend(user_pip_args)
cmd = [sys.executable, '-m', 'pip', 'install'] + pip_args + [download_url] cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url]
return subprocess.call(cmd, env=os.environ.copy()) return subprocess.call(cmd, env=os.environ.copy())


@ -3,30 +3,35 @@ from __future__ import unicode_literals, division, print_function
import plac import plac
from timeit import default_timer as timer from timeit import default_timer as timer
from wasabi import Printer
from ._messages import Messages from ._messages import Messages
from ..gold import GoldCorpus from ..gold import GoldCorpus
from ..util import prints
from .. import util from .. import util
from .. import displacy from .. import displacy
@plac.annotations( @plac.annotations(
model=("model name or path", "positional", None, str), model=("Model name or path", "positional", None, str),
data_path=("location of JSON-formatted evaluation data", "positional", data_path=("Location of JSON-formatted evaluation data", "positional", None, str),
None, str), gold_preproc=("Use gold preprocessing", "flag", "G", bool),
gold_preproc=("use gold preprocessing", "flag", "G", bool), gpu_id=("Use GPU", "option", "g", int),
gpu_id=("use GPU", "option", "g", int), displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str),
displacy_path=("directory to output rendered parses as HTML", "option", displacy_limit=("Limit of parses to render as HTML", "option", "dl", int),
"dp", str), )
displacy_limit=("limit of parses to render as HTML", "option", "dl", int)) def evaluate(
def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None, model,
displacy_limit=25): data_path,
gpu_id=-1,
gold_preproc=False,
displacy_path=None,
displacy_limit=25,
):
""" """
Evaluate a model. To render a sample of parses in a HTML file, set an Evaluate a model. To render a sample of parses in a HTML file, set an
output directory as the displacy_path argument. output directory as the displacy_path argument.
""" """
msg = Printer()
util.fix_random_seed() util.fix_random_seed()
if gpu_id >= 0: if gpu_id >= 0:
util.use_gpu(gpu_id) util.use_gpu(gpu_id)
@ -34,9 +39,9 @@ def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None
data_path = util.ensure_path(data_path) data_path = util.ensure_path(data_path)
displacy_path = util.ensure_path(displacy_path) displacy_path = util.ensure_path(displacy_path)
if not data_path.exists(): if not data_path.exists():
prints(data_path, title=Messages.M034, exits=1) msg.fail(Messages.M034, data_path, exits=1)
if displacy_path and not displacy_path.exists(): if displacy_path and not displacy_path.exists():
prints(displacy_path, title=Messages.M035, exits=1) msg.fail(Messages.M035, displacy_path, exits=1)
corpus = GoldCorpus(data_path, data_path) corpus = GoldCorpus(data_path, data_path)
nlp = util.load_model(model) nlp = util.load_model(model)
dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc)) dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
@ -44,65 +49,80 @@ def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None
scorer = nlp.evaluate(dev_docs, verbose=False) scorer = nlp.evaluate(dev_docs, verbose=False)
end = timer() end = timer()
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs) nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
print_results(scorer, time=end - begin, words=nwords, results = {
wps=nwords / (end - begin)) "Time": "%.2f s" % end - begin,
"Words": nwords,
"Words/s": "%.0f" % nwords / (end - begin),
"TOK": "%.2f" % scorer.token_acc,
"POS": "%.2f" % scorer.tags_acc,
"UAS": "%.2f" % scorer.uas,
"LAS": "%.2f" % scorer.las,
"NER P": "%.2f" % scorer.ents_p,
"NER R": "%.2f" % scorer.ents_r,
"NER F": "%.2f" % scorer.ents_f,
}
msg.table(results, title="Results")
if displacy_path: if displacy_path:
docs, golds = zip(*dev_docs) docs, golds = zip(*dev_docs)
render_deps = 'parser' in nlp.meta.get('pipeline', []) render_deps = "parser" in nlp.meta.get("pipeline", [])
render_ents = 'ner' in nlp.meta.get('pipeline', []) render_ents = "ner" in nlp.meta.get("pipeline", [])
render_parses(docs, displacy_path, model_name=model, render_parses(
limit=displacy_limit, deps=render_deps, ents=render_ents) docs,
prints(displacy_path, title=Messages.M036.format(n=displacy_limit)) displacy_path,
model_name=model,
limit=displacy_limit,
deps=render_deps,
ents=render_ents,
)
msg.good(Messages.M036.format(n=displacy_limit), displacy_path)
def render_parses(docs, output_path, model_name='', limit=250, deps=True, def render_parses(docs, output_path, model_name="", limit=250, deps=True, ents=True):
ents=True): docs[0].user_data["title"] = model_name
docs[0].user_data['title'] = model_name
if ents: if ents:
with (output_path / 'entities.html').open('w') as file_: with (output_path / "entities.html").open("w") as file_:
html = displacy.render(docs[:limit], style='ent', page=True) html = displacy.render(docs[:limit], style="ent", page=True)
file_.write(html) file_.write(html)
if deps: if deps:
with (output_path / 'parses.html').open('w') as file_: with (output_path / "parses.html").open("w") as file_:
html = displacy.render(docs[:limit], style='dep', page=True, html = displacy.render(
options={'compact': True}) docs[:limit], style="dep", page=True, options={"compact": True}
)
file_.write(html) file_.write(html)
def print_progress(itn, losses, dev_scores, wps=0.0): def print_progress(itn, losses, dev_scores, wps=0.0):
scores = {} scores = {}
for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc', for col in [
'ents_p', 'ents_r', 'ents_f', 'wps']: "dep_loss",
"tag_loss",
"uas",
"tags_acc",
"token_acc",
"ents_p",
"ents_r",
"ents_f",
"wps",
]:
scores[col] = 0.0 scores[col] = 0.0
scores['dep_loss'] = losses.get('parser', 0.0) scores["dep_loss"] = losses.get("parser", 0.0)
scores['ner_loss'] = losses.get('ner', 0.0) scores["ner_loss"] = losses.get("ner", 0.0)
scores['tag_loss'] = losses.get('tagger', 0.0) scores["tag_loss"] = losses.get("tagger", 0.0)
scores.update(dev_scores) scores.update(dev_scores)
scores['wps'] = wps scores["wps"] = wps
tpl = '\t'.join(( tpl = "\t".join(
'{:d}', (
'{dep_loss:.3f}', "{:d}",
'{ner_loss:.3f}', "{dep_loss:.3f}",
'{uas:.3f}', "{ner_loss:.3f}",
'{ents_p:.3f}', "{uas:.3f}",
'{ents_r:.3f}', "{ents_p:.3f}",
'{ents_f:.3f}', "{ents_r:.3f}",
'{tags_acc:.3f}', "{ents_f:.3f}",
'{token_acc:.3f}', "{tags_acc:.3f}",
'{wps:.1f}')) "{token_acc:.3f}",
"{wps:.1f}",
)
)
print(tpl.format(itn, **scores)) print(tpl.format(itn, **scores))
def print_results(scorer, time, words, wps):
results = {
'Time': '%.2f s' % time,
'Words': words,
'Words/s': '%.0f' % wps,
'TOK': '%.2f' % scorer.token_acc,
'POS': '%.2f' % scorer.tags_acc,
'UAS': '%.2f' % scorer.uas,
'LAS': '%.2f' % scorer.las,
'NER P': '%.2f' % scorer.ents_p,
'NER R': '%.2f' % scorer.ents_r,
'NER F': '%.2f' % scorer.ents_f}
util.print_table(results, title="Results")


@ -4,6 +4,7 @@ from __future__ import unicode_literals
import plac import plac
import platform import platform
from pathlib import Path from pathlib import Path
from wasabi import Printer
from ._messages import Messages from ._messages import Messages
from ..compat import path2str from ..compat import path2str
@ -12,56 +13,65 @@ from .. import about
@plac.annotations( @plac.annotations(
model=("optional: shortcut link of model", "positional", None, str), model=("Optional shortcut link of model", "positional", None, str),
markdown=("generate Markdown for GitHub issues", "flag", "md", str), markdown=("Generate Markdown for GitHub issues", "flag", "md", str),
silent=("don't print anything (just return)", "flag", "s")) silent=("Don't print anything (just return)", "flag", "s"),
)
def info(model=None, markdown=False, silent=False): def info(model=None, markdown=False, silent=False):
"""Print info about spaCy installation. If a model shortcut link is """
Print info about spaCy installation. If a model shortcut link is
speficied as an argument, print model information. Flag --markdown speficied as an argument, print model information. Flag --markdown
prints details in Markdown for easy copy-pasting to GitHub issues. prints details in Markdown for easy copy-pasting to GitHub issues.
""" """
msg = Printer()
if model: if model:
if util.is_package(model): if util.is_package(model):
model_path = util.get_package_path(model) model_path = util.get_package_path(model)
else: else:
model_path = util.get_data_path() / model model_path = util.get_data_path() / model
meta_path = model_path / 'meta.json' meta_path = model_path / "meta.json"
if not meta_path.is_file(): if not meta_path.is_file():
util.prints(meta_path, title=Messages.M020, exits=1) msg.fail(Messages.M020, meta_path, exits=1)
meta = util.read_json(meta_path) meta = util.read_json(meta_path)
if model_path.resolve() != model_path: if model_path.resolve() != model_path:
meta['link'] = path2str(model_path) meta["link"] = path2str(model_path)
meta['source'] = path2str(model_path.resolve()) meta["source"] = path2str(model_path.resolve())
else: else:
meta['source'] = path2str(model_path) meta["source"] = path2str(model_path)
if not silent: if not silent:
print_info(meta, 'model %s' % model, markdown) title = "Info about model '{}'".format(model)
model_meta = {
k: v for k, v in meta.items() if k not in ("accuracy", "speed")
}
if markdown:
util.print_markdown(model_meta, title=title)
else:
msg.table(model_meta, title=title)
return meta return meta
data = {'spaCy version': about.__version__, data = {
'Location': path2str(Path(__file__).parent.parent), "spaCy version": about.__version__,
'Platform': platform.platform(), "Location": path2str(Path(__file__).parent.parent),
'Python version': platform.python_version(), "Platform": platform.platform(),
'Models': list_models()} "Python version": platform.python_version(),
"Models": list_models(),
}
if not silent: if not silent:
print_info(data, 'spaCy', markdown) title = "Info about spaCy"
return data
def print_info(data, title, markdown):
title = 'Info about %s' % title
if markdown: if markdown:
util.print_markdown(data, title=title) util.print_markdown(data, title=title)
else: else:
util.print_table(data, title=title) msg.table(data, title=title)
return data
def list_models(): def list_models():
def exclude_dir(dir_name): def exclude_dir(dir_name):
# exclude common cache directories and hidden directories # exclude common cache directories and hidden directories
exclude = ['cache', 'pycache', '__pycache__'] exclude = ("cache", "pycache", "__pycache__")
return dir_name in exclude or dir_name.startswith('.') return dir_name in exclude or dir_name.startswith(".")
data_path = util.get_data_path() data_path = util.get_data_path()
if data_path: if data_path:
models = [f.parts[-1] for f in data_path.iterdir() if f.is_dir()] models = [f.parts[-1] for f in data_path.iterdir() if f.is_dir()]
return ', '.join([m for m in models if not exclude_dir(m)]) return ", ".join([m for m in models if not exclude_dir(m)])
return '-' return "-"


@ -11,13 +11,12 @@ from preshed.counter import PreshCounter
import tarfile import tarfile
import gzip import gzip
import zipfile import zipfile
import ujson as json from wasabi import Printer
from spacy.lexeme import intify_attrs
from ._messages import Messages from ._messages import Messages
from ..vectors import Vectors from ..vectors import Vectors
from ..errors import Errors, Warnings, user_warning from ..errors import Errors, Warnings, user_warning
from ..util import prints, ensure_path, get_lang_class from ..util import ensure_path, get_lang_class, read_jsonl
try: try:
import ftfy import ftfy
@ -25,121 +24,133 @@ except ImportError:
ftfy = None ftfy = None
msg = Printer()
@plac.annotations( @plac.annotations(
lang=("model language", "positional", None, str), lang=("Model language", "positional", None, str),
output_dir=("model output directory", "positional", None, Path), output_dir=("Model output directory", "positional", None, Path),
freqs_loc=("location of words frequencies file", "option", "f", Path), freqs_loc=("Location of words frequencies file", "option", "f", Path),
jsonl_loc=("location of JSONL-formatted attributes file", "option", "j", Path), jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path),
clusters_loc=("optional: location of brown clusters data", clusters_loc=("Optional location of brown clusters data", "option", "c", str),
"option", "c", str), vectors_loc=("Optional vectors file in Word2Vec format" "option", "v", str),
vectors_loc=("optional: location of vectors file in Word2Vec format " prune_vectors=("Optional number of vectors to prune to", "option", "V", int),
"(either as .txt or zipped as .zip or .tar.gz)", "option",
"v", str),
prune_vectors=("optional: number of vectors to prune to",
"option", "V", int)
) )
def init_model(lang, output_dir, freqs_loc=None, clusters_loc=None, jsonl_loc=None, def init_model(
vectors_loc=None, prune_vectors=-1): lang,
output_dir,
freqs_loc=None,
clusters_loc=None,
jsonl_loc=None,
vectors_loc=None,
prune_vectors=-1,
):
""" """
Create a new model from raw data, like word frequencies, Brown clusters Create a new model from raw data, like word frequencies, Brown clusters
and word vectors. and word vectors. If vectors are provided in Word2Vec format, they can
be either a .txt or zipped as a .zip or .tar.gz.
""" """
if jsonl_loc is not None: if jsonl_loc is not None:
if freqs_loc is not None or clusters_loc is not None: if freqs_loc is not None or clusters_loc is not None:
settings = ['-j'] settings = ["-j"]
if freqs_loc: if freqs_loc:
settings.append('-f') settings.append("-f")
if clusters_loc: if clusters_loc:
settings.append('-c') settings.append("-c")
prints(' '.join(settings), msg.warn(Messages.M063, Messages.M064)
title=(
"The -f and -c arguments are deprecated, and not compatible "
"with the -j argument, which should specify the same information. "
"Either merge the frequencies and clusters data into the "
"jsonl-formatted file (recommended), or use only the -f and "
"-c files, without the other lexical attributes."))
jsonl_loc = ensure_path(jsonl_loc) jsonl_loc = ensure_path(jsonl_loc)
lex_attrs = (json.loads(line) for line in jsonl_loc.open()) lex_attrs = read_jsonl(jsonl_loc)
else: else:
clusters_loc = ensure_path(clusters_loc) clusters_loc = ensure_path(clusters_loc)
freqs_loc = ensure_path(freqs_loc) freqs_loc = ensure_path(freqs_loc)
if freqs_loc is not None and not freqs_loc.exists(): if freqs_loc is not None and not freqs_loc.exists():
prints(freqs_loc, title=Messages.M037, exits=1) msg.fail(Messages.M037, freqs_loc, exits=1)
lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc) lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)
with msg.loading("Creating model..."):
nlp = create_model(lang, lex_attrs) nlp = create_model(lang, lex_attrs)
msg.good("Successfully created model")
if vectors_loc is not None: if vectors_loc is not None:
add_vectors(nlp, vectors_loc, prune_vectors) add_vectors(nlp, vectors_loc, prune_vectors)
vec_added = len(nlp.vocab.vectors) vec_added = len(nlp.vocab.vectors)
lex_added = len(nlp.vocab) lex_added = len(nlp.vocab)
prints(Messages.M039.format(entries=lex_added, vectors=vec_added), msg.good(Messages.M038, Messages.M039.format(entries=lex_added, vectors=vec_added))
title=Messages.M038)
if not output_dir.exists(): if not output_dir.exists():
output_dir.mkdir() output_dir.mkdir()
nlp.to_disk(output_dir) nlp.to_disk(output_dir)
return nlp return nlp
def open_file(loc): def open_file(loc):
'''Handle .gz, .tar.gz or unzipped files''' """Handle .gz, .tar.gz or unzipped files"""
loc = ensure_path(loc) loc = ensure_path(loc)
print("Open loc")
if tarfile.is_tarfile(str(loc)): if tarfile.is_tarfile(str(loc)):
return tarfile.open(str(loc), 'r:gz') return tarfile.open(str(loc), "r:gz")
elif loc.parts[-1].endswith('gz'): elif loc.parts[-1].endswith("gz"):
return (line.decode('utf8') for line in gzip.open(str(loc), 'r')) return (line.decode("utf8") for line in gzip.open(str(loc), "r"))
elif loc.parts[-1].endswith('zip'): elif loc.parts[-1].endswith("zip"):
zip_file = zipfile.ZipFile(str(loc)) zip_file = zipfile.ZipFile(str(loc))
names = zip_file.namelist() names = zip_file.namelist()
file_ = zip_file.open(names[0]) file_ = zip_file.open(names[0])
return (line.decode('utf8') for line in file_) return (line.decode("utf8") for line in file_)
else: else:
return loc.open('r', encoding='utf8') return loc.open("r", encoding="utf8")
def read_attrs_from_deprecated(freqs_loc, clusters_loc): def read_attrs_from_deprecated(freqs_loc, clusters_loc):
with msg.loading("Counting frequencies..."):
probs, oov_prob = read_freqs(freqs_loc) if freqs_loc is not None else ({}, -20) probs, oov_prob = read_freqs(freqs_loc) if freqs_loc is not None else ({}, -20)
msg.good("Counted frequencies")
with msg.loading("Reading clusters..."):
clusters = read_clusters(clusters_loc) if clusters_loc else {} clusters = read_clusters(clusters_loc) if clusters_loc else {}
msg.good("Read clusters")
lex_attrs = [] lex_attrs = []
sorted_probs = sorted(probs.items(), key=lambda item: item[1], reverse=True) sorted_probs = sorted(probs.items(), key=lambda item: item[1], reverse=True)
for i, (word, prob) in tqdm(enumerate(sorted_probs)): for i, (word, prob) in tqdm(enumerate(sorted_probs)):
attrs = {'orth': word, 'id': i, 'prob': prob} attrs = {"orth": word, "id": i, "prob": prob}
# Decode as a little-endian string, so that we can do & 15 to get # Decode as a little-endian string, so that we can do & 15 to get
# the first 4 bits. See _parse_features.pyx # the first 4 bits. See _parse_features.pyx
if word in clusters: if word in clusters:
attrs['cluster'] = int(clusters[word][::-1], 2) attrs["cluster"] = int(clusters[word][::-1], 2)
else: else:
attrs['cluster'] = 0 attrs["cluster"] = 0
lex_attrs.append(attrs) lex_attrs.append(attrs)
return lex_attrs return lex_attrs
def create_model(lang, lex_attrs): def create_model(lang, lex_attrs):
print("Creating model...")
lang_class = get_lang_class(lang) lang_class = get_lang_class(lang)
nlp = lang_class() nlp = lang_class()
for lexeme in nlp.vocab: for lexeme in nlp.vocab:
lexeme.rank = 0 lexeme.rank = 0
lex_added = 0 lex_added = 0
for attrs in lex_attrs: for attrs in lex_attrs:
if 'settings' in attrs: if "settings" in attrs:
continue continue
lexeme = nlp.vocab[attrs['orth']] lexeme = nlp.vocab[attrs["orth"]]
lexeme.set_attrs(**attrs) lexeme.set_attrs(**attrs)
lexeme.is_oov = False lexeme.is_oov = False
lex_added += 1 lex_added += 1
lex_added += 1 lex_added += 1
oov_prob = min(lex.prob for lex in nlp.vocab) oov_prob = min(lex.prob for lex in nlp.vocab)
nlp.vocab.cfg.update({'oov_prob': oov_prob-1}) nlp.vocab.cfg.update({"oov_prob": oov_prob - 1})
return nlp return nlp
def add_vectors(nlp, vectors_loc, prune_vectors): def add_vectors(nlp, vectors_loc, prune_vectors):
vectors_loc = ensure_path(vectors_loc) vectors_loc = ensure_path(vectors_loc)
if vectors_loc and vectors_loc.parts[-1].endswith('.npz'): if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open('rb'))) nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
for lex in nlp.vocab: for lex in nlp.vocab:
if lex.rank: if lex.rank:
nlp.vocab.vectors.add(lex.orth, row=lex.rank) nlp.vocab.vectors.add(lex.orth, row=lex.rank)
else: else:
vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else (None, None) if vectors_loc:
with msg.loading("Reading vectors from {}".format(vectors_loc)):
vectors_data, vector_keys = read_vectors(vectors_loc)
msg.good("Loaded vectors from {}".format(vectors_loc))
else:
vectors_data, vector_keys = (None, None)
if vector_keys is not None: if vector_keys is not None:
for word in vector_keys: for word in vector_keys:
if word not in nlp.vocab: if word not in nlp.vocab:
@ -147,35 +158,34 @@ def add_vectors(nlp, vectors_loc, prune_vectors):
lexeme.is_oov = False lexeme.is_oov = False
if vectors_data is not None: if vectors_data is not None:
nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys) nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
nlp.vocab.vectors.name = '%s_model.vectors' % nlp.meta['lang'] nlp.vocab.vectors.name = "%s_model.vectors" % nlp.meta["lang"]
nlp.meta['vectors']['name'] = nlp.vocab.vectors.name nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
if prune_vectors >= 1: if prune_vectors >= 1:
nlp.vocab.prune_vectors(prune_vectors) nlp.vocab.prune_vectors(prune_vectors)
def read_vectors(vectors_loc): def read_vectors(vectors_loc):
print("Reading vectors from %s" % vectors_loc)
f = open_file(vectors_loc) f = open_file(vectors_loc)
shape = tuple(int(size) for size in next(f).split()) shape = tuple(int(size) for size in next(f).split())
vectors_data = numpy.zeros(shape=shape, dtype='f') vectors_data = numpy.zeros(shape=shape, dtype="f")
vectors_keys = [] vectors_keys = []
for i, line in enumerate(tqdm(f)): for i, line in enumerate(tqdm(f)):
line = line.rstrip() line = line.rstrip()
pieces = line.rsplit(' ', vectors_data.shape[1]+1) pieces = line.rsplit(" ", vectors_data.shape[1] + 1)
word = pieces.pop(0) word = pieces.pop(0)
if len(pieces) != vectors_data.shape[1]: if len(pieces) != vectors_data.shape[1]:
raise ValueError(Errors.E094.format(line_num=i, loc=vectors_loc)) msg.fail(Errors.E094.format(line_num=i, loc=vectors_loc), exits=1)
vectors_data[i] = numpy.asarray(pieces, dtype='f') vectors_data[i] = numpy.asarray(pieces, dtype="f")
vectors_keys.append(word) vectors_keys.append(word)
return vectors_data, vectors_keys return vectors_data, vectors_keys
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50): def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
print("Counting frequencies...")
counts = PreshCounter() counts = PreshCounter()
total = 0 total = 0
with freqs_loc.open() as f: with freqs_loc.open() as f:
for i, line in enumerate(f): for i, line in enumerate(f):
freq, doc_freq, key = line.rstrip().split('\t', 2) freq, doc_freq, key = line.rstrip().split("\t", 2)
freq = int(freq) freq = int(freq)
counts.inc(i + 1, freq) counts.inc(i + 1, freq)
total += freq total += freq
@ -184,7 +194,7 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
probs = {} probs = {}
with freqs_loc.open() as f: with freqs_loc.open() as f:
for line in tqdm(f): for line in tqdm(f):
freq, doc_freq, key = line.rstrip().split('\t', 2) freq, doc_freq, key = line.rstrip().split("\t", 2)
doc_freq = int(doc_freq) doc_freq = int(doc_freq)
freq = int(freq) freq = int(freq)
if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length: if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
@ -196,7 +206,6 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
def read_clusters(clusters_loc): def read_clusters(clusters_loc):
print("Reading clusters...")
clusters = {} clusters = {}
if ftfy is None: if ftfy is None:
user_warning(Warnings.W004) user_warning(Warnings.W004)
@ -213,7 +222,7 @@ def read_clusters(clusters_loc):
if int(freq) >= 3: if int(freq) >= 3:
clusters[word] = cluster clusters[word] = cluster
else: else:
clusters[word] = '0' clusters[word] = "0"
# Expand clusters with re-casing # Expand clusters with re-casing
for word, cluster in list(clusters.items()): for word, cluster in list(clusters.items()):
if word.lower() not in clusters: if word.lower() not in clusters:
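For context, the JSONL attributes file passed to `init_model` via `-j` contains one lexeme per line, with the same keys that `read_attrs_from_deprecated()` produces. A made-up example (file name, values and the CLI line are placeholders):

```python
import ujson

# Hypothetical lexeme attributes, one JSON object per line
lexemes = [
    {"orth": "apple", "id": 0, "prob": -8.5, "cluster": 19},
    {"orth": "banana", "id": 1, "prob": -9.1, "cluster": 19},
]
with open("lexical_attrs.jsonl", "w", encoding="utf8") as f:
    for attrs in lexemes:
        f.write(ujson.dumps(attrs) + "\n")

# Rough CLI equivalent (command name assumed):
# python -m spacy init-model en /output -j lexical_attrs.jsonl
```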


@ -3,51 +3,54 @@ from __future__ import unicode_literals
import plac import plac
from pathlib import Path from pathlib import Path
from wasabi import Printer
from ._messages import Messages from ._messages import Messages
from ..compat import symlink_to, path2str from ..compat import symlink_to, path2str
from ..util import prints
from .. import util from .. import util
@plac.annotations( @plac.annotations(
origin=("package name or local path to model", "positional", None, str), origin=("package name or local path to model", "positional", None, str),
link_name=("name of shortcut link to create", "positional", None, str), link_name=("name of shortcut link to create", "positional", None, str),
force=("force overwriting of existing link", "flag", "f", bool)) force=("force overwriting of existing link", "flag", "f", bool),
)
def link(origin, link_name, force=False, model_path=None): def link(origin, link_name, force=False, model_path=None):
""" """
Create a symlink for models within the spacy/data directory. Accepts Create a symlink for models within the spacy/data directory. Accepts
either the name of a pip package, or the local path to the model data either the name of a pip package, or the local path to the model data
directory. Linking models allows loading them via spacy.load(link_name). directory. Linking models allows loading them via spacy.load(link_name).
""" """
msg = Printer()
if util.is_package(origin): if util.is_package(origin):
model_path = util.get_package_path(origin) model_path = util.get_package_path(origin)
else: else:
model_path = Path(origin) if model_path is None else Path(model_path) model_path = Path(origin) if model_path is None else Path(model_path)
if not model_path.exists(): if not model_path.exists():
prints(Messages.M009.format(path=path2str(model_path)), msg.fail(
title=Messages.M008, exits=1) Messages.M008, Messages.M009.format(path=path2str(model_path)), exits=1
)
data_path = util.get_data_path() data_path = util.get_data_path()
if not data_path or not data_path.exists(): if not data_path or not data_path.exists():
spacy_loc = Path(__file__).parent.parent spacy_loc = Path(__file__).parent.parent
prints(Messages.M011, spacy_loc, title=Messages.M010, exits=1) msg.fail(Messages.M010, Messages.M011.format(path=spacy_loc), exits=1)
link_path = util.get_data_path() / link_name link_path = util.get_data_path() / link_name
if link_path.is_symlink() and not force: if link_path.is_symlink() and not force:
prints(Messages.M013, title=Messages.M012.format(name=link_name), msg.fail(Messages.M012.format(name=link_name), Messages.M013, exits=1)
exits=1)
elif link_path.is_symlink(): # does a symlink exist? elif link_path.is_symlink(): # does a symlink exist?
# NB: It's important to check for is_symlink here and not for exists, # NB: It's important to check for is_symlink here and not for exists,
# because invalid/outdated symlinks would return False otherwise. # because invalid/outdated symlinks would return False otherwise.
link_path.unlink() link_path.unlink()
elif link_path.exists(): # does it exist otherwise? elif link_path.exists(): # does it exist otherwise?
# NB: Check this last because valid symlinks also "exist". # NB: Check this last because valid symlinks also "exist".
prints(Messages.M015, link_path, msg.fail(Messages.M014.format(name=link_name), Messages.M015, exits=1)
title=Messages.M014.format(name=link_name), exits=1) details = "%s --> %s" % (path2str(model_path), path2str(link_path))
msg = "%s --> %s" % (path2str(model_path), path2str(link_path))
try: try:
symlink_to(link_path, model_path) symlink_to(link_path, model_path)
except: except: # noqa: E722
# This is quite dirty, but just making sure other errors are caught. # This is quite dirty, but just making sure other errors are caught.
prints(Messages.M017, msg, title=Messages.M016.format(name=link_name)) msg.fail(Messages.M016.format(name=link_name), Messages.M017)
msg.text(details)
raise raise
prints(msg, Messages.M019.format(name=link_name), title=Messages.M018) msg.good(Messages.M018, details)
msg.text(Messages.M019.format(name=link_name))
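The behaviour itself is unchanged: a link simply makes a model loadable under a shortcut name. A small sketch with placeholder names:

```python
import spacy
from spacy.cli import link

link("en_core_web_sm", "my_en", force=True)  # creates the symlink in spacy/data
nlp = spacy.load("my_en")                    # load the model via the link name
```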


@ -4,109 +4,106 @@ from __future__ import unicode_literals
import plac import plac
import shutil import shutil
from pathlib import Path from pathlib import Path
from wasabi import Printer, get_raw_input
from ._messages import Messages from ._messages import Messages
from ..compat import path2str, json_dumps from ..compat import path2str, json_dumps
from ..util import prints
from .. import util from .. import util
from .. import about from .. import about
@plac.annotations( @plac.annotations(
input_dir=("directory with model data", "positional", None, str), input_dir=("Directory with model data", "positional", None, str),
output_dir=("output parent directory", "positional", None, str), output_dir=("Output parent directory", "positional", None, str),
meta_path=("path to meta.json", "option", "m", str), meta_path=("Path to meta.json", "option", "m", str),
create_meta=("create meta.json, even if one exists in directory if " create_meta=("Create meta.json, even if one exists", "flag", "c", bool),
"existing meta is found, entries are shown as defaults in " force=("Force overwriting existing model in output directory", "flag", "f", bool),
"the command line prompt", "flag", "c", bool), )
force=("force overwriting of existing model directory in output directory", def package(input_dir, output_dir, meta_path=None, create_meta=False, force=False):
"flag", "f", bool))
def package(input_dir, output_dir, meta_path=None, create_meta=False,
force=False):
""" """
Generate Python package for model data, including meta and required Generate Python package for model data, including meta and required
installation files. A new directory will be created in the specified installation files. A new directory will be created in the specified
output directory, and model data will be copied over. output directory, and model data will be copied over. If --create-meta is
set and a meta.json already exists in the output directory, the existing
values will be used as the defaults in the command-line prompt.
""" """
msg = Printer()
input_path = util.ensure_path(input_dir) input_path = util.ensure_path(input_dir)
output_path = util.ensure_path(output_dir) output_path = util.ensure_path(output_dir)
meta_path = util.ensure_path(meta_path) meta_path = util.ensure_path(meta_path)
if not input_path or not input_path.exists(): if not input_path or not input_path.exists():
prints(input_path, title=Messages.M008, exits=1) msg.fail(Messages.M008, input_path, exits=1)
if not output_path or not output_path.exists(): if not output_path or not output_path.exists():
prints(output_path, title=Messages.M040, exits=1) msg.fail(Messages.M040, output_path, exits=1)
if meta_path and not meta_path.exists(): if meta_path and not meta_path.exists():
prints(meta_path, title=Messages.M020, exits=1) msg.fail(Messages.M020, meta_path, exits=1)
meta_path = meta_path or input_path / 'meta.json' meta_path = meta_path or input_path / "meta.json"
if meta_path.is_file(): if meta_path.is_file():
meta = util.read_json(meta_path) meta = util.read_json(meta_path)
if not create_meta: # only print this if user doesn't want to overwrite if not create_meta: # only print if user doesn't want to overwrite
prints(meta_path, title=Messages.M041) msg.good(Messages.M041, meta_path)
else: else:
meta = generate_meta(input_dir, meta) meta = generate_meta(input_dir, meta, msg)
meta = validate_meta(meta, ['lang', 'name', 'version']) for key in ("lang", "name", "version"):
model_name = meta['lang'] + '_' + meta['name'] if key not in meta or meta[key] == "":
model_name_v = model_name + '-' + meta['version'] msg.fail(Messages.M048.format(key=key), Messages.M049, exits=1)
model_name = meta["lang"] + "_" + meta["name"]
model_name_v = model_name + "-" + meta["version"]
main_path = output_path / model_name_v main_path = output_path / model_name_v
package_path = main_path / model_name package_path = main_path / model_name
create_dirs(package_path, force)
shutil.copytree(path2str(input_path),
path2str(package_path / model_name_v))
create_file(main_path / 'meta.json', json_dumps(meta))
create_file(main_path / 'setup.py', TEMPLATE_SETUP)
create_file(main_path / 'MANIFEST.in', TEMPLATE_MANIFEST)
create_file(package_path / '__init__.py', TEMPLATE_INIT)
prints(main_path, Messages.M043,
title=Messages.M042.format(name=model_name_v))
def create_dirs(package_path, force):
if package_path.exists(): if package_path.exists():
if force: if force:
shutil.rmtree(path2str(package_path)) shutil.rmtree(path2str(package_path))
else: else:
prints(package_path, Messages.M045, title=Messages.M044, exits=1) msg.fail(
Messages.M044,
Messages.M045.format(path=path2str(package_path)),
exits=1,
)
Path.mkdir(package_path, parents=True) Path.mkdir(package_path, parents=True)
shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
create_file(main_path / "meta.json", json_dumps(meta))
create_file(main_path / "setup.py", TEMPLATE_SETUP)
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
create_file(package_path / "__init__.py", TEMPLATE_INIT)
msg.good(Messages.M042.format(name=model_name_v), main_path)
msg.text(Messages.M043)
def create_file(file_path, contents): def create_file(file_path, contents):
file_path.touch() file_path.touch()
file_path.open('w', encoding='utf-8').write(contents) file_path.open("w", encoding="utf-8").write(contents)
def generate_meta(model_path, existing_meta): def generate_meta(model_path, existing_meta, msg):
meta = existing_meta or {} meta = existing_meta or {}
settings = [('lang', 'Model language', meta.get('lang', 'en')), settings = [
('name', 'Model name', meta.get('name', 'model')), ("lang", "Model language", meta.get("lang", "en")),
('version', 'Model version', meta.get('version', '0.0.0')), ("name", "Model name", meta.get("name", "model")),
('spacy_version', 'Required spaCy version', ("version", "Model version", meta.get("version", "0.0.0")),
'>=%s,<3.0.0' % about.__version__), ("spacy_version", "Required spaCy version", ">=%s,<3.0.0" % about.__version__),
('description', 'Model description', ("description", "Model description", meta.get("description", False)),
meta.get('description', False)), ("author", "Author", meta.get("author", False)),
('author', 'Author', meta.get('author', False)), ("email", "Author email", meta.get("email", False)),
('email', 'Author email', meta.get('email', False)), ("url", "Author website", meta.get("url", False)),
('url', 'Author website', meta.get('url', False)), ("license", "License", meta.get("license", "CC BY-SA 3.0")),
('license', 'License', meta.get('license', 'CC BY-SA 3.0'))] ]
nlp = util.load_model_from_path(Path(model_path)) nlp = util.load_model_from_path(Path(model_path))
meta['pipeline'] = nlp.pipe_names meta["pipeline"] = nlp.pipe_names
meta['vectors'] = {'width': nlp.vocab.vectors_length, meta["vectors"] = {
'vectors': len(nlp.vocab.vectors), "width": nlp.vocab.vectors_length,
'keys': nlp.vocab.vectors.n_keys} "vectors": len(nlp.vocab.vectors),
prints(Messages.M047, title=Messages.M046) "keys": nlp.vocab.vectors.n_keys,
}
msg.divider(Messages.M046)
msg.text(Messages.M047)
for setting, desc, default in settings: for setting, desc, default in settings:
response = util.get_raw_input(desc, default) response = get_raw_input(desc, default)
meta[setting] = default if response == '' and default else response meta[setting] = default if response == "" and default else response
if about.__title__ != 'spacy': if about.__title__ != "spacy":
meta['parent_package'] = about.__title__ meta["parent_package"] = about.__title__
return meta
def validate_meta(meta, keys):
for key in keys:
if key not in meta or meta[key] == '':
prints(Messages.M049, title=Messages.M048.format(key=key), exits=1)
return meta return meta
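For reference, a meta.json that passes the checks above looks roughly like this; all values are placeholders, and only `lang`, `name` and `version` are strictly required:

```python
meta = {
    "lang": "en",
    "name": "model",
    "version": "0.0.0",
    "spacy_version": ">=2.0.18",
    "description": "Example model",
    "author": "Jane Doe",
    "license": "CC BY-SA 3.0",
}
```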


@ -1,66 +1,148 @@
'''This script is experimental. # coding: utf8
Try pre-training the CNN component of the text categorizer using a cheap
language modelling-like objective. Specifically, we load pre-trained vectors
(from something like word2vec, GloVe, FastText etc), and use the CNN to
predict the tokens' pre-trained vectors. This isn't as easy as it sounds:
we're not merely doing compression here, because heavy dropout is applied,
including over the input words. This means the model must often (50% of the time)
use the context in order to predict the word.
To evaluate the technique, we're pre-training with the 50k texts from the IMDB
corpus, and then training with only 100 labels. Note that it's a bit dirty to
pre-train with the development data, but also not *so* terrible: we're not using
the development labels, after all --- only the unlabelled text.
'''
from __future__ import print_function, unicode_literals from __future__ import print_function, unicode_literals
import plac import plac
import random import random
import numpy import numpy
import time import time
import ujson as json import ujson
from pathlib import Path
import sys import sys
from collections import Counter from collections import Counter
from pathlib import Path
import spacy
from spacy.tokens import Doc
from spacy.attrs import ID, HEAD
from spacy.util import minibatch, minibatch_by_words, use_gpu, compounding, ensure_path
from spacy._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer
from thinc.v2v import Affine, Maxout from thinc.v2v import Affine, Maxout
from thinc.api import wrap from thinc.api import wrap
from thinc.misc import LayerNorm as LN from thinc.misc import LayerNorm as LN
from thinc.neural.util import prefer_gpu
from wasabi import Printer
from ..tokens import Doc
from ..attrs import ID, HEAD
from ..compat import json_dumps
from .._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer
from .. import util
def prefer_gpu(): @plac.annotations(
used = spacy.util.use_gpu(0) texts_loc=("Path to jsonl file with texts to learn from", "positional", None, str),
if used is None: vectors_model=("Name or path to vectors model to learn from"),
return False output_dir=("Directory to write models each epoch", "positional", None, str),
else: width=("Width of CNN layers", "option", "cw", int),
import cupy.random depth=("Depth of CNN layers", "option", "cd", int),
cupy.random.seed(0) embed_rows=("Embedding rows", "option", "er", int),
return True use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
dropout=("Dropout", "option", "d", float),
seed=("Seed for random number generators", "option", "s", float),
nr_iter=("Number of iterations to pretrain", "option", "i", int),
)
def pretrain(
texts_loc,
vectors_model,
output_dir,
width=96,
depth=4,
embed_rows=2000,
use_vectors=False,
dropout=0.2,
nr_iter=1000,
seed=0,
):
"""
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
using an approximate language-modelling objective. Specifically, we load
pre-trained vectors, and train a component like a CNN, BiLSTM, etc to predict
vectors which match the pre-trained ones. The weights are saved to a directory
after each epoch. You can then pass a path to one of these pre-trained weights
files to the 'spacy train' command.
This technique may be especially helpful if you have little labelled data.
However, it's still quite experimental, so your mileage may vary.
def load_texts(path): To load the weights back in during 'spacy train', you need to ensure
'''Load inputs from a jsonl file. all settings are the same between pretraining and training. The API and
errors around this need some improvement.
"""
config = dict(locals())
msg = Printer()
util.fix_random_seed(seed)
Each line should be a dict like {"text": "..."} has_gpu = prefer_gpu()
''' msg.info("Using GPU" if has_gpu else "Not using GPU")
path = ensure_path(path)
with path.open('r', encoding='utf8') as file_: output_dir = Path(output_dir)
texts = [json.loads(line) for line in file_] if not output_dir.exists():
output_dir.mkdir()
msg.good("Created output directory")
util.write_json(output_dir / "config.json", config)
msg.good("Saved settings to config.json")
# Load texts from file or stdin
if texts_loc != "-": # reading from a file
texts_loc = Path(texts_loc)
if not texts_loc.exists():
msg.fail("Input text file doesn't exist", texts_loc, exits=1)
with msg.loading("Loading input texts..."):
texts = list(util.read_jsonl(texts_loc))
msg.good("Loaded input texts")
random.shuffle(texts)
else: # reading from stdin
msg.text("Reading input text from stdin...")
texts = stream_texts()
with msg.loading("Loading model '{}'...".format(vectors_model)):
nlp = util.load_model(vectors_model)
msg.good("Loaded model '{}'".format(vectors_model))
pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
model = create_pretraining_model(
nlp,
Tok2Vec(
width,
embed_rows,
conv_depth=depth,
pretrained_vectors=pretrained_vectors,
bilstm_depth=0, # Requires PyTorch. Experimental.
cnn_maxout_pieces=2, # You can try setting this higher
subword_features=True,
),
) # Set to False for character models, e.g. Chinese
optimizer = create_default_optimizer(model.ops)
tracker = ProgressTracker()
msg.divider("Pre-training tok2vec layer")
row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
for epoch in range(nr_iter):
for batch in util.minibatch_by_words(
((text, None) for text in texts), size=5000
):
docs = make_docs(nlp, [text for (text, _) in batch])
loss = make_update(model, docs, optimizer, drop=dropout)
progress = tracker.update(epoch, loss, docs)
if progress:
msg.row(progress, **row_settings)
if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
break
with model.use_params(optimizer.averages):
with (output_dir / ("model%d.bin" % epoch)).open("wb") as file_:
file_.write(model.tok2vec.to_bytes())
log = {
"nr_word": tracker.nr_word,
"loss": tracker.loss,
"epoch_loss": tracker.epoch_loss,
"epoch": epoch,
}
with (output_dir / "log.jsonl").open("a") as file_:
file_.write(json_dumps(log) + "\n")
tracker.epoch_loss = 0.0
if texts_loc != "-":
# Reshuffle the texts if texts were loaded from a file
random.shuffle(texts) random.shuffle(texts)
return texts
def stream_texts(): def stream_texts():
for line in sys.stdin: for line in sys.stdin:
yield json.loads(line) yield ujson.loads(line)
def make_update(model, docs, optimizer, drop=0.): def make_update(model, docs, optimizer, drop=0.0):
"""Perform an update over a single batch of documents. """Perform an update over a single batch of documents.
docs (iterable): A batch of `Doc` objects. docs (iterable): A batch of `Doc` objects.
@ -74,7 +156,7 @@ def make_update(model, docs, optimizer, drop=0.):
# Don't want to return a cupy object here # Don't want to return a cupy object here
# The gradients are modified in-place by the BERT MLM, # The gradients are modified in-place by the BERT MLM,
# so we get an accurate loss # so we get an accurate loss
loss = float((gradients**2).mean()) loss = float((gradients ** 2).mean())
return loss return loss
@ -115,43 +197,40 @@ def get_vectors_loss(ops, docs, prediction):
def create_pretraining_model(nlp, tok2vec): def create_pretraining_model(nlp, tok2vec):
'''Define a network for the pretraining. We simply add an output layer onto """Define a network for the pretraining. We simply add an output layer onto
the tok2vec input model. The tok2vec input model needs to be a model that the tok2vec input model. The tok2vec input model needs to be a model that
takes a batch of Doc objects (as a list), and returns a list of arrays. takes a batch of Doc objects (as a list), and returns a list of arrays.
Each array in the output needs to have one row per token in the doc. Each array in the output needs to have one row per token in the doc.
''' """
output_size = nlp.vocab.vectors.data.shape[1] output_size = nlp.vocab.vectors.data.shape[1]
output_layer = chain( output_layer = chain(
LN(Maxout(300, pieces=3)), LN(Maxout(300, pieces=3)), zero_init(Affine(output_size, drop_factor=0.0))
zero_init(Affine(output_size, drop_factor=0.0))
) )
# This is annoying, but the parser etc have the flatten step after # This is annoying, but the parser etc have the flatten step after
# the tok2vec. To load the weights in cleanly, we need to match # the tok2vec. To load the weights in cleanly, we need to match
# the shape of the models' components exactly. So what we call # the shape of the models' components exactly. So what we call
# "tok2vec" has to be the same set of processes as what the components do. # "tok2vec" has to be the same set of processes as what the components do.
tok2vec = chain(tok2vec, flatten) tok2vec = chain(tok2vec, flatten)
model = chain( model = chain(tok2vec, output_layer)
tok2vec,
output_layer
)
model = masked_language_model(nlp.vocab, model) model = masked_language_model(nlp.vocab, model)
model.tok2vec = tok2vec model.tok2vec = tok2vec
model.output_layer = output_layer model.output_layer = output_layer
model.begin_training([nlp.make_doc('Give it a doc to infer shapes')]) model.begin_training([nlp.make_doc("Give it a doc to infer shapes")])
return model return model
def masked_language_model(vocab, model, mask_prob=0.15): def masked_language_model(vocab, model, mask_prob=0.15):
'''Convert a model into a BERT-style masked language model''' """Convert a model into a BERT-style masked language model"""
random_words = RandomWords(vocab) random_words = RandomWords(vocab)
def mlm_forward(docs, drop=0.):
def mlm_forward(docs, drop=0.0):
mask, docs = apply_mask(docs, random_words, mask_prob=mask_prob) mask, docs = apply_mask(docs, random_words, mask_prob=mask_prob)
mask = model.ops.asarray(mask).reshape((mask.shape[0], 1)) mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
output, backprop = model.begin_update(docs, drop=drop) output, backprop = model.begin_update(docs, drop=drop)
def mlm_backward(d_output, sgd=None): def mlm_backward(d_output, sgd=None):
d_output *= 1-mask d_output *= 1 - mask
return backprop(d_output, sgd=sgd) return backprop(d_output, sgd=sgd)
return output, mlm_backward return output, mlm_backward
@ -161,7 +240,7 @@ def masked_language_model(vocab, model, mask_prob=0.15):
def apply_mask(docs, random_words, mask_prob=0.15): def apply_mask(docs, random_words, mask_prob=0.15):
N = sum(len(doc) for doc in docs) N = sum(len(doc) for doc in docs)
mask = numpy.random.uniform(0., 1.0, (N,)) mask = numpy.random.uniform(0.0, 1.0, (N,))
mask = mask >= mask_prob mask = mask >= mask_prob
i = 0 i = 0
masked_docs = [] masked_docs = []
@ -184,7 +263,7 @@ def apply_mask(docs, random_words, mask_prob=0.15):
return mask, masked_docs return mask, masked_docs
def replace_word(word, random_words, mask='[MASK]'): def replace_word(word, random_words, mask="[MASK]"):
roll = random.random() roll = random.random()
if roll < 0.8: if roll < 0.8:
return mask return mask
@ -193,20 +272,22 @@ def replace_word(word, random_words, mask='[MASK]'):
else: else:
return word return word
class RandomWords(object): class RandomWords(object):
def __init__(self, vocab): def __init__(self, vocab):
self.words = [lex.text for lex in vocab if lex.prob != 0.0] self.words = [lex.text for lex in vocab if lex.prob != 0.0]
self.probs = [lex.prob for lex in vocab if lex.prob != 0.0] self.probs = [lex.prob for lex in vocab if lex.prob != 0.0]
self.words = self.words[:10000] self.words = self.words[:10000]
self.probs = self.probs[:10000] self.probs = self.probs[:10000]
self.probs = numpy.exp(numpy.array(self.probs, dtype='f')) self.probs = numpy.exp(numpy.array(self.probs, dtype="f"))
self.probs /= self.probs.sum() self.probs /= self.probs.sum()
self._cache = [] self._cache = []
def next(self): def next(self):
if not self._cache: if not self._cache:
self._cache.extend(numpy.random.choice(len(self.words), 10000, self._cache.extend(
p=self.probs)) numpy.random.choice(len(self.words), 10000, p=self.probs)
)
index = self._cache.pop() index = self._cache.pop()
return self.words[index] return self.words[index]
@ -245,76 +326,3 @@ class ProgressTracker(object):
return status return status
else: else:
return None return None
@plac.annotations(
texts_loc=("Path to jsonl file with texts to learn from", "positional", None, str),
vectors_model=("Name or path to vectors model to learn from"),
output_dir=("Directory to write models each epoch", "positional", None, str),
width=("Width of CNN layers", "option", "cw", int),
depth=("Depth of CNN layers", "option", "cd", int),
embed_rows=("Embedding rows", "option", "er", int),
use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
dropout=("Dropout", "option", "d", float),
seed=("Seed for random number generators", "option", "s", float),
nr_iter=("Number of iterations to pretrain", "option", "i", int),
)
def pretrain(texts_loc, vectors_model, output_dir, width=96, depth=4,
embed_rows=2000, use_vectors=False, dropout=0.2, nr_iter=1000, seed=0):
"""
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
using an approximate language-modelling objective. Specifically, we load
pre-trained vectors, and train a component like a CNN, BiLSTM, etc to predict
vectors which match the pre-trained ones. The weights are saved to a directory
after each epoch. You can then pass a path to one of these pre-trained weights
files to the 'spacy train' command.
This technique may be especially helpful if you have little labelled data.
However, it's still quite experimental, so your mileage may vary.
To load the weights back in during 'spacy train', you need to ensure
all settings are the same between pretraining and training. The API and
errors around this need some improvement.
"""
config = dict(locals())
output_dir = ensure_path(output_dir)
random.seed(seed)
numpy.random.seed(seed)
if not output_dir.exists():
output_dir.mkdir()
with (output_dir / 'config.json').open('w') as file_:
file_.write(json.dumps(config))
has_gpu = prefer_gpu()
print("Use GPU?", has_gpu)
nlp = spacy.load(vectors_model)
pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
model = create_pretraining_model(nlp,
Tok2Vec(width, embed_rows,
conv_depth=depth,
pretrained_vectors=pretrained_vectors,
bilstm_depth=0, # Requires PyTorch. Experimental.
cnn_maxout_pieces=2, # You can try setting this higher
subword_features=True)) # Set to False for character models, e.g. Chinese
optimizer = create_default_optimizer(model.ops)
tracker = ProgressTracker()
print('Epoch', '#Words', 'Loss', 'w/s')
texts = stream_texts() if texts_loc == '-' else load_texts(texts_loc)
for epoch in range(nr_iter):
for batch in minibatch_by_words(((text, None) for text in texts), size=5000):
docs = make_docs(nlp, [text for (text, _) in batch])
loss = make_update(model, docs, optimizer, drop=dropout)
progress = tracker.update(epoch, loss, docs)
if progress:
print(*progress)
if texts_loc == '-' and tracker.words_per_epoch[epoch] >= 10**7:
break
with model.use_params(optimizer.averages):
with (output_dir / ('model%d.bin' % epoch)).open('wb') as file_:
file_.write(model.tok2vec.to_bytes())
with (output_dir / 'log.jsonl').open('a') as file_:
file_.write(json.dumps({'nr_word': tracker.nr_word,
'loss': tracker.loss, 'epoch_loss': tracker.epoch_loss,
'epoch': epoch}) + '\n')
tracker.epoch_loss = 0.0
if texts_loc != '-':
texts = load_texts(texts_loc)
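The raw-text input follows the same convention as `load_texts()` above: a JSONL file (or stdin when `-` is passed) with one `{"text": "..."}` object per line. A tiny, made-up example:

```python
import ujson

texts = ["This is a sentence.", "Another short example text."]
with open("raw_texts.jsonl", "w", encoding="utf8") as f:
    for text in texts:
        f.write(ujson.dumps({"text": text}) + "\n")

# Assumed CLI invocations (vectors model name is a placeholder):
# python -m spacy pretrain raw_texts.jsonl en_vectors_web_lg /output
# cat raw_texts.jsonl | python -m spacy pretrain - en_vectors_web_lg /output
```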


@ -6,45 +6,64 @@ from pathlib import Path
import ujson import ujson
import cProfile import cProfile
import pstats import pstats
import spacy
import sys import sys
import tqdm import tqdm
import cytoolz import cytoolz
import thinc.extra.datasets import thinc.extra.datasets
from wasabi import Printer
from ..util import load_model
def read_inputs(loc):
if loc is None:
file_ = sys.stdin
file_ = (line.encode('utf8') for line in file_)
else:
file_ = Path(loc).open()
for line in file_:
data = ujson.loads(line)
text = data['text']
yield text
@plac.annotations( @plac.annotations(
lang=("model/language", "positional", None, str), model=("Model to load", "positional", None, str),
inputs=("Location of input file", "positional", None, read_inputs)) inputs=("Location of input file. '-' for stdin.", "positional", None, str),
def profile(lang, inputs=None): n_texts=("Maximum number of texts to use if available", "option", "n", int),
)
def profile(model, inputs=None, n_texts=10000):
""" """
Profile a spaCy pipeline, to find out which functions take the most time. Profile a spaCy pipeline, to find out which functions take the most time.
Input should be formatted as one JSON object per line with a key "text".
It can either be provided as a JSONL file, or be read from sys.stdin.
If no input file is specified, the IMDB dataset is loaded via Thinc.
""" """
msg = Printer()
if inputs is not None:
inputs = _read_inputs(inputs, msg)
if inputs is None: if inputs is None:
n_inputs = 25000
with msg.loading("Loading IMDB dataset via Thinc..."):
imdb_train, _ = thinc.extra.datasets.imdb() imdb_train, _ = thinc.extra.datasets.imdb()
inputs, _ = zip(*imdb_train) inputs, _ = zip(*imdb_train)
inputs = inputs[:25000] msg.info("Loaded IMDB dataset and using {} examples".format(n_inputs))
nlp = spacy.load(lang) inputs = inputs[:n_inputs]
texts = list(cytoolz.take(10000, inputs)) with msg.loading("Loading model '{}'...".format(model)):
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), nlp = load_model(model)
"Profile.prof") msg.good("Loaded model '{}'".format(model))
texts = list(cytoolz.take(n_texts, inputs))
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
s = pstats.Stats("Profile.prof") s = pstats.Stats("Profile.prof")
msg.divider("Profile stats")
s.strip_dirs().sort_stats("time").print_stats() s.strip_dirs().sort_stats("time").print_stats()
def parse_texts(nlp, texts): def parse_texts(nlp, texts):
for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16): for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
pass pass
def _read_inputs(loc, msg):
if loc == "-":
msg.info("Reading input from sys.stdin")
file_ = sys.stdin
file_ = (line.encode("utf8") for line in file_)
else:
input_path = Path(loc)
if not input_path.exists() or not input_path.is_file():
msg.fail("Not a valid input data file", loc, exits=1)
msg.info("Using data from {}".format(input_path.parts[-1]))
file_ = input_path.open()
for line in file_:
data = ujson.loads(line)
text = data["text"]
yield text
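A hedged example of calling the updated `profile` command from Python; it accepts the same one-JSON-object-per-line input with a `"text"` key, `-` for stdin, or no input at all to fall back to the IMDB data loaded via Thinc. Model name and file path are placeholders:

```python
from spacy.cli.profile import profile

profile("en_core_web_sm", inputs="texts.jsonl", n_texts=1000)  # JSONL input
profile("en_core_web_sm")  # no input file: falls back to the IMDB dataset
```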


@ -0,0 +1,51 @@
# coding: utf-8
from __future__ import unicode_literals
from pathlib import Path
from jsonschema import Draft4Validator
from ...errors import Errors
from ...util import read_json
SCHEMAS = {}
def get_schema(name):
"""Get the JSON schema for a given name. Looks for a .json file in
spacy.cli.schemas, validates the schema and raises ValueError if not found.
EXAMPLE:
>>> schema = get_schema('training')
name (unicode): The name of the schema.
RETURNS (dict): The JSON schema.
"""
if name not in SCHEMAS:
schema_path = Path(__file__).parent / "{}.json".format(name)
if not schema_path.exists():
raise ValueError(Errors.E104.format(name=name))
schema = read_json(schema_path)
# TODO: replace with (stable) Draft6Validator, if available
validator = Draft4Validator(schema)
validator.check_schema(schema)
SCHEMAS[name] = schema
return SCHEMAS[name]
def validate_json(data, schema):
"""Validate data against a given JSON schema (see https://json-schema.org).
data: JSON-serializable data to validate.
schema (dict): The JSON schema.
RETURNS (list): A list of error messages, if available.
"""
validator = Draft4Validator(schema)
errors = []
for err in sorted(validator.iter_errors(data), key=lambda e: e.path):
if err.path:
err_path = "[{}]".format(" -> ".join([str(p) for p in err.path]))
else:
err_path = ""
errors.append(err.message + " " + err_path)
return errors
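Assuming the helpers live in `spacy.cli.schemas` (as the `get_schema` docstring suggests), they combine like this; the example data is made up:

```python
from spacy.cli.schemas import get_schema, validate_json

schema = get_schema("meta")  # cached in SCHEMAS after the first call
data = {"lang": "en", "name": "test_model", "version": "1.0.0"}
errors = validate_json(data, schema)
if errors:
    print("\n".join(errors))
else:
    print("Data is valid")
```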

spacy/cli/schemas/meta.json (new file, 128 lines)

@ -0,0 +1,128 @@
{
"$schema": "http://json-schema.org/draft-06/schema",
"type": "object",
"properties": {
"lang": {
"title": "Two-letter language code, e.g. 'en'",
"type": "string",
"minLength": 2,
"maxLength": 2,
"pattern": "^[a-z]*$"
},
"name": {
"title": "Model name",
"type": "string",
"minLength": 1,
"pattern": "^[a-z_]*$"
},
"version": {
"title": "Model version",
"type": "string",
"minLength": 1,
"pattern": "^[0-9a-z.-]*$"
},
"spacy_version": {
"title": "Compatible spaCy version identifier",
"type": "string",
"minLength": 1,
"pattern": "^[0-9a-z.-><=]*$"
},
"parent_package": {
"title": "Name of parent spaCy package, e.g. spacy or spacy-nightly",
"type": "string",
"minLength": 1,
"default": "spacy"
},
"pipeline": {
"title": "Names of pipeline components",
"type": "array",
"items": {
"type": "string",
"minLength": 1
}
},
"description": {
"title": "Model description",
"type": "string"
},
"license": {
"title": "Model license",
"type": "string"
},
"author": {
"title": "Model author name",
"type": "string"
},
"email": {
"title": "Model author email",
"type": "string",
"format": "email"
},
"url": {
"title": "Model author URL",
"type": "string",
"format": "uri"
},
"sources": {
"title": "Training data sources",
"type": "array",
"items": {
"type": "string"
}
},
"vectors": {
"title": "Included word vectors",
"type": "object",
"properties": {
"keys": {
"title": "Number of unique keys",
"type": "integer",
"minimum": 0
},
"vectors": {
"title": "Number of unique vectors",
"type": "integer",
"minimum": 0
},
"width": {
"title": "Number of dimensions",
"type": "integer",
"minimum": 0
}
}
},
"accuracy": {
"title": "Accuracy numbers",
"type": "object",
"patternProperties": {
"*": {
"type": "number",
"minimum": 0.0
}
}
},
"speed": {
"title": "Speed evaluation numbers",
"type": "object",
"patternProperties": {
"*": {
"oneOf": [
{
"type": "number",
"minimum": 0.0
},
{
"type": "integer",
"minimum": 0
}
]
}
}
}
},
"required": [
"lang",
"name",
"version"
]
}
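In practice, a meta.json shaped like the sketch below satisfies this schema (all values are placeholders); note the two-letter lowercase `lang`, the lowercase/underscore `name` and the restricted character sets for the version fields:

```python
meta = {
    "lang": "en",                     # ^[a-z]*$, exactly two characters
    "name": "core_web_sm",            # ^[a-z_]*$
    "version": "2.1.0",               # ^[0-9a-z.-]*$
    "spacy_version": ">=2.0.18",      # ^[0-9a-z.-><=]*$ (no comma allowed)
    "pipeline": ["tagger", "parser", "ner"],
    "vectors": {"keys": 0, "vectors": 0, "width": 0},
}
```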


@ -0,0 +1,146 @@
{
"$schema": "http://json-schema.org/draft-06/schema",
"title": "Training data for spaCy models",
"type": "array",
"items": {
"type": "object",
"properties": {
"text": {
"title": "The text of the training example",
"type": "string",
"minLength": 1
},
"ents": {
"title": "Named entity spans in the text",
"type": "array",
"items": {
"type": "object",
"properties": {
"start": {
"title": "Start character offset of the span",
"type": "integer",
"minimum": 0
},
"end": {
"title": "End character offset of the span",
"type": "integer",
"minimum": 0
},
"label": {
"title": "Entity label",
"type": "string",
"minLength": 1,
"pattern": "^[A-Z0-9]*$"
}
},
"required": [
"start",
"end",
"label"
]
}
},
"sents": {
"title": "Sentence spans in the text",
"type": "array",
"items": {
"type": "object",
"properties": {
"start": {
"title": "Start character offset of the span",
"type": "integer",
"minimum": 0
},
"end": {
"title": "End character offset of the span",
"type": "integer",
"minimum": 0
}
},
"required": [
"start",
"end"
]
}
},
"cats": {
"title": "Text categories for the text classifier",
"type": "object",
"patternProperties": {
"*": {
"title": "A text category",
"oneOf": [
{
"type": "boolean"
},
{
"type": "number",
"minimum": 0
}
]
}
},
"propertyNames": {
"pattern": "^[A-Z0-9]*$",
"minLength": 1
}
},
"tokens": {
"title": "The tokens in the text",
"type": "array",
"items": {
"type": "object",
"minProperties": 1,
"properties": {
"id": {
"title": "Token ID, usually token index",
"type": "integer",
"minimum": 0
},
"start": {
"title": "Start character offset of the token",
"type": "integer",
"minimum": 0
},
"end": {
"title": "End character offset of the token",
"type": "integer",
"minimum": 0
},
"pos": {
"title": "Coarse-grained part-of-speech tag",
"type": "string",
"minLength": 1
},
"tag": {
"title": "Fine-grained part-of-speech tag",
"type": "string",
"minLength": 1
},
"dep": {
"title": "Dependency label",
"type": "string",
"minLength": 1
},
"head": {
"title": "Index of the token's head",
"type": "integer",
"minimum": 0
}
},
"required": [
"start",
"end"
]
}
},
"_": {
"title": "Custom user space",
"type": "object"
}
},
"required": [
"text"
]
}
}
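A tiny, made-up document in this format; the entity offsets refer to the example text, and the commented validation call assumes the helpers from `spacy.cli.schemas` shown earlier:

```python
TRAIN_DATA = [
    {
        "text": "Apple is looking at buying U.K. startup",
        "ents": [
            {"start": 0, "end": 5, "label": "ORG"},
            {"start": 27, "end": 31, "label": "GPE"},
        ],
        "cats": {"BUSINESS": 1.0},
        "_": {"source": "example"},  # custom user space
    }
]

# errors = validate_json(TRAIN_DATA, get_schema("training"))  # -> []
```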


@ -6,102 +6,275 @@ from pathlib import Path
import tqdm import tqdm
from thinc.neural._classes.model import Model from thinc.neural._classes.model import Model
from timeit import default_timer as timer from timeit import default_timer as timer
import json
import shutil import shutil
from wasabi import Printer
from ._messages import Messages from ._messages import Messages
from .._ml import create_default_optimizer
from ..attrs import PROB, IS_OOV, CLUSTER, LANG from ..attrs import PROB, IS_OOV, CLUSTER, LANG
from ..gold import GoldCorpus from ..gold import GoldCorpus
from ..util import prints, minibatch, minibatch_by_words
from .. import util from .. import util
from .. import about from .. import about
from .. import displacy
from ..compat import json_dumps
# Take dropout and batch size as generators of values -- dropout
# starts high and decays sharply, to force the optimizer to explore.
# Batch size starts at 1 and grows, so that we make updates quickly
# at the beginning of training.
dropout_rates = util.decaying(
util.env_opt("dropout_from", 0.2),
util.env_opt("dropout_to", 0.2),
util.env_opt("dropout_decay", 0.0),
)
batch_sizes = util.compounding(
util.env_opt("batch_from", 1000),
util.env_opt("batch_to", 1000),
util.env_opt("batch_compound", 1.001),
)
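For illustration, `util.decaying` and `util.compounding` are plain generators, so the schedules described in the comments can be inspected directly (the numbers below are illustrative, not the defaults used here):

```python
from spacy import util

dropout = util.decaying(0.2, 0.2, 0.0)            # constant 0.2 with these settings
batch_sizes = util.compounding(1.0, 32.0, 1.001)  # grows by 0.1% per step, capped at 32

print([next(dropout) for _ in range(3)])                # [0.2, 0.2, 0.2]
print([round(next(batch_sizes), 3) for _ in range(3)])  # [1.0, 1.001, 1.002]
```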
@plac.annotations( @plac.annotations(
lang=("model language", "positional", None, str), lang=("Model language", "positional", None, str),
output_dir=("output directory to store model in", "positional", None, str), output_path=("Output directory to store model in", "positional", None, Path),
train_data=("location of JSON-formatted training data", "positional", train_path=("Location of JSON-formatted training data", "positional", None, Path),
None, str), dev_path=("Location of JSON-formatted development data", "positional", None, Path),
dev_data=("location of JSON-formatted development data (optional)", base_model=("Name of model to update (optional)", "option", "b", str),
"positional", None, str), pipeline=("Comma-separated names of pipeline components", "option", "p", str),
n_iter=("number of iterations", "option", "n", int), vectors=("Model to load vectors from", "option", "v", str),
n_sents=("number of sentences", "option", "ns", int), n_iter=("Number of iterations", "option", "n", int),
n_examples=("Number of examples", "option", "ns", int),
use_gpu=("Use GPU", "option", "g", int), use_gpu=("Use GPU", "option", "g", int),
vectors=("Model to load vectors from", "option", "v"),
no_tagger=("Don't train tagger", "flag", "T", bool),
no_parser=("Don't train parser", "flag", "P", bool),
no_entities=("Don't train NER", "flag", "N", bool),
parser_multitasks=("Side objectives for parser CNN, e.g. dep dep,tag", "option", "pt", str),
noise_level=("Amount of corruption to add for data augmentation", "option", "nl", float),
entity_multitasks=("Side objectives for ner CNN, e.g. dep dep,tag", "option", "et", str),
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
version=("Model version", "option", "V", str), version=("Model version", "option", "V", str),
meta_path=("Optional path to meta.json. All relevant properties will be " meta_path=("Optional path to meta.json to use as base.", "option", "m", Path),
"overwritten.", "option", "m", Path), init_tok2vec=(
init_tok2vec=("Path to pretrained weights for the token-to-vector parts " "Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.",
"of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path), "option",
verbose=("Display more information for debug", "option", None, bool)) "t2v",
def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0, Path,
parser_multitasks='', entity_multitasks='', init_tok2vec=None, ),
use_gpu=-1, vectors=None, no_tagger=False, noise_level=0.0, parser_multitasks=(
no_parser=False, no_entities=False, gold_preproc=False, "Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'",
version="0.0.0", meta_path=None, verbose=False): "option",
"pt",
str,
),
entity_multitasks=(
"Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'",
"option",
"et",
str,
),
noise_level=("Amount of corruption for data augmentation", "option", "nl", float),
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
learn_tokens=("Make parser learn gold-standard tokenization", "flag", "T", bool),
verbose=("Display more information for debug", "flag", "VV", bool),
debug=("Run data diagnostics before training", "flag", "D", bool),
)
def train(
lang,
output_path,
train_path,
dev_path,
base_model=None,
pipeline="tagger,parser,ner",
vectors=None,
n_iter=30,
n_examples=0,
use_gpu=-1,
version="0.0.0",
meta_path=None,
init_tok2vec=None,
parser_multitasks="",
entity_multitasks="",
noise_level=0.0,
gold_preproc=False,
learn_tokens=False,
verbose=False,
debug=False,
):
""" """
Train a model. Expects data in spaCy's JSON format. Train or update a spaCy model. Requires data to be formatted in spaCy's
JSON format. To convert data from other formats, use the `spacy convert`
command.
""" """
msg = Printer()
util.fix_random_seed() util.fix_random_seed()
util.set_env_log(True) util.set_env_log(verbose)
n_sents = n_sents or None
output_path = util.ensure_path(output_dir) # Make sure all files and paths exists if they are needed
train_path = util.ensure_path(train_data) train_path = util.ensure_path(train_path)
dev_path = util.ensure_path(dev_data) dev_path = util.ensure_path(dev_path)
meta_path = util.ensure_path(meta_path) meta_path = util.ensure_path(meta_path)
if not train_path.exists(): if not train_path or not train_path.exists():
prints(train_path, title=Messages.M050, exits=1) msg.fail(Messages.M050, train_path, exits=1)
if dev_path and not dev_path.exists(): if not dev_path or not dev_path.exists():
prints(dev_path, title=Messages.M051, exits=1) msg.fail(Messages.M051, dev_path, exits=1)
if meta_path is not None and not meta_path.exists(): if meta_path is not None and not meta_path.exists():
prints(meta_path, title=Messages.M020, exits=1) msg.fail(Messages.M020, meta_path, exits=1)
meta = util.read_json(meta_path) if meta_path else {} meta = util.read_json(meta_path) if meta_path else {}
if not isinstance(meta, dict): if not isinstance(meta, dict):
prints(Messages.M053.format(meta_type=type(meta)), msg.fail(Messages.M052, Messages.M053.format(meta_type=type(meta)), exits=1)
title=Messages.M052, exits=1) if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
meta.setdefault('lang', lang) msg.fail(Messages.M062, Messages.M065)
meta.setdefault('name', 'unnamed')
if not output_path.exists(): if not output_path.exists():
output_path.mkdir() output_path.mkdir()
print("Counting training words (limit=%s" % n_sents) # Set up the base model and pipeline. If a base model is specified, load
corpus = GoldCorpus(train_path, dev_path, limit=n_sents) # the model and make sure the pipeline matches the pipeline setting. If
n_train_words = corpus.count_train() # training starts from a blank model, initialize the language class.
print(n_train_words) pipeline = [p.strip() for p in pipeline.split(",")]
pipeline = ['tagger', 'parser', 'ner'] msg.text(Messages.M055.format(pipeline=pipeline))
if no_tagger and 'tagger' in pipeline: if base_model:
pipeline.remove('tagger') msg.text(Messages.M056.format(model=base_model))
if no_parser and 'parser' in pipeline: nlp = util.load_model(base_model)
pipeline.remove('parser') if nlp.lang != lang:
if no_entities and 'ner' in pipeline: msg.fail(Messages.M072.format(model_lang=nlp.lang, lang=lang), exits=1)
pipeline.remove('ner') other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipeline]
nlp.disable_pipes(*other_pipes)
for pipe in pipeline:
if pipe not in nlp.pipe_names:
nlp.add_pipe(nlp.create_pipe(pipe))
else:
msg.text(Messages.M057.format(model=lang))
lang_cls = util.get_lang_class(lang)
nlp = lang_cls()
for pipe in pipeline:
nlp.add_pipe(nlp.create_pipe(pipe))
if learn_tokens:
nlp.add_pipe(nlp.create_pipe("merge_subtokens"))
# Take dropout and batch size as generators of values -- dropout # Take dropout and batch size as generators of values -- dropout
# starts high and decays sharply, to force the optimizer to explore. # starts high and decays sharply, to force the optimizer to explore.
# Batch size starts at 1 and grows, so that we make updates quickly # Batch size starts at 1 and grows, so that we make updates quickly
# at the beginning of training. # at the beginning of training.
dropout_rates = util.decaying(util.env_opt('dropout_from', 0.1), dropout_rates = util.decaying(
util.env_opt('dropout_to', 0.1), util.env_opt("dropout_from", 0.1),
util.env_opt('dropout_decay', 0.0)) util.env_opt("dropout_to", 0.1),
batch_sizes = util.compounding(util.env_opt('batch_from', 750), util.env_opt("dropout_decay", 0.0),
util.env_opt('batch_to', 750), )
util.env_opt('batch_compound', 1.001)) batch_sizes = util.compounding(
util.env_opt("batch_from", 750),
util.env_opt("batch_to", 750),
util.env_opt("batch_compound", 1.001),
)
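# A quick standalone sketch of how the two schedules above behave, using the
# spaCy v2.x helpers directly; the 1.0/32.0 and 0.35/0.1 values below are
# illustrative and not taken from this diff.
from spacy.util import decaying, compounding

constant_dropout = decaying(0.1, 0.1, 0.0)       # defaults above: stays at 0.1
constant_batch = compounding(750, 750, 1.001)    # defaults above: stays at 750
growing_batch = compounding(1.0, 32.0, 1.001)    # compounds from 1 towards 32
fading_dropout = decaying(0.35, 0.1, 1e-4)       # decays from 0.35 towards 0.1
print(next(constant_dropout), next(constant_batch), next(growing_batch))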
lang_class = util.get_lang_class(lang) lang_class = util.get_lang_class(lang)
nlp = lang_class() nlp = lang_class()
meta['pipeline'] = pipeline meta["pipeline"] = pipeline
nlp.meta.update(meta) nlp.meta.update(meta)
if vectors: if vectors:
print("Load vectors model", vectors) msg.text(Messages.M058.format(model=vectors))
_load_vectors(nlp, vectors)
# Multitask objectives
multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)]
for pipe_name, multitasks in multitask_options:
if multitasks:
if pipe_name not in pipeline:
msg.fail(Messages.M059.format(pipe=pipe_name))
pipe = nlp.get_pipe(pipe_name)
for objective in multitasks.split(","):
pipe.add_multitask_objective(objective)
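# Illustrative, self-contained expansion of the loop above: with
# parser_multitasks="dep,tag" (the example given in the option's help text),
# the calls amount to the following, assuming spaCy v2.x.
import spacy

example_nlp = spacy.blank("en")
example_nlp.add_pipe(example_nlp.create_pipe("parser"))
example_parser = example_nlp.get_pipe("parser")
example_parser.add_multitask_objective("dep")   # objectives are stored here and
example_parser.add_multitask_objective("tag")   # built later, at begin_training()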
# Prepare training corpus
msg.text(Messages.M060.format(limit=n_examples))
corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
n_train_words = corpus.count_train()
if base_model:
# Start with an existing model, use default optimizer
optimizer = create_default_optimizer(Model.ops)
else:
# Start with a blank model, call begin_training
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
nlp._optimizer = None
# Load in pre-trained weights
if init_tok2vec is not None:
components = _load_pretrained_tok2vec(nlp, init_tok2vec)
msg.text(Messages.M071.format(components=components))
print(
"\nItn. Dep Loss NER Loss UAS NER P. NER R. NER F. Tag % Token % CPU WPS GPU WPS"
)
try:
for i in range(n_iter):
train_docs = corpus.train_docs(
nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0
)
words_seen = 0
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
losses = {}
for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
if not batch:
continue
docs, golds = zip(*batch)
nlp.update(
docs,
golds,
sgd=optimizer,
drop=next(dropout_rates),
losses=losses,
)
pbar.update(sum(len(doc) for doc in docs))
words_seen += sum(len(doc) for doc in docs)
with nlp.use_params(optimizer.averages):
util.set_env_log(False)
epoch_model_path = output_path / ("model%d" % i)
nlp.to_disk(epoch_model_path)
nlp_loaded = util.load_model_from_path(epoch_model_path)
dev_docs = list(corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc))
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
start_time = timer()
scorer = nlp_loaded.evaluate(dev_docs, debug)
end_time = timer()
if use_gpu < 0:
gpu_wps = None
cpu_wps = nwords / (end_time - start_time)
else:
gpu_wps = nwords / (end_time - start_time)
with Model.use_device("cpu"):
nlp_loaded = util.load_model_from_path(epoch_model_path)
dev_docs = list(
corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)
)
start_time = timer()
scorer = nlp_loaded.evaluate(dev_docs)
end_time = timer()
cpu_wps = nwords / (end_time - start_time)
acc_loc = output_path / ("model%d" % i) / "accuracy.json"
util.write_json(acc_loc, scorer.scores)
# Update model meta.json
meta["lang"] = nlp.lang
meta["pipeline"] = nlp.pipe_names
meta["spacy_version"] = ">=%s" % about.__version__
meta["accuracy"] = scorer.scores
meta["speed"] = {"nwords": nwords, "cpu": cpu_wps, "gpu": gpu_wps}
meta["vectors"] = {
"width": nlp.vocab.vectors_length,
"vectors": len(nlp.vocab.vectors),
"keys": nlp.vocab.vectors.n_keys,
}
meta.setdefault("name", "model%d" % i)
meta.setdefault("version", version)
meta_loc = output_path / ("model%d" % i) / "meta.json"
util.write_json(meta_loc, meta)
util.set_env_log(verbose)
print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps, gpu_wps=gpu_wps)
finally:
with msg.loading(Messages.M061):
with nlp.use_params(optimizer.averages):
final_model_path = output_path / "model-final"
nlp.to_disk(final_model_path)
msg.good(Messages.M066, util.path2str(final_model_path))
_collate_best_model(meta, output_path, nlp.pipe_names)
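# Rough usage sketch for the refactored command above, calling train() as a
# plain function (the commit notes path handling was fixed for this case).
# All paths and the iteration count are placeholders, not values from this
# diff; the JSON files must already contain data in spaCy's training format.
from pathlib import Path
from spacy.cli import train

train(
    lang="en",
    output_path=Path("/tmp/ud-model"),    # created if it does not exist
    train_path=Path("/tmp/train.json"),
    dev_path=Path("/tmp/dev.json"),
    pipeline="tagger,parser,ner",         # comma-separated component names
    n_iter=10,
)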
def _load_vectors(nlp, vectors):
util.load_model(vectors, vocab=nlp.vocab) util.load_model(vectors, vocab=nlp.vocab)
for lex in nlp.vocab: for lex in nlp.vocab:
values = {} values = {}
@ -112,107 +285,17 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
values[lex.vocab.strings[attr]] = func(lex.orth_) values[lex.vocab.strings[attr]] = func(lex.orth_)
lex.set_attrs(**values) lex.set_attrs(**values)
lex.is_oov = False lex.is_oov = False
for name in pipeline:
nlp.add_pipe(nlp.create_pipe(name), name=name)
nlp.add_pipe(nlp.create_pipe('merge_subtokens'))
if parser_multitasks:
for objective in parser_multitasks.split(','):
nlp.parser.add_multitask_objective(objective)
if entity_multitasks:
for objective in entity_multitasks.split(','):
nlp.entity.add_multitask_objective(objective)
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
if init_tok2vec is not None:
loaded = _load_pretrained_tok2vec(nlp, init_tok2vec)
print("Loaded pretrained tok2vec for:", loaded)
nlp._optimizer = None
print("Itn. Dep Loss NER Loss UAS NER P. NER R. NER F. Tag % Token % CPU WPS GPU WPS")
try:
for i in range(n_iter):
train_docs = corpus.train_docs(nlp, noise_level=noise_level,
gold_preproc=gold_preproc, max_length=0)
words_seen = 0
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
losses = {}
for batch in minibatch_by_words(train_docs, size=batch_sizes):
if not batch:
continue
docs, golds = zip(*batch)
nlp.update(docs, golds, sgd=optimizer,
drop=next(dropout_rates), losses=losses)
pbar.update(sum(len(doc) for doc in docs))
words_seen += sum(len(doc) for doc in docs)
with nlp.use_params(optimizer.averages):
util.set_env_log(False)
epoch_model_path = output_path / ('model%d' % i)
nlp.to_disk(epoch_model_path)
nlp_loaded = util.load_model_from_path(epoch_model_path)
dev_docs = list(corpus.dev_docs(
nlp_loaded,
gold_preproc=gold_preproc))
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
start_time = timer()
scorer = nlp_loaded.evaluate(dev_docs, verbose)
end_time = timer()
if use_gpu < 0:
gpu_wps = None
cpu_wps = nwords/(end_time-start_time)
else:
gpu_wps = nwords/(end_time-start_time)
with Model.use_device('cpu'):
nlp_loaded = util.load_model_from_path(epoch_model_path)
dev_docs = list(corpus.dev_docs(
nlp_loaded, gold_preproc=gold_preproc))
start_time = timer()
scorer = nlp_loaded.evaluate(dev_docs)
end_time = timer()
cpu_wps = nwords/(end_time-start_time)
acc_loc = (output_path / ('model%d' % i) / 'accuracy.json')
with acc_loc.open('w') as file_:
file_.write(json_dumps(scorer.scores))
meta_loc = output_path / ('model%d' % i) / 'meta.json'
meta['accuracy'] = scorer.scores
meta['speed'] = {'nwords': nwords, 'cpu': cpu_wps,
'gpu': gpu_wps}
meta['vectors'] = {'width': nlp.vocab.vectors_length,
'vectors': len(nlp.vocab.vectors),
'keys': nlp.vocab.vectors.n_keys}
meta['lang'] = nlp.lang
meta['pipeline'] = pipeline
meta['spacy_version'] = '>=%s' % about.__version__
meta.setdefault('name', 'model%d' % i)
meta.setdefault('version', version)
with meta_loc.open('w') as file_:
file_.write(json_dumps(meta))
util.set_env_log(True)
print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps,
gpu_wps=gpu_wps)
finally:
print("Saving model...")
with nlp.use_params(optimizer.averages):
final_model_path = output_path / 'model-final'
nlp.to_disk(final_model_path)
components = []
if not no_parser:
components.append('parser')
if not no_tagger:
components.append('tagger')
if not no_entities:
components.append('ner')
_collate_best_model(meta, output_path, components)
def _load_pretrained_tok2vec(nlp, loc): def _load_pretrained_tok2vec(nlp, loc):
"""Load pre-trained weights for the 'token-to-vector' part of the component """Load pre-trained weights for the 'token-to-vector' part of the component
models, which is typically a CNN. See 'spacy pretrain'. Experimental. models, which is typically a CNN. See 'spacy pretrain'. Experimental.
""" """
with loc.open('rb') as file_: with loc.open("rb") as file_:
weights_data = file_.read() weights_data = file_.read()
loaded = [] loaded = []
for name, component in nlp.pipeline: for name, component in nlp.pipeline:
if hasattr(component, 'model') and hasattr(component.model, 'tok2vec'): if hasattr(component, "model") and hasattr(component.model, "tok2vec"):
component.tok2vec.from_bytes(weights_data) component.tok2vec.from_bytes(weights_data)
loaded.append(name) loaded.append(name)
return loaded return loaded
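# Hypothetical use of the helper above: the weights file would come from
# `spacy pretrain`, and the pipeline needs initialised models (via
# begin_training) so each component exposes a tok2vec submodel. The .bin path
# is a placeholder.
from pathlib import Path
import spacy

pretrain_nlp = spacy.blank("en")
pretrain_nlp.add_pipe(pretrain_nlp.create_pipe("tagger"))
pretrain_nlp.begin_training()
loaded_names = _load_pretrained_tok2vec(pretrain_nlp, Path("/tmp/pretrain/model99.bin"))
print("tok2vec weights loaded for:", loaded_names)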
@ -222,24 +305,22 @@ def _collate_best_model(meta, output_path, components):
bests = {} bests = {}
for component in components: for component in components:
bests[component] = _find_best(output_path, component) bests[component] = _find_best(output_path, component)
best_dest = output_path / 'model-best' best_dest = output_path / "model-best"
shutil.copytree(output_path / 'model-final', best_dest) shutil.copytree(output_path / "model-final", best_dest)
for component, best_component_src in bests.items(): for component, best_component_src in bests.items():
shutil.rmtree(best_dest / component) shutil.rmtree(best_dest / component)
shutil.copytree(best_component_src / component, best_dest / component) shutil.copytree(best_component_src / component, best_dest / component)
with (best_component_src / 'accuracy.json').open() as file_: accs = util.read_json(best_component_src / "accuracy.json")
accs = json.load(file_)
for metric in _get_metrics(component): for metric in _get_metrics(component):
meta['accuracy'][metric] = accs[metric] meta["accuracy"][metric] = accs[metric]
with (best_dest / 'meta.json').open('w') as file_: util.write_json(best_dest / "meta.json", meta)
file_.write(json_dumps(meta))
def _find_best(experiment_dir, component): def _find_best(experiment_dir, component):
accuracies = [] accuracies = []
for epoch_model in experiment_dir.iterdir(): for epoch_model in experiment_dir.iterdir():
if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final": if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final":
accs = json.load((epoch_model / "accuracy.json").open()) accs = util.read_json(epoch_model / "accuracy.json")
scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)] scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)]
accuracies.append((scores, epoch_model)) accuracies.append((scores, epoch_model))
if accuracies: if accuracies:
@ -247,6 +328,7 @@ def _find_best(experiment_dir, component):
else: else:
return None return None
def _get_metrics(component): def _get_metrics(component):
if component == "parser": if component == "parser":
return ("las", "uas", "token_acc") return ("las", "uas", "token_acc")
@ -257,50 +339,40 @@ def _get_metrics(component):
return ("token_acc",) return ("token_acc",)
def _render_parses(i, to_render):
to_render[0].user_data['title'] = "Batch %d" % i
with Path('/tmp/entities.html').open('w') as file_:
html = displacy.render(to_render[:5], style='ent', page=True)
file_.write(html)
with Path('/tmp/parses.html').open('w') as file_:
html = displacy.render(to_render[:5], style='dep', page=True)
file_.write(html)
def print_progress(itn, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0): def print_progress(itn, losses, dev_scores, cpu_wps=0.0, gpu_wps=0.0):
scores = {} scores = {}
for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc', for col in [
'ents_p', 'ents_r', 'ents_f', 'cpu_wps', 'gpu_wps']: "dep_loss",
"tag_loss",
"uas",
"tags_acc",
"token_acc",
"ents_p",
"ents_r",
"ents_f",
"cpu_wps",
"gpu_wps",
]:
scores[col] = 0.0 scores[col] = 0.0
scores['dep_loss'] = losses.get('parser', 0.0) scores["dep_loss"] = losses.get("parser", 0.0)
scores['ner_loss'] = losses.get('ner', 0.0) scores["ner_loss"] = losses.get("ner", 0.0)
scores['tag_loss'] = losses.get('tagger', 0.0) scores["tag_loss"] = losses.get("tagger", 0.0)
scores.update(dev_scores) scores.update(dev_scores)
scores['cpu_wps'] = cpu_wps scores["cpu_wps"] = cpu_wps
scores['gpu_wps'] = gpu_wps or 0.0 scores["gpu_wps"] = gpu_wps or 0.0
tpl = ''.join(( tpl = "".join(
'{:<6d}', (
'{dep_loss:<10.3f}', "{:<6d}",
'{ner_loss:<10.3f}', "{dep_loss:<10.3f}",
'{uas:<8.3f}', "{ner_loss:<10.3f}",
'{ents_p:<8.3f}', "{uas:<8.3f}",
'{ents_r:<8.3f}', "{ents_p:<8.3f}",
'{ents_f:<8.3f}', "{ents_r:<8.3f}",
'{tags_acc:<8.3f}', "{ents_f:<8.3f}",
'{token_acc:<9.3f}', "{tags_acc:<8.3f}",
'{cpu_wps:<9.1f}', "{token_acc:<9.3f}",
'{gpu_wps:.1f}', "{cpu_wps:<9.1f}",
)) "{gpu_wps:.1f}",
)
)
print(tpl.format(itn, **scores)) print(tpl.format(itn, **scores))
def print_results(scorer):
results = {
'TOK': '%.2f' % scorer.token_acc,
'POS': '%.2f' % scorer.tags_acc,
'UAS': '%.2f' % scorer.uas,
'LAS': '%.2f' % scorer.las,
'NER P': '%.2f' % scorer.ents_p,
'NER R': '%.2f' % scorer.ents_r,
'NER F': '%.2f' % scorer.ents_f}
util.print_table(results, title="Results")
spacy/cli/ud/__init__.py Normal file
View File
@ -0,0 +1,2 @@
from .conll17_ud_eval import main as ud_evaluate # noqa: F401
from .ud_train import main as ud_train # noqa: F401
View File
@ -1,4 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
# flake8: noqa
# CoNLL 2017 UD Parsing evaluation script. # CoNLL 2017 UD Parsing evaluation script.
# #
View File
@ -1,7 +1,9 @@
'''Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes # flake8: noqa
"""Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
.conllu format for development data, allowing the official scorer to be used. .conllu format for development data, allowing the official scorer to be used.
''' """
from __future__ import unicode_literals from __future__ import unicode_literals
import plac import plac
import tqdm import tqdm
from pathlib import Path from pathlib import Path
@ -11,15 +13,17 @@ import json
import spacy import spacy
import spacy.util import spacy.util
from ..tokens import Token, Doc from ...tokens import Token, Doc
from ..gold import GoldParse from ...gold import GoldParse
from ..util import compounding, minibatch_by_words from ...util import compounding, minibatch_by_words
from ..syntax.nonproj import projectivize from ...syntax.nonproj import projectivize
from ..matcher import Matcher from ...matcher import Matcher
#from ..morphology import Fused_begin, Fused_inside
from .. import displacy # from ...morphology import Fused_begin, Fused_inside
from ... import displacy
from collections import defaultdict, Counter from collections import defaultdict, Counter
from timeit import default_timer as timer from timeit import default_timer as timer
Fused_begin = None Fused_begin = None
Fused_inside = None Fused_inside = None
@ -30,43 +34,45 @@ import cytoolz
from . import conll17_ud_eval from . import conll17_ud_eval
from .. import lang from ... import lang
from .. import lang from ...lang import zh
from ..lang import zh from ...lang import ja
from ..lang import ja from ...lang import ru
from ..lang import ru
################ ################
# Data reading # # Data reading #
################ ################
space_re = re.compile('\s+') space_re = re.compile("\s+")
def split_text(text): def split_text(text):
return [space_re.sub(' ', par.strip()) for par in text.split('\n\n')] return [space_re.sub(" ", par.strip()) for par in text.split("\n\n")]
############## ##############
# Evaluation # # Evaluation #
############## ##############
def read_conllu(file_): def read_conllu(file_):
docs = [] docs = []
sent = [] sent = []
doc = [] doc = []
for line in file_: for line in file_:
if line.startswith('# newdoc'): if line.startswith("# newdoc"):
if doc: if doc:
docs.append(doc) docs.append(doc)
doc = [] doc = []
elif line.startswith('#'): elif line.startswith("#"):
continue continue
elif not line.strip(): elif not line.strip():
if sent: if sent:
doc.append(sent) doc.append(sent)
sent = [] sent = []
else: else:
sent.append(list(line.strip().split('\t'))) sent.append(list(line.strip().split("\t")))
if len(sent[-1]) != 10: if len(sent[-1]) != 10:
print(repr(line)) print(repr(line))
raise ValueError raise ValueError
@ -78,7 +84,7 @@ def read_conllu(file_):
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None): def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
if text_loc.parts[-1].endswith('.conllu'): if text_loc.parts[-1].endswith(".conllu"):
docs = [] docs = []
with text_loc.open() as file_: with text_loc.open() as file_:
for conllu_doc in read_conllu(file_): for conllu_doc in read_conllu(file_):
@ -88,14 +94,14 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
for name, component in nlp.pipeline: for name, component in nlp.pipeline:
docs = list(component.pipe(docs)) docs = list(component.pipe(docs))
else: else:
with text_loc.open('r', encoding='utf8') as text_file: with text_loc.open("r", encoding="utf8") as text_file:
texts = split_text(text_file.read()) texts = split_text(text_file.read())
docs = list(nlp.pipe(texts)) docs = list(nlp.pipe(texts))
with sys_loc.open('w', encoding='utf8') as out_file: with sys_loc.open("w", encoding="utf8") as out_file:
write_conllu(docs, out_file) write_conllu(docs, out_file)
with gold_loc.open('r', encoding='utf8') as gold_file: with gold_loc.open("r", encoding="utf8") as gold_file:
gold_ud = conll17_ud_eval.load_conllu(gold_file) gold_ud = conll17_ud_eval.load_conllu(gold_file)
with sys_loc.open('r', encoding='utf8') as sys_file: with sys_loc.open("r", encoding="utf8") as sys_file:
sys_ud = conll17_ud_eval.load_conllu(sys_file) sys_ud = conll17_ud_eval.load_conllu(sys_file)
scores = conll17_ud_eval.evaluate(gold_ud, sys_ud) scores = conll17_ud_eval.evaluate(gold_ud, sys_ud)
return docs, scores return docs, scores
@ -103,26 +109,26 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
def write_conllu(docs, file_): def write_conllu(docs, file_):
merger = Matcher(docs[0].vocab) merger = Matcher(docs[0].vocab)
merger.add('SUBTOK', None, [{'DEP': 'subtok', 'op': '+'}]) merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
for i, doc in enumerate(docs): for i, doc in enumerate(docs):
matches = merger(doc) matches = merger(doc)
spans = [doc[start:end+1] for _, start, end in matches] spans = [doc[start : end + 1] for _, start, end in matches]
offsets = [(span.start_char, span.end_char) for span in spans] offsets = [(span.start_char, span.end_char) for span in spans]
for start_char, end_char in offsets: for start_char, end_char in offsets:
doc.merge(start_char, end_char) doc.merge(start_char, end_char)
# TODO: This shouldn't be necessary? Should be handled in merge # TODO: This shouldn't be necessary? Should be handled in merge
for word in doc: for word in doc:
if word.i == word.head.i: if word.i == word.head.i:
word.dep_ = 'ROOT' word.dep_ = "ROOT"
file_.write("# newdoc id = {i}\n".format(i=i)) file_.write("# newdoc id = {i}\n".format(i=i))
for j, sent in enumerate(doc.sents): for j, sent in enumerate(doc.sents):
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j)) file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
file_.write("# text = {text}\n".format(text=sent.text)) file_.write("# text = {text}\n".format(text=sent.text))
for k, token in enumerate(sent): for k, token in enumerate(sent):
file_.write(_get_token_conllu(token, k, len(sent)) + '\n') file_.write(_get_token_conllu(token, k, len(sent)) + "\n")
file_.write('\n') file_.write("\n")
for word in sent: for word in sent:
if word.head.i == word.i and word.dep_ == 'ROOT': if word.head.i == word.i and word.dep_ == "ROOT":
break break
else: else:
print("Rootless sentence!") print("Rootless sentence!")
@ -134,24 +140,34 @@ def write_conllu(docs, file_):
def _get_token_conllu(token, k, sent_len): def _get_token_conllu(token, k, sent_len):
if token.check_morph(Fused_begin) and (k+1 < sent_len): if token.check_morph(Fused_begin) and (k + 1 < sent_len):
n = 1 n = 1
text = [token.text] text = [token.text]
while token.nbor(n).check_morph(Fused_inside): while token.nbor(n).check_morph(Fused_inside):
text.append(token.nbor(n).text) text.append(token.nbor(n).text)
n += 1 n += 1
id_ = '%d-%d' % (k+1, (k+n)) id_ = "%d-%d" % (k + 1, (k + n))
fields = [id_, ''.join(text)] + ['_'] * 8 fields = [id_, "".join(text)] + ["_"] * 8
lines = ['\t'.join(fields)] lines = ["\t".join(fields)]
else: else:
lines = [] lines = []
if token.head.i == token.i: if token.head.i == token.i:
head = 0 head = 0
else: else:
head = k + (token.head.i - token.i) + 1 head = k + (token.head.i - token.i) + 1
fields = [str(k+1), token.text, token.lemma_, token.pos_, token.tag_, '_', fields = [
str(head), token.dep_.lower(), '_', '_'] str(k + 1),
if token.check_morph(Fused_begin) and (k+1 < sent_len): token.text,
token.lemma_,
token.pos_,
token.tag_,
"_",
str(head),
token.dep_.lower(),
"_",
"_",
]
if token.check_morph(Fused_begin) and (k + 1 < sent_len):
if k == 0: if k == 0:
fields[1] = token.norm_[0].upper() + token.norm_[1:] fields[1] = token.norm_[0].upper() + token.norm_[1:]
else: else:
@ -163,18 +179,18 @@ def _get_token_conllu(token, k, sent_len):
split_end = token._.split_end split_end = token._.split_end
split_len = (split_end.i - split_start.i) + 1 split_len = (split_end.i - split_start.i) + 1
n_in_split = token.i - split_start.i n_in_split = token.i - split_start.i
subtokens = guess_fused_orths(split_start.text, [''] * split_len) subtokens = guess_fused_orths(split_start.text, [""] * split_len)
fields[1] = subtokens[n_in_split] fields[1] = subtokens[n_in_split]
lines.append('\t'.join(fields)) lines.append("\t".join(fields))
return '\n'.join(lines) return "\n".join(lines)
def guess_fused_orths(word, ud_forms): def guess_fused_orths(word, ud_forms):
'''The UD data 'fused tokens' don't necessarily expand to keys that match """The UD data 'fused tokens' don't necessarily expand to keys that match
the form. We need orths that exactly match the string. Here we make a best the form. We need orths that exactly match the string. Here we make a best
effort to divide up the word.''' effort to divide up the word."""
if word == ''.join(ud_forms): if word == "".join(ud_forms):
# Happy case: we get a perfect split, with each letter accounted for. # Happy case: we get a perfect split, with each letter accounted for.
return ud_forms return ud_forms
elif len(word) == sum(len(subtoken) for subtoken in ud_forms): elif len(word) == sum(len(subtoken) for subtoken in ud_forms):
@ -183,16 +199,16 @@ def guess_fused_orths(word, ud_forms):
remain = word remain = word
for subtoken in ud_forms: for subtoken in ud_forms:
assert len(subtoken) >= 1 assert len(subtoken) >= 1
output.append(remain[:len(subtoken)]) output.append(remain[: len(subtoken)])
remain = remain[len(subtoken):] remain = remain[len(subtoken) :]
assert len(remain) == 0, (word, ud_forms, remain) assert len(remain) == 0, (word, ud_forms, remain)
return output return output
else: else:
# Let's say word is 6 long, and there are three subtokens. The orths # Let's say word is 6 long, and there are three subtokens. The orths
# *must* equal the original string. Arbitrarily, split [4, 1, 1] # *must* equal the original string. Arbitrarily, split [4, 1, 1]
first = word[:len(word)-(len(ud_forms)-1)] first = word[: len(word) - (len(ud_forms) - 1)]
output = [first] output = [first]
remain = word[len(first):] remain = word[len(first) :]
for i in range(1, len(ud_forms)): for i in range(1, len(ud_forms)):
assert remain assert remain
output.append(remain[:1]) output.append(remain[:1])
@ -201,59 +217,49 @@ def guess_fused_orths(word, ud_forms):
return output return output
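# Two quick, made-up checks of the heuristic above:
print(guess_fused_orths("ab", ["a", "b"]))      # exact split -> ['a', 'b']
print(guess_fused_orths("zum", ["zu", "dem"]))  # lengths differ -> ['zu', 'm']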
def print_results(name, ud_scores): def print_results(name, ud_scores):
fields = {} fields = {}
if ud_scores is not None: if ud_scores is not None:
fields.update({ fields.update(
'words': ud_scores['Words'].f1 * 100, {
'sents': ud_scores['Sentences'].f1 * 100, "words": ud_scores["Words"].f1 * 100,
'tags': ud_scores['XPOS'].f1 * 100, "sents": ud_scores["Sentences"].f1 * 100,
'uas': ud_scores['UAS'].f1 * 100, "tags": ud_scores["XPOS"].f1 * 100,
'las': ud_scores['LAS'].f1 * 100, "uas": ud_scores["UAS"].f1 * 100,
}) "las": ud_scores["LAS"].f1 * 100,
}
)
else: else:
fields.update({ fields.update({"words": 0.0, "sents": 0.0, "tags": 0.0, "uas": 0.0, "las": 0.0})
'words': 0.0, tpl = "\t".join(
'sents': 0.0, (name, "{las:.1f}", "{uas:.1f}", "{tags:.1f}", "{sents:.1f}", "{words:.1f}")
'tags': 0.0, )
'uas': 0.0,
'las': 0.0
})
tpl = '\t'.join((
name,
'{las:.1f}',
'{uas:.1f}',
'{tags:.1f}',
'{sents:.1f}',
'{words:.1f}',
))
print(tpl.format(**fields)) print(tpl.format(**fields))
return fields return fields
def get_token_split_start(token): def get_token_split_start(token):
if token.text == '': if token.text == "":
assert token.i != 0 assert token.i != 0
i = -1 i = -1
while token.nbor(i).text == '': while token.nbor(i).text == "":
i -= 1 i -= 1
return token.nbor(i) return token.nbor(i)
elif (token.i+1) < len(token.doc) and token.nbor(1).text == '': elif (token.i + 1) < len(token.doc) and token.nbor(1).text == "":
return token return token
else: else:
return None return None
def get_token_split_end(token): def get_token_split_end(token):
if (token.i+1) == len(token.doc): if (token.i + 1) == len(token.doc):
return token if token.text == '' else None return token if token.text == "" else None
elif token.text != '' and token.nbor(1).text != '': elif token.text != "" and token.nbor(1).text != "":
return None return None
i = 1 i = 1
while (token.i+i) < len(token.doc) and token.nbor(i).text == '': while (token.i + i) < len(token.doc) and token.nbor(i).text == "":
i += 1 i += 1
return token.nbor(i-1) return token.nbor(i - 1)
################## ##################
@ -262,54 +268,73 @@ def get_token_split_end(token):
def load_nlp(experiments_dir, corpus): def load_nlp(experiments_dir, corpus):
nlp = spacy.load(experiments_dir / corpus / 'best-model') nlp = spacy.load(experiments_dir / corpus / "best-model")
return nlp return nlp
def initialize_pipeline(nlp, docs, golds, config, device): def initialize_pipeline(nlp, docs, golds, config, device):
nlp.add_pipe(nlp.create_pipe('parser')) nlp.add_pipe(nlp.create_pipe("parser"))
return nlp return nlp
@plac.annotations( @plac.annotations(
test_data_dir=("Path to Universal Dependencies test data", "positional", None, Path), test_data_dir=(
"Path to Universal Dependencies test data",
"positional",
None,
Path,
),
experiment_dir=("Parent directory with output model", "positional", None, Path), experiment_dir=("Parent directory with output model", "positional", None, Path),
corpus=("UD corpus to evaluate, e.g. UD_English, UD_Spanish, etc", "positional", None, str), corpus=(
"UD corpus to evaluate, e.g. UD_English, UD_Spanish, etc",
"positional",
None,
str,
),
) )
def main(test_data_dir, experiment_dir, corpus): def main(test_data_dir, experiment_dir, corpus):
Token.set_extension('split_start', getter=get_token_split_start) Token.set_extension("split_start", getter=get_token_split_start)
Token.set_extension('split_end', getter=get_token_split_end) Token.set_extension("split_end", getter=get_token_split_end)
Token.set_extension('begins_fused', default=False) Token.set_extension("begins_fused", default=False)
Token.set_extension('inside_fused', default=False) Token.set_extension("inside_fused", default=False)
lang.zh.Chinese.Defaults.use_jieba = False lang.zh.Chinese.Defaults.use_jieba = False
lang.ja.Japanese.Defaults.use_janome = False lang.ja.Japanese.Defaults.use_janome = False
lang.ru.Russian.Defaults.use_pymorphy2 = False lang.ru.Russian.Defaults.use_pymorphy2 = False
nlp = load_nlp(experiment_dir, corpus) nlp = load_nlp(experiment_dir, corpus)
treebank_code = nlp.meta['treebank'] treebank_code = nlp.meta["treebank"]
for section in ('test', 'dev'): for section in ("test", "dev"):
if section == 'dev': if section == "dev":
section_dir = 'conll17-ud-development-2017-03-19' section_dir = "conll17-ud-development-2017-03-19"
else: else:
section_dir = 'conll17-ud-test-2017-05-09' section_dir = "conll17-ud-test-2017-05-09"
text_path = test_data_dir / 'input' / section_dir / (treebank_code+'.txt') text_path = test_data_dir / "input" / section_dir / (treebank_code + ".txt")
udpipe_path = test_data_dir / 'input' / section_dir / (treebank_code+'-udpipe.conllu') udpipe_path = (
gold_path = test_data_dir / 'gold' / section_dir / (treebank_code+'.conllu') test_data_dir / "input" / section_dir / (treebank_code + "-udpipe.conllu")
)
gold_path = test_data_dir / "gold" / section_dir / (treebank_code + ".conllu")
header = [section, 'LAS', 'UAS', 'TAG', 'SENT', 'WORD'] header = [section, "LAS", "UAS", "TAG", "SENT", "WORD"]
print('\t'.join(header)) print("\t".join(header))
inputs = {'gold': gold_path, 'udp': udpipe_path, 'raw': text_path} inputs = {"gold": gold_path, "udp": udpipe_path, "raw": text_path}
for input_type in ('udp', 'raw'): for input_type in ("udp", "raw"):
input_path = inputs[input_type] input_path = inputs[input_type]
output_path = experiment_dir / corpus / '{section}.conllu'.format(section=section) output_path = (
experiment_dir / corpus / "{section}.conllu".format(section=section)
)
parsed_docs, test_scores = evaluate(nlp, input_path, gold_path, output_path) parsed_docs, test_scores = evaluate(nlp, input_path, gold_path, output_path)
accuracy = print_results(input_type, test_scores) accuracy = print_results(input_type, test_scores)
acc_path = experiment_dir / corpus / '{section}-accuracy.json'.format(section=section) acc_path = (
with open(acc_path, 'w') as file_: experiment_dir
/ corpus
/ "{section}-accuracy.json".format(section=section)
)
with open(acc_path, "w") as file_:
file_.write(json.dumps(accuracy, indent=2)) file_.write(json.dumps(accuracy, indent=2))
if __name__ == '__main__': if __name__ == "__main__":
plac.call(main) plac.call(main)
View File
@ -1,7 +1,9 @@
'''Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes # flake8: noqa
"""Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
.conllu format for development data, allowing the official scorer to be used. .conllu format for development data, allowing the official scorer to be used.
''' """
from __future__ import unicode_literals from __future__ import unicode_literals
import plac import plac
import tqdm import tqdm
from pathlib import Path from pathlib import Path
@ -11,12 +13,12 @@ import json
import spacy import spacy
import spacy.util import spacy.util
from ..tokens import Token, Doc from ...tokens import Token, Doc
from ..gold import GoldParse from ...gold import GoldParse
from ..util import compounding, minibatch, minibatch_by_words from ...util import compounding, minibatch, minibatch_by_words
from ..syntax.nonproj import projectivize from ...syntax.nonproj import projectivize
from ..matcher import Matcher from ...matcher import Matcher
from .. import displacy from ... import displacy
from collections import defaultdict, Counter from collections import defaultdict, Counter
from timeit import default_timer as timer from timeit import default_timer as timer
@ -27,10 +29,9 @@ import cytoolz
from . import conll17_ud_eval from . import conll17_ud_eval
from .. import lang from ... import lang
from .. import lang from ...lang import zh
from ..lang import zh from ...lang import ja
from ..lang import ja
try: try:
import torch import torch
@ -42,17 +43,26 @@ except ImportError:
# Data reading # # Data reading #
################ ################
space_re = re.compile('\s+') space_re = re.compile("\s+")
def split_text(text): def split_text(text):
return [space_re.sub(' ', par.strip()) for par in text.split('\n\n')] return [space_re.sub(" ", par.strip()) for par in text.split("\n\n")]
def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False, def read_data(
max_doc_length=None, limit=None): nlp,
'''Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True, conllu_file,
text_file,
raw_text=True,
oracle_segments=False,
max_doc_length=None,
limit=None,
):
"""Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
include Doc objects created using nlp.make_doc and then aligned against include Doc objects created using nlp.make_doc and then aligned against
the gold-standard sequences. If oracle_segments=True, include Doc objects the gold-standard sequences. If oracle_segments=True, include Doc objects
created from the gold-standard segments. At least one must be True.''' created from the gold-standard segments. At least one must be True."""
if not raw_text and not oracle_segments: if not raw_text and not oracle_segments:
raise ValueError("At least one of raw_text or oracle_segments must be True") raise ValueError("At least one of raw_text or oracle_segments must be True")
paragraphs = split_text(text_file.read()) paragraphs = split_text(text_file.read())
@ -66,22 +76,21 @@ def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False,
for cs in cd: for cs in cd:
sent = defaultdict(list) sent = defaultdict(list)
for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs: for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs:
if '.' in id_: if "." in id_:
continue continue
if '-' in id_: if "-" in id_:
continue continue
id_ = int(id_)-1 id_ = int(id_) - 1
head = int(head)-1 if head != '0' else id_ head = int(head) - 1 if head != "0" else id_
sent['words'].append(word) sent["words"].append(word)
sent['tags'].append(tag) sent["tags"].append(tag)
sent['heads'].append(head) sent["heads"].append(head)
sent['deps'].append('ROOT' if dep == 'root' else dep) sent["deps"].append("ROOT" if dep == "root" else dep)
sent['spaces'].append(space_after == '_') sent["spaces"].append(space_after == "_")
sent['entities'] = ['-'] * len(sent['words']) sent["entities"] = ["-"] * len(sent["words"])
sent['heads'], sent['deps'] = projectivize(sent['heads'], sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"])
sent['deps'])
if oracle_segments: if oracle_segments:
docs.append(Doc(nlp.vocab, words=sent['words'], spaces=sent['spaces'])) docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
golds.append(GoldParse(docs[-1], **sent)) golds.append(GoldParse(docs[-1], **sent))
sent_annots.append(sent) sent_annots.append(sent)
@ -107,18 +116,18 @@ def read_conllu(file_):
sent = [] sent = []
doc = [] doc = []
for line in file_: for line in file_:
if line.startswith('# newdoc'): if line.startswith("# newdoc"):
if doc: if doc:
docs.append(doc) docs.append(doc)
doc = [] doc = []
elif line.startswith('#'): elif line.startswith("#"):
continue continue
elif not line.strip(): elif not line.strip():
if sent: if sent:
doc.append(sent) doc.append(sent)
sent = [] sent = []
else: else:
sent.append(list(line.strip().split('\t'))) sent.append(list(line.strip().split("\t")))
if len(sent[-1]) != 10: if len(sent[-1]) != 10:
print(repr(line)) print(repr(line))
raise ValueError raise ValueError
@ -134,17 +143,19 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
flat = defaultdict(list) flat = defaultdict(list)
sent_starts = [] sent_starts = []
for sent in sent_annots: for sent in sent_annots:
flat['heads'].extend(len(flat['words'])+head for head in sent['heads']) flat["heads"].extend(len(flat["words"]) + head for head in sent["heads"])
for field in ['words', 'tags', 'deps', 'entities', 'spaces']: for field in ["words", "tags", "deps", "entities", "spaces"]:
flat[field].extend(sent[field]) flat[field].extend(sent[field])
sent_starts.append(True) sent_starts.append(True)
sent_starts.extend([False] * (len(sent['words'])-1)) sent_starts.extend([False] * (len(sent["words"]) - 1))
# Construct text if necessary # Construct text if necessary
assert len(flat['words']) == len(flat['spaces']) assert len(flat["words"]) == len(flat["spaces"])
if text is None: if text is None:
text = ''.join(word+' '*space for word, space in zip(flat['words'], flat['spaces'])) text = "".join(
word + " " * space for word, space in zip(flat["words"], flat["spaces"])
)
doc = nlp.make_doc(text) doc = nlp.make_doc(text)
flat.pop('spaces') flat.pop("spaces")
gold = GoldParse(doc, **flat) gold = GoldParse(doc, **flat)
gold.sent_starts = sent_starts gold.sent_starts = sent_starts
for i in range(len(gold.heads)): for i in range(len(gold.heads)):
@ -154,13 +165,15 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
return doc, gold return doc, gold
############################# #############################
# Data transforms for spaCy # # Data transforms for spaCy #
############################# #############################
def golds_to_gold_tuples(docs, golds): def golds_to_gold_tuples(docs, golds):
'''Get out the annoying 'tuples' format used by begin_training, given the """Get out the annoying 'tuples' format used by begin_training, given the
GoldParse objects.''' GoldParse objects."""
tuples = [] tuples = []
for doc, gold in zip(docs, golds): for doc, gold in zip(docs, golds):
text = doc.text text = doc.text
@ -174,8 +187,9 @@ def golds_to_gold_tuples(docs, golds):
# Evaluation # # Evaluation #
############## ##############
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None): def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
if text_loc.parts[-1].endswith('.conllu'): if text_loc.parts[-1].endswith(".conllu"):
docs = [] docs = []
with text_loc.open() as file_: with text_loc.open() as file_:
for conllu_doc in read_conllu(file_): for conllu_doc in read_conllu(file_):
@ -185,14 +199,14 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
for name, component in nlp.pipeline: for name, component in nlp.pipeline:
docs = list(component.pipe(docs)) docs = list(component.pipe(docs))
else: else:
with text_loc.open('r', encoding='utf8') as text_file: with text_loc.open("r", encoding="utf8") as text_file:
texts = split_text(text_file.read()) texts = split_text(text_file.read())
docs = list(nlp.pipe(texts)) docs = list(nlp.pipe(texts))
with sys_loc.open('w', encoding='utf8') as out_file: with sys_loc.open("w", encoding="utf8") as out_file:
write_conllu(docs, out_file) write_conllu(docs, out_file)
with gold_loc.open('r', encoding='utf8') as gold_file: with gold_loc.open("r", encoding="utf8") as gold_file:
gold_ud = conll17_ud_eval.load_conllu(gold_file) gold_ud = conll17_ud_eval.load_conllu(gold_file)
with sys_loc.open('r', encoding='utf8') as sys_file: with sys_loc.open("r", encoding="utf8") as sys_file:
sys_ud = conll17_ud_eval.load_conllu(sys_file) sys_ud = conll17_ud_eval.load_conllu(sys_file)
scores = conll17_ud_eval.evaluate(gold_ud, sys_ud) scores = conll17_ud_eval.evaluate(gold_ud, sys_ud)
return docs, scores return docs, scores
@ -200,10 +214,10 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
def write_conllu(docs, file_): def write_conllu(docs, file_):
merger = Matcher(docs[0].vocab) merger = Matcher(docs[0].vocab)
merger.add('SUBTOK', None, [{'DEP': 'subtok', 'op': '+'}]) merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
for i, doc in enumerate(docs): for i, doc in enumerate(docs):
matches = merger(doc) matches = merger(doc)
spans = [doc[start:end+1] for _, start, end in matches] spans = [doc[start : end + 1] for _, start, end in matches]
offsets = [(span.start_char, span.end_char) for span in spans] offsets = [(span.start_char, span.end_char) for span in spans]
for start_char, end_char in offsets: for start_char, end_char in offsets:
doc.merge(start_char, end_char) doc.merge(start_char, end_char)
@ -213,65 +227,82 @@ def write_conllu(docs, file_):
file_.write("# text = {text}\n".format(text=sent.text)) file_.write("# text = {text}\n".format(text=sent.text))
for k, token in enumerate(sent): for k, token in enumerate(sent):
if token.head.i > sent[-1].i or token.head.i < sent[0].i: if token.head.i > sent[-1].i or token.head.i < sent[0].i:
for word in doc[sent[0].i-10 : sent[0].i]: for word in doc[sent[0].i - 10 : sent[0].i]:
print(word.i, word.head.i, word.text, word.dep_) print(word.i, word.head.i, word.text, word.dep_)
for word in sent: for word in sent:
print(word.i, word.head.i, word.text, word.dep_) print(word.i, word.head.i, word.text, word.dep_)
for word in doc[sent[-1].i : sent[-1].i+10]: for word in doc[sent[-1].i : sent[-1].i + 10]:
print(word.i, word.head.i, word.text, word.dep_) print(word.i, word.head.i, word.text, word.dep_)
raise ValueError("Invalid parse: head outside sentence (%s)" % token.text) raise ValueError(
file_.write(token._.get_conllu_lines(k) + '\n') "Invalid parse: head outside sentence (%s)" % token.text
file_.write('\n') )
file_.write(token._.get_conllu_lines(k) + "\n")
file_.write("\n")
def print_progress(itn, losses, ud_scores): def print_progress(itn, losses, ud_scores):
fields = { fields = {
'dep_loss': losses.get('parser', 0.0), "dep_loss": losses.get("parser", 0.0),
'tag_loss': losses.get('tagger', 0.0), "tag_loss": losses.get("tagger", 0.0),
'words': ud_scores['Words'].f1 * 100, "words": ud_scores["Words"].f1 * 100,
'sents': ud_scores['Sentences'].f1 * 100, "sents": ud_scores["Sentences"].f1 * 100,
'tags': ud_scores['XPOS'].f1 * 100, "tags": ud_scores["XPOS"].f1 * 100,
'uas': ud_scores['UAS'].f1 * 100, "uas": ud_scores["UAS"].f1 * 100,
'las': ud_scores['LAS'].f1 * 100, "las": ud_scores["LAS"].f1 * 100,
} }
header = ['Epoch', 'Loss', 'LAS', 'UAS', 'TAG', 'SENT', 'WORD'] header = ["Epoch", "Loss", "LAS", "UAS", "TAG", "SENT", "WORD"]
if itn == 0: if itn == 0:
print('\t'.join(header)) print("\t".join(header))
tpl = '\t'.join(( tpl = "\t".join(
'{:d}', (
'{dep_loss:.1f}', "{:d}",
'{las:.1f}', "{dep_loss:.1f}",
'{uas:.1f}', "{las:.1f}",
'{tags:.1f}', "{uas:.1f}",
'{sents:.1f}', "{tags:.1f}",
'{words:.1f}', "{sents:.1f}",
)) "{words:.1f}",
)
)
print(tpl.format(itn, **fields)) print(tpl.format(itn, **fields))
#def get_sent_conllu(sent, sent_id):
# def get_sent_conllu(sent, sent_id):
# lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)] # lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)]
def get_token_conllu(token, i): def get_token_conllu(token, i):
if token._.begins_fused: if token._.begins_fused:
n = 1 n = 1
while token.nbor(n)._.inside_fused: while token.nbor(n)._.inside_fused:
n += 1 n += 1
id_ = '%d-%d' % (i, i+n) id_ = "%d-%d" % (i, i + n)
lines = [id_, token.text, '_', '_', '_', '_', '_', '_', '_', '_'] lines = [id_, token.text, "_", "_", "_", "_", "_", "_", "_", "_"]
else: else:
lines = [] lines = []
if token.head.i == token.i: if token.head.i == token.i:
head = 0 head = 0
else: else:
head = i + (token.head.i - token.i) + 1 head = i + (token.head.i - token.i) + 1
fields = [str(i+1), token.text, token.lemma_, token.pos_, token.tag_, '_', fields = [
str(head), token.dep_.lower(), '_', '_'] str(i + 1),
lines.append('\t'.join(fields)) token.text,
return '\n'.join(lines) token.lemma_,
token.pos_,
token.tag_,
"_",
str(head),
token.dep_.lower(),
"_",
"_",
]
lines.append("\t".join(fields))
return "\n".join(lines)
Token.set_extension('get_conllu_lines', method=get_token_conllu)
Token.set_extension('begins_fused', default=False) Token.set_extension("get_conllu_lines", method=get_token_conllu)
Token.set_extension('inside_fused', default=False) Token.set_extension("begins_fused", default=False)
Token.set_extension("inside_fused", default=False)
################## ##################
@ -280,35 +311,40 @@ Token.set_extension('inside_fused', default=False)
def load_nlp(corpus, config, vectors=None): def load_nlp(corpus, config, vectors=None):
lang = corpus.split('_')[0] lang = corpus.split("_")[0]
nlp = spacy.blank(lang) nlp = spacy.blank(lang)
if config.vectors: if config.vectors:
if not vectors: if not vectors:
raise ValueError("config asks for vectors, but no vectors " raise ValueError(
"directory set on command line (use -v)") "config asks for vectors, but no vectors "
"directory set on command line (use -v)"
)
if (Path(vectors) / corpus).exists(): if (Path(vectors) / corpus).exists():
nlp.vocab.from_disk(Path(vectors) / corpus / 'vocab') nlp.vocab.from_disk(Path(vectors) / corpus / "vocab")
nlp.meta['treebank'] = corpus nlp.meta["treebank"] = corpus
return nlp return nlp
def initialize_pipeline(nlp, docs, golds, config, device): def initialize_pipeline(nlp, docs, golds, config, device):
nlp.add_pipe(nlp.create_pipe('tagger')) nlp.add_pipe(nlp.create_pipe("tagger"))
nlp.add_pipe(nlp.create_pipe('parser')) nlp.add_pipe(nlp.create_pipe("parser"))
if config.multitask_tag: if config.multitask_tag:
nlp.parser.add_multitask_objective('tag') nlp.parser.add_multitask_objective("tag")
if config.multitask_sent: if config.multitask_sent:
nlp.parser.add_multitask_objective('sent_start') nlp.parser.add_multitask_objective("sent_start")
for gold in golds: for gold in golds:
for tag in gold.tags: for tag in gold.tags:
if tag is not None: if tag is not None:
nlp.tagger.add_label(tag) nlp.tagger.add_label(tag)
if torch is not None and device != -1: if torch is not None and device != -1:
torch.set_default_tensor_type('torch.cuda.FloatTensor') torch.set_default_tensor_type("torch.cuda.FloatTensor")
optimizer = nlp.begin_training( optimizer = nlp.begin_training(
lambda: golds_to_gold_tuples(docs, golds), device=device, lambda: golds_to_gold_tuples(docs, golds),
subword_features=config.subword_features, conv_depth=config.conv_depth, device=device,
bilstm_depth=config.bilstm_depth) subword_features=config.subword_features,
conv_depth=config.conv_depth,
bilstm_depth=config.bilstm_depth,
)
if config.pretrained_tok2vec: if config.pretrained_tok2vec:
_load_pretrained_tok2vec(nlp, config.pretrained_tok2vec) _load_pretrained_tok2vec(nlp, config.pretrained_tok2vec)
return optimizer return optimizer
@ -318,27 +354,41 @@ def _load_pretrained_tok2vec(nlp, loc):
"""Load pre-trained weights for the 'token-to-vector' part of the component """Load pre-trained weights for the 'token-to-vector' part of the component
models, which is typically a CNN. See 'spacy pretrain'. Experimental. models, which is typically a CNN. See 'spacy pretrain'. Experimental.
""" """
with Path(loc).open('rb') as file_: with Path(loc).open("rb") as file_:
weights_data = file_.read() weights_data = file_.read()
loaded = [] loaded = []
for name, component in nlp.pipeline: for name, component in nlp.pipeline:
if hasattr(component, 'model') and hasattr(component.model, 'tok2vec'): if hasattr(component, "model") and hasattr(component.model, "tok2vec"):
component.tok2vec.from_bytes(weights_data) component.tok2vec.from_bytes(weights_data)
loaded.append(name) loaded.append(name)
return loaded return loaded
######################## ########################
# Command line helpers # # Command line helpers #
######################## ########################
class Config(object): class Config(object):
def __init__(self, vectors=None, max_doc_length=10, multitask_tag=False, def __init__(
multitask_sent=False, multitask_dep=False, multitask_vectors=None, self,
bilstm_depth=0, nr_epoch=30, min_batch_size=750, max_batch_size=750, vectors=None,
batch_by_words=True, dropout=0.1, conv_depth=4, subword_features=True, max_doc_length=10,
vectors_dir=None, pretrained_tok2vec=None): multitask_tag=False,
multitask_sent=False,
multitask_dep=False,
multitask_vectors=None,
bilstm_depth=0,
nr_epoch=30,
min_batch_size=100,
max_batch_size=1000,
batch_by_words=True,
dropout=0.2,
conv_depth=4,
subword_features=True,
vectors_dir=None,
pretrained_tok2vec=None,
):
if vectors_dir is not None: if vectors_dir is not None:
if vectors is None: if vectors is None:
vectors = True vectors = True
@ -349,10 +399,10 @@ class Config(object):
@classmethod @classmethod
def load(cls, loc, vectors_dir=None): def load(cls, loc, vectors_dir=None):
with Path(loc).open('r', encoding='utf8') as file_: with Path(loc).open("r", encoding="utf8") as file_:
cfg = json.load(file_) cfg = json.load(file_)
if vectors_dir is not None: if vectors_dir is not None:
cfg['vectors_dir'] = vectors_dir cfg["vectors_dir"] = vectors_dir
return cls(**cfg) return cls(**cfg)
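# A hypothetical config file for Config.load() above; the keys mirror the
# __init__ keyword arguments and the values are illustrative defaults only.
import json

example_cfg = {
    "max_doc_length": 10,
    "multitask_tag": False,
    "multitask_sent": False,
    "nr_epoch": 30,
    "min_batch_size": 100,
    "max_batch_size": 1000,
    "dropout": 0.2,
    "conv_depth": 4,
    "subword_features": True,
    "pretrained_tok2vec": None,
}
with open("/tmp/ud-config.json", "w") as file_:
    json.dump(example_cfg, file_, indent=2)
# config = Config.load("/tmp/ud-config.json")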
@ -364,39 +414,55 @@ class Dataset(object):
self.text = None self.text = None
for file_path in self.path.iterdir(): for file_path in self.path.iterdir():
name = file_path.parts[-1] name = file_path.parts[-1]
if section in name and name.endswith('conllu'): if section in name and name.endswith("conllu"):
self.conllu = file_path self.conllu = file_path
elif section in name and name.endswith('txt'): elif section in name and name.endswith("txt"):
self.text = file_path self.text = file_path
if self.conllu is None: if self.conllu is None:
msg = "Could not find .txt file in {path} for {section}" msg = "Could not find .txt file in {path} for {section}"
raise IOError(msg.format(section=section, path=path)) raise IOError(msg.format(section=section, path=path))
if self.text is None: if self.text is None:
msg = "Could not find .txt file in {path} for {section}" msg = "Could not find .txt file in {path} for {section}"
self.lang = self.conllu.parts[-1].split('-')[0].split('_')[0] self.lang = self.conllu.parts[-1].split("-")[0].split("_")[0]
class TreebankPaths(object): class TreebankPaths(object):
def __init__(self, ud_path, treebank, **cfg): def __init__(self, ud_path, treebank, **cfg):
self.train = Dataset(ud_path / treebank, 'train') self.train = Dataset(ud_path / treebank, "train")
self.dev = Dataset(ud_path / treebank, 'dev') self.dev = Dataset(ud_path / treebank, "dev")
self.lang = self.train.lang self.lang = self.train.lang
@plac.annotations( @plac.annotations(
ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path), ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
corpus=("UD corpus to train and evaluate on, e.g. en, es_ancora, etc", corpus=(
"positional", None, str), "UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
"positional",
None,
str,
),
parses_dir=("Directory to write the development parses", "positional", None, Path), parses_dir=("Directory to write the development parses", "positional", None, Path),
config=("Path to json formatted config file", "option", "C", Path), config=("Path to json formatted config file", "option", "C", Path),
limit=("Size limit", "option", "n", int), limit=("Size limit", "option", "n", int),
gpu_device=("Use GPU", "option", "g", int), gpu_device=("Use GPU", "option", "g", int),
use_oracle_segments=("Use oracle segments", "flag", "G", int), use_oracle_segments=("Use oracle segments", "flag", "G", int),
vectors_dir=("Path to directory with pre-trained vectors, named e.g. en/", vectors_dir=(
"option", "v", Path), "Path to directory with pre-trained vectors, named e.g. en/",
"option",
"v",
Path,
),
) )
def main(ud_dir, parses_dir, corpus, config=None, limit=0, gpu_device=-1, vectors_dir=None, def main(
use_oracle_segments=False): ud_dir,
parses_dir,
corpus,
config=None,
limit=0,
gpu_device=-1,
vectors_dir=None,
use_oracle_segments=False,
):
spacy.util.fix_random_seed() spacy.util.fix_random_seed()
lang.zh.Chinese.Defaults.use_jieba = False lang.zh.Chinese.Defaults.use_jieba = False
lang.ja.Japanese.Defaults.use_janome = False lang.ja.Japanese.Defaults.use_janome = False
@ -411,19 +477,28 @@ def main(ud_dir, parses_dir, corpus, config=None, limit=0, gpu_device=-1, vector
print("Train and evaluate", corpus, "using lang", paths.lang) print("Train and evaluate", corpus, "using lang", paths.lang)
nlp = load_nlp(paths.lang, config, vectors=vectors_dir) nlp = load_nlp(paths.lang, config, vectors=vectors_dir)
docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(), docs, golds = read_data(
nlp,
paths.train.conllu.open(),
paths.train.text.open(),
max_doc_length=config.max_doc_length, max_doc_length=config.max_doc_length,
limit=limit) limit=limit,
)
optimizer = initialize_pipeline(nlp, docs, golds, config, gpu_device) optimizer = initialize_pipeline(nlp, docs, golds, config, gpu_device)
batch_sizes = compounding(config.min_batch_size, config.max_batch_size, 1.001) batch_sizes = compounding(config.min_batch_size, config.max_batch_size, 1.001)
beam_prob = compounding(0.2, 0.8, 1.001) beam_prob = compounding(0.2, 0.8, 1.001)
for i in range(config.nr_epoch): for i in range(config.nr_epoch):
docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(), docs, golds = read_data(
max_doc_length=config.max_doc_length, limit=limit, nlp,
paths.train.conllu.open(),
paths.train.text.open(),
max_doc_length=config.max_doc_length,
limit=limit,
oracle_segments=use_oracle_segments, oracle_segments=use_oracle_segments,
raw_text=not use_oracle_segments) raw_text=not use_oracle_segments,
)
Xs = list(zip(docs, golds)) Xs = list(zip(docs, golds))
random.shuffle(Xs) random.shuffle(Xs)
if config.batch_by_words: if config.batch_by_words:
@ -436,27 +511,34 @@ def main(ud_dir, parses_dir, corpus, config=None, limit=0, gpu_device=-1, vector
for batch in batches: for batch in batches:
batch_docs, batch_gold = zip(*batch) batch_docs, batch_gold = zip(*batch)
pbar.update(sum(len(doc) for doc in batch_docs)) pbar.update(sum(len(doc) for doc in batch_docs))
nlp.parser.cfg['beam_update_prob'] = next(beam_prob) nlp.parser.cfg["beam_update_prob"] = next(beam_prob)
nlp.update(batch_docs, batch_gold, sgd=optimizer, nlp.update(
drop=config.dropout, losses=losses) batch_docs,
batch_gold,
sgd=optimizer,
drop=config.dropout,
losses=losses,
)
out_path = parses_dir / corpus / 'epoch-{i}.conllu'.format(i=i) out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i)
with nlp.use_params(optimizer.averages): with nlp.use_params(optimizer.averages):
if use_oracle_segments: if use_oracle_segments:
parsed_docs, scores = evaluate(nlp, paths.dev.conllu, parsed_docs, scores = evaluate(
paths.dev.conllu, out_path) nlp, paths.dev.conllu, paths.dev.conllu, out_path
)
else: else:
parsed_docs, scores = evaluate(nlp, paths.dev.text, parsed_docs, scores = evaluate(
paths.dev.conllu, out_path) nlp, paths.dev.text, paths.dev.conllu, out_path
)
print_progress(i, losses, scores) print_progress(i, losses, scores)
def _render_parses(i, to_render): def _render_parses(i, to_render):
to_render[0].user_data['title'] = "Batch %d" % i to_render[0].user_data["title"] = "Batch %d" % i
with Path('/tmp/parses.html').open('w') as file_: with Path("/tmp/parses.html").open("w") as file_:
html = displacy.render(to_render[:5], style='dep', page=True) html = displacy.render(to_render[:5], style="dep", page=True)
file_.write(html) file_.write(html)
if __name__ == '__main__': if __name__ == "__main__":
plac.call(main) plac.call(main)
View File
@ -4,28 +4,34 @@ from __future__ import unicode_literals, print_function
import pkg_resources import pkg_resources
from pathlib import Path from pathlib import Path
import sys import sys
import ujson
import requests import requests
from wasabi import Printer
from ._messages import Messages from ._messages import Messages
from ..compat import path2str, locale_escape from ..compat import path2str
from ..util import prints, get_data_path, read_json from ..util import get_data_path, read_json
from .. import about from .. import about
def validate(): def validate():
"""Validate that the currently installed version of spaCy is compatible """
Validate that the currently installed version of spaCy is compatible
with the installed models. Should be run after `pip install -U spacy`. with the installed models. Should be run after `pip install -U spacy`.
""" """
msg = Printer()
with msg.loading("Loading compatibility table..."):
r = requests.get(about.__compatibility__) r = requests.get(about.__compatibility__)
if r.status_code != 200: if r.status_code != 200:
prints(Messages.M021, title=Messages.M003.format(code=r.status_code), msg.fail(Messages.M003.format(code=r.status_code), Messages.M021, exits=1)
exits=1) msg.good("Loaded compatibility table")
compat = r.json()['spacy'] compat = r.json()["spacy"]
current_compat = compat.get(about.__version__) current_compat = compat.get(about.__version__)
if not current_compat: if not current_compat:
prints(about.__compatibility__, exits=1, msg.fail(
title=Messages.M022.format(version=about.__version__)) Messages.M022.format(version=about.__version__),
about.__compatibility__,
exits=1,
)
all_models = set() all_models = set()
for spacy_v, models in dict(compat).items(): for spacy_v, models in dict(compat).items():
all_models.update(models.keys()) all_models.update(models.keys())
@ -33,33 +39,38 @@ def validate():
compat[spacy_v][model] = [reformat_version(v) for v in model_vs] compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
model_links = get_model_links(current_compat) model_links = get_model_links(current_compat)
model_pkgs = get_model_pkgs(current_compat, all_models) model_pkgs = get_model_pkgs(current_compat, all_models)
incompat_links = {l for l, d in model_links.items() if not d['compat']} incompat_links = {l for l, d in model_links.items() if not d["compat"]}
incompat_models = {d['name'] for _, d in model_pkgs.items() incompat_models = {d["name"] for _, d in model_pkgs.items() if not d["compat"]}
if not d['compat']} incompat_models.update(
incompat_models.update([d['name'] for _, d in model_links.items() [d["name"] for _, d in model_links.items() if not d["compat"]]
if not d['compat']]) )
na_models = [m for m in incompat_models if m not in current_compat] na_models = [m for m in incompat_models if m not in current_compat]
update_models = [m for m in incompat_models if m in current_compat] update_models = [m for m in incompat_models if m in current_compat]
spacy_dir = Path(__file__).parent.parent
msg.divider(Messages.M023.format(version=about.__version__))
msg.info("spaCy installation: {}".format(path2str(spacy_dir)))
prints(path2str(Path(__file__).parent.parent),
title=Messages.M023.format(version=about.__version__))
if model_links or model_pkgs: if model_links or model_pkgs:
print(get_row('TYPE', 'NAME', 'MODEL', 'VERSION', '')) header = ("TYPE", "NAME", "MODEL", "VERSION", "")
rows = []
for name, data in model_pkgs.items(): for name, data in model_pkgs.items():
print(get_model_row(current_compat, name, data, 'package')) rows.append(get_model_row(current_compat, name, data, msg))
for name, data in model_links.items(): for name, data in model_links.items():
print(get_model_row(current_compat, name, data, 'link')) rows.append(get_model_row(current_compat, name, data, msg, "link"))
msg.table(rows, header=header)
else: else:
prints(Messages.M024, exits=0) msg.text(Messages.M024, exits=0)
if update_models: if update_models:
cmd = ' python -m spacy download {}' msg.divider("Install updates")
print("\n " + Messages.M025) cmd = "python -m spacy download {}"
print('\n'.join([cmd.format(pkg) for pkg in update_models])) print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
if na_models: if na_models:
prints(Messages.M025.format(version=about.__version__, msg.text(
models=', '.join(na_models))) Messages.M025.format(version=about.__version__, models=", ".join(na_models))
)
if incompat_links: if incompat_links:
prints(Messages.M027.format(path=path2str(get_data_path()))) msg.text(Messages.M027.format(path=path2str(get_data_path())))
if incompat_models or incompat_links: if incompat_models or incompat_links:
sys.exit(1) sys.exit(1)
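
As context for the refactor above: `validate` now builds its output with [`wasabi`](https://github.com/ines/wasabi) instead of the old `util.prints` helper. A minimal sketch of the formatting calls used here (the version string, path and table row below are made up):

```python
from wasabi import Printer

msg = Printer()

with msg.loading("Loading compatibility table..."):
    pass  # in validate() this is where requests.get(...) runs
msg.good("Loaded compatibility table")

msg.divider("Installed models (spaCy vX.Y.Z)")   # placeholder version string
msg.info("spaCy installation: /path/to/spacy")   # placeholder path

header = ("TYPE", "NAME", "MODEL", "VERSION", "")
rows = [("package", "en_core_web_sm", "en_core_web_sm", "x.y.z", "")]  # made-up row
msg.table(rows, header=header)
```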
@ -70,50 +81,48 @@ def get_model_links(compat):
if data_path: if data_path:
models = [p for p in data_path.iterdir() if is_model_path(p)] models = [p for p in data_path.iterdir() if is_model_path(p)]
for model in models: for model in models:
meta_path = Path(model) / 'meta.json' meta_path = Path(model) / "meta.json"
if not meta_path.exists(): if not meta_path.exists():
continue continue
meta = read_json(meta_path) meta = read_json(meta_path)
link = model.parts[-1] link = model.parts[-1]
name = meta['lang'] + '_' + meta['name'] name = meta["lang"] + "_" + meta["name"]
links[link] = {'name': name, 'version': meta['version'], links[link] = {
'compat': is_compat(compat, name, meta['version'])} "name": name,
"version": meta["version"],
"compat": is_compat(compat, name, meta["version"]),
}
return links return links
def get_model_pkgs(compat, all_models): def get_model_pkgs(compat, all_models):
pkgs = {} pkgs = {}
for pkg_name, pkg_data in pkg_resources.working_set.by_key.items(): for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
package = pkg_name.replace('-', '_') package = pkg_name.replace("-", "_")
if package in all_models: if package in all_models:
version = pkg_data.version version = pkg_data.version
pkgs[pkg_name] = {'name': package, 'version': version, pkgs[pkg_name] = {
'compat': is_compat(compat, package, version)} "name": package,
"version": version,
"compat": is_compat(compat, package, version),
}
return pkgs return pkgs
def get_model_row(compat, name, data, type='package'): def get_model_row(compat, name, data, msg, model_type="package"):
tpl_red = '\x1b[38;5;1m{}\x1b[0m' if data["compat"]:
tpl_green = '\x1b[38;5;2m{}\x1b[0m' comp = msg.text("", color="green", icon="good", no_print=True)
if data['compat']: version = msg.text(data["version"], color="green", no_print=True)
comp = tpl_green.format(locale_escape('', errors='ignore'))
version = tpl_green.format(data['version'])
else: else:
comp = '--> {}'.format(compat.get(data['name'], ['n/a'])[0]) version = msg.text(data["version"], color="red", no_print=True)
version = tpl_red.format(data['version']) comp = "--> {}".format(compat.get(data["name"], ["n/a"])[0])
return get_row(type, name, data['name'], version, comp) return (model_type, name, data["name"], version, comp)
def get_row(*args):
tpl_row = ' {:<10}' + (' {:<20}' * 4)
return tpl_row.format(*args)
def is_model_path(model_path): def is_model_path(model_path):
exclude = ['cache', 'pycache', '__pycache__'] exclude = ["cache", "pycache", "__pycache__"]
name = model_path.parts[-1] name = model_path.parts[-1]
return (model_path.is_dir() and name not in exclude return model_path.is_dir() and name not in exclude and not name.startswith(".")
and not name.startswith('.'))
def is_compat(compat, name, version): def is_compat(compat, name, version):
@ -122,6 +131,6 @@ def is_compat(compat, name, version):
def reformat_version(version): def reformat_version(version):
"""Hack to reformat old versions ending on '-alpha' to match pip format.""" """Hack to reformat old versions ending on '-alpha' to match pip format."""
if version.endswith('-alpha'): if version.endswith("-alpha"):
return version.replace('-alpha', 'a0') return version.replace("-alpha", "a0")
return version.replace('-alpha', 'a') return version.replace("-alpha", "a")

View File

@ -1,59 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
import plac
import json
import spacy
import numpy
from pathlib import Path
from ..vectors import Vectors
from ..util import prints, ensure_path
@plac.annotations(
lang=("model language", "positional", None, str),
output_dir=("model output directory", "positional", None, Path),
lexemes_loc=("location of JSONL-formatted lexical data", "positional",
None, Path),
vectors_loc=("optional: location of vectors data, as numpy .npz",
"positional", None, str),
prune_vectors=("optional: number of vectors to prune to.",
"option", "V", int)
)
def make_vocab(lang, output_dir, lexemes_loc, vectors_loc=None, prune_vectors=-1):
"""Compile a vocabulary from a lexicon jsonl file and word vectors."""
if not lexemes_loc.exists():
prints(lexemes_loc, title="Can't find lexical data", exits=1)
vectors_loc = ensure_path(vectors_loc)
nlp = spacy.blank(lang)
for word in nlp.vocab:
word.rank = 0
lex_added = 0
with lexemes_loc.open() as file_:
for line in file_:
if line.strip():
attrs = json.loads(line)
if 'settings' in attrs:
nlp.vocab.cfg.update(attrs['settings'])
else:
lex = nlp.vocab[attrs['orth']]
lex.set_attrs(**attrs)
assert lex.rank == attrs['id']
lex_added += 1
if vectors_loc is not None:
vector_data = numpy.load(vectors_loc.open('rb'))
nlp.vocab.vectors = Vectors(data=vector_data)
for word in nlp.vocab:
if word.rank:
nlp.vocab.vectors.add(word.orth, row=word.rank)
if prune_vectors >= 1:
remap = nlp.vocab.prune_vectors(prune_vectors)
if not output_dir.exists():
output_dir.mkdir()
nlp.to_disk(output_dir)
vec_added = len(nlp.vocab.vectors)
prints("{} entries, {} vectors".format(lex_added, vec_added), output_dir,
title="Sucessfully compiled vocab and vectors, and saved model")
return nlp

View File

@ -5,7 +5,6 @@ import os
import sys import sys
import ujson import ujson
import itertools import itertools
import locale
from thinc.neural.util import copy_array from thinc.neural.util import copy_array
@ -136,12 +135,3 @@ def import_file(name, loc):
module = importlib.util.module_from_spec(spec) module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module) spec.loader.exec_module(module)
return module return module
def locale_escape(string, errors="replace"):
"""
Mangle non-supported characters, for savages with ascii terminals.
"""
encoding = locale.getpreferredencoding()
string = string.encode(encoding, errors).decode("utf8")
return string

View File

@ -5,7 +5,7 @@ from .render import DependencyRenderer, EntityRenderer
from ..tokens import Doc, Span from ..tokens import Doc, Span
from ..compat import b_to_str from ..compat import b_to_str
from ..errors import Errors, Warnings, user_warning from ..errors import Errors, Warnings, user_warning
from ..util import prints, is_in_jupyter from ..util import is_in_jupyter
_html = {} _html = {}
@ -72,14 +72,12 @@ def serve(
render(docs, style=style, page=page, minify=minify, options=options, manual=manual) render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
httpd = simple_server.make_server("0.0.0.0", port, app) httpd = simple_server.make_server("0.0.0.0", port, app)
prints( print("\nUsing the '{}' visualizer".format(style))
"Using the '{}' visualizer".format(style), print("Serving on port {}...\n".format(port))
title="Serving on port {}...".format(port),
)
try: try:
httpd.serve_forever() httpd.serve_forever()
except KeyboardInterrupt: except KeyboardInterrupt:
prints("Shutting down server on port {}.".format(port)) print("Shutting down server on port {}.".format(port))
finally: finally:
httpd.server_close() httpd.server_close()
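
For reference, the function touched above is the public `displacy.serve` entry point; a minimal usage sketch (the model name is an assumption, any installed pipeline works):

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")   # assumes this model is installed
doc = nlp("Bob brought Alice the pizza.")

# Starts a simple web server and prints the plain-text status messages
# shown above; stop it with Ctrl+C.
displacy.serve(doc, style="dep", port=5000)
```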

View File

@ -278,6 +278,12 @@ class Errors(object):
E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A token" E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A token"
" can only be part of one entity, so make sure the entities you're " " can only be part of one entity, so make sure the entities you're "
"setting don't overlap.") "setting don't overlap.")
E104 = ("Can't find JSON schema for '{name}'.")
E105 = ("The Doc.print_tree() method is now deprecated. Please use "
"Doc.json() instead.")
E106 = ("Can't find doc._.{attr} attribute specified in the underscore "
"settings: {opts}")
E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}")
@add_codes @add_codes

View File

@ -271,15 +271,26 @@ def _corrupt(c, noise_level):
return c.lower() return c.lower()
def read_json_file(loc, docs_filter=None, limit=None): def read_json_object(json_corpus_section):
loc = util.ensure_path(loc) """Take a list of JSON-formatted documents (e.g. from an already loaded
if loc.is_dir(): training data file) and yield tuples in the GoldParse format.
for filename in loc.iterdir():
yield from read_json_file(loc / filename, limit=limit) json_corpus_section (list): The data.
else: YIELDS (tuple): The reformatted data.
for doc in _json_iterate(loc): """
if docs_filter is not None and not docs_filter(doc): for json_doc in json_corpus_section:
continue tuple_doc = json_to_tuple(json_doc)
for tuple_paragraph in tuple_doc:
yield tuple_paragraph
def json_to_tuple(doc):
"""Convert an item in the JSON-formatted training data to the tuple format
used by GoldParse.
doc (dict): One entry in the training data.
YIELDS (tuple): The reformatted data.
"""
paragraphs = [] paragraphs = []
for paragraph in doc['paragraphs']: for paragraph in doc['paragraphs']:
sents = [] sents = []
@ -307,6 +318,19 @@ def read_json_file(loc, docs_filter=None, limit=None):
yield [paragraph.get('raw', None), sents] yield [paragraph.get('raw', None), sents]
def read_json_file(loc, docs_filter=None, limit=None):
loc = util.ensure_path(loc)
if loc.is_dir():
for filename in loc.iterdir():
yield from read_json_file(loc / filename, limit=limit)
else:
for doc in _json_iterate(loc):
if docs_filter is not None and not docs_filter(doc):
continue
for json_tuple in json_to_tuple(doc):
yield json_tuple
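
A hedged sketch of how the new `read_json_object` helper can be used on already-loaded training data; the sample record below is made up, but follows the JSON training format handled by `json_to_tuple`:

```python
from spacy.gold import read_json_object

# One document in spaCy's JSON training format (made-up example data)
json_corpus = [{
    "id": 0,
    "paragraphs": [{
        "raw": "I like pizza",
        "sentences": [{
            "brackets": [],
            "tokens": [
                {"id": 0, "orth": "I", "tag": "PRP", "head": 1, "dep": "nsubj", "ner": "O"},
                {"id": 1, "orth": "like", "tag": "VBP", "head": 0, "dep": "ROOT", "ner": "O"},
                {"id": 2, "orth": "pizza", "tag": "NN", "head": -1, "dep": "dobj", "ner": "O"},
            ],
        }],
    }],
}]

# Yields (raw_text, sentences) tuples in the format GoldParse expects
for raw_text, sents in read_json_object(json_corpus):
    print(raw_text)
```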
def _json_iterate(loc): def _json_iterate(loc):
# We should've made these files jsonl...But since we didn't, parse out # We should've made these files jsonl...But since we didn't, parse out
# the docs one-by-one to reduce memory usage. # the docs one-by-one to reduce memory usage.
@ -573,32 +597,19 @@ cdef class GoldParse:
self.c.sent_start[i] = 0 self.c.sent_start[i] = 0
def docs_to_json(id, docs): def docs_to_json(docs, underscore=None):
'''Convert a list of Doc objects into the JSON-serializable format used by """Convert a list of Doc objects into the JSON-serializable format used by
the spacy train command. Each Doc in the list will be interpreted as a the spacy train command.
paragraph.
''' docs (iterable / Doc): The Doc object(s) to convert.
underscore (list): Optional list of string names of custom doc._.
attributes. Attribute values need to be JSON-serializable. Values will
be added to an "_" key in the data, e.g. "_": {"foo": "bar"}.
RETURNS (list): The data in spaCy's JSON format.
"""
if isinstance(docs, Doc): if isinstance(docs, Doc):
docs = [docs] docs = [docs]
json_doc = {'id': id, 'paragraphs': []} return [doc.to_json(underscore=underscore) for doc in docs]
for i, doc in enumerate(docs):
json_para = {'raw': doc.text, 'sentences': []}
ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
biluo_tags = biluo_tags_from_offsets(doc, ent_offsets)
for j, sent in enumerate(doc.sents):
json_sent = {'tokens': [], 'brackets': []}
for token in sent:
json_token = {"id": token.i, "orth": token.text}
if doc.is_tagged:
json_token['tag'] = token.tag_
if doc.is_parsed:
json_token['head'] = token.head.i-token.i
json_token['dep'] = token.dep_
json_token['ner'] = biluo_tags[token.i]
json_sent['tokens'].append(json_token)
json_para['sentences'].append(json_sent)
json_doc['paragraphs'].append(json_para)
return json_doc
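
With this change, `docs_to_json` becomes a thin wrapper around the new `Doc.to_json`. A small usage sketch (the model name and texts are placeholders):

```python
import spacy
from spacy.gold import docs_to_json

nlp = spacy.load("en_core_web_sm")   # assumes this model is installed
docs = [nlp("I like pizza."), nlp("Alice ate the pizza.")]

json_data = docs_to_json(docs)       # one JSON-serializable dict per Doc
assert len(json_data) == 2
assert "tokens" in json_data[0] and "ents" in json_data[0]
```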
def biluo_tags_from_offsets(doc, entities, missing='O'): def biluo_tags_from_offsets(doc, entities, missing='O'):

View File

@ -341,21 +341,3 @@ def test_lowest_common_ancestor(en_tokenizer):
assert lca[1, 1] == 1 assert lca[1, 1] == 1
assert lca[0, 1] == 2 assert lca[0, 1] == 2
assert lca[1, 2] == 2 assert lca[1, 2] == 2
def test_parse_tree(en_tokenizer):
"""Tests doc.print_tree() method."""
text = "I like New York in Autumn."
heads = [1, 0, 1, -2, -3, -1, -5]
tags = ["PRP", "IN", "NNP", "NNP", "IN", "NNP", "."]
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags)
# full method parse_tree(text) is a trivial composition
trees = doc.print_tree()
assert len(trees) > 0
tree = trees[0]
assert all(
k in list(tree.keys())
for k in ["word", "lemma", "NE", "POS_fine", "POS_coarse", "arc", "modifiers"]
)
assert tree["word"] == "like" # check root is correct

View File

@ -0,0 +1,65 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
from spacy.cli.schemas import get_schema, validate_json
from spacy.tokens import Doc
from ..util import get_doc
@pytest.fixture()
def doc(en_vocab):
words = ["c", "d", "e"]
pos = ["VERB", "NOUN", "NOUN"]
tags = ["VBP", "NN", "NN"]
heads = [0, -1, -2]
deps = ["ROOT", "dobj", "dobj"]
ents = [(1, 2, "ORG")]
return get_doc(
en_vocab, words=words, pos=pos, tags=tags, heads=heads, deps=deps, ents=ents
)
def test_doc_to_json(doc):
json_doc = doc.to_json()
assert json_doc["text"] == "c d e "
assert len(json_doc["tokens"]) == 3
assert json_doc["tokens"][0]["pos"] == "VERB"
assert json_doc["tokens"][0]["tag"] == "VBP"
assert json_doc["tokens"][0]["dep"] == "ROOT"
assert len(json_doc["ents"]) == 1
assert json_doc["ents"][0]["start"] == 2 # character offset!
assert json_doc["ents"][0]["end"] == 3 # character offset!
assert json_doc["ents"][0]["label"] == "ORG"
def test_doc_to_json_underscore(doc):
Doc.set_extension("json_test1", default=False)
Doc.set_extension("json_test2", default=False)
doc._.json_test1 = "hello world"
doc._.json_test2 = [1, 2, 3]
json_doc = doc.to_json(underscore=["json_test1", "json_test2"])
assert "_" in json_doc
assert json_doc["_"]["json_test1"] == "hello world"
assert json_doc["_"]["json_test2"] == [1, 2, 3]
def test_doc_to_json_underscore_error_attr(doc):
"""Test that Doc.to_json() raises an error if a custom attribute doesn't
exist in the ._ space."""
with pytest.raises(ValueError):
doc.to_json(underscore=["json_test3"])
def test_doc_to_json_underscore_error_serialize(doc):
"""Test that Doc.to_json() raises an error if a custom attribute value
isn't JSON-serializable."""
Doc.set_extension("json_test4", method=lambda doc: doc.text)
with pytest.raises(ValueError):
doc.to_json(underscore=["json_test4"])
def test_doc_to_json_valid_training(doc):
json_doc = doc.to_json()
errors = validate_json([json_doc], get_schema("training"))
assert not errors

View File

@ -3,7 +3,6 @@ from __future__ import unicode_literals
from spacy.matcher import PhraseMatcher from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc from spacy.tokens import Doc
from ..util import get_doc from ..util import get_doc

View File

@ -2,9 +2,7 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
from spacy.gold import docs_to_json
from spacy.tokens import Doc from spacy.tokens import Doc
from .util import get_doc
def test_gold_biluo_U(en_vocab): def test_gold_biluo_U(en_vocab):
@ -52,34 +50,3 @@ def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
assert biluo_tags_converted == biluo_tags assert biluo_tags_converted == biluo_tags
offsets_converted = offsets_from_biluo_tags(doc, biluo_tags) offsets_converted = offsets_from_biluo_tags(doc, biluo_tags)
assert offsets_converted == offsets assert offsets_converted == offsets
def test_docs_to_json(en_vocab):
"""Test we can convert a list of Doc objects into the JSON-serializable
format we use for training.
"""
docs = [
get_doc(
en_vocab,
words=["a", "b"],
pos=["VBP", "NN"],
heads=[0, -1],
deps=["ROOT", "dobj"],
ents=[],
),
get_doc(
en_vocab,
words=["c", "d", "e"],
pos=["VBP", "NN", "NN"],
heads=[0, -1, -2],
deps=["ROOT", "dobj", "dobj"],
ents=[(1, 2, "ORG")],
),
]
json_doc = docs_to_json(0, docs)
assert json_doc["id"] == 0
assert len(json_doc["paragraphs"]) == 2
assert len(json_doc["paragraphs"][0]["sentences"]) == 1
assert len(json_doc["paragraphs"][1]["sentences"]) == 1
assert len(json_doc["paragraphs"][0]["sentences"][0]["tokens"]) == 2
assert len(json_doc["paragraphs"][1]["sentences"][0]["tokens"]) == 3

View File

@ -0,0 +1,44 @@
# coding: utf-8
from __future__ import unicode_literals
from spacy.cli.schemas import validate_json, get_schema
import pytest
@pytest.fixture(scope="session")
def training_schema():
return get_schema("training")
def test_json_schema_get():
schema = get_schema("training")
assert schema
with pytest.raises(ValueError):
schema = get_schema("xxx")
@pytest.mark.parametrize(
"data",
[
{"text": "Hello world"},
{"text": "Hello", "ents": [{"start": 0, "end": 5, "label": "TEST"}]},
],
)
def test_json_schema_training_valid(data, training_schema):
errors = validate_json([data], training_schema)
assert not errors
@pytest.mark.parametrize(
"data,n_errors",
[
({"spans": []}, 1),
({"text": "Hello", "ents": [{"start": "0", "end": "5", "label": "TEST"}]}, 2),
({"text": "Hello", "ents": [{"start": 0, "end": 5}]}, 1),
({"text": "Hello", "ents": [{"start": 0, "end": 5, "label": "test"}]}, 1),
({"text": "spaCy", "tokens": [{"pos": "PROPN"}]}, 2),
],
)
def test_json_schema_training_invalid(data, n_errors, training_schema):
errors = validate_json([data], training_schema)
assert len(errors) == n_errors
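
Based on the cases exercised above, a short sketch of validating training data against the bundled schema before running `spacy train`:

```python
from spacy.cli.schemas import get_schema, validate_json

schema = get_schema("training")
data = [
    {"text": "Hello world"},
    {"text": "Hello", "ents": [{"start": 0, "end": 5, "label": "TEST"}]},
]
errors = validate_json(data, schema)
if errors:
    print("\n".join(errors))
else:
    print("Training data looks valid")
```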

View File

@ -1,7 +1,6 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest import pytest
from pathlib import Path from pathlib import Path
from spacy import util from spacy import util

View File

@ -20,7 +20,6 @@ from .span cimport Span
from .token cimport Token from .token cimport Token
from .span cimport Span from .span cimport Span
from .token cimport Token from .token cimport Token
from .printers import parse_tree
from ..lexeme cimport Lexeme, EMPTY_LEXEME from ..lexeme cimport Lexeme, EMPTY_LEXEME
from ..typedefs cimport attr_t, flags_t from ..typedefs cimport attr_t, flags_t
from ..attrs import intify_attrs, IDS from ..attrs import intify_attrs, IDS
@ -29,7 +28,7 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB
from ..attrs cimport ENT_TYPE, SENT_START from ..attrs cimport ENT_TYPE, SENT_START
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
from ..util import normalize_slice from ..util import normalize_slice, is_json_serializable
from ..compat import is_config, copy_reg, pickle, basestring_ from ..compat import is_config, copy_reg, pickle, basestring_
from ..errors import deprecation_warning, models_warning, user_warning from ..errors import deprecation_warning, models_warning, user_warning
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
@ -959,31 +958,48 @@ cdef class Doc:
return self[start] return self[start]
def print_tree(self, light=False, flat=False): def print_tree(self, light=False, flat=False):
"""Returns the parse trees in JSON (dict) format. raise ValueError(Errors.E105)
light (bool): Don't include lemmas or entities. def to_json(self, underscore=None):
flat (bool): Don't include arcs or modifiers. """Convert a Doc to JSON. Produces the same format used by the spacy
RETURNS (dict): Parse tree as dict. train command.
EXAMPLE: underscore (list): Optional list of string names of custom doc._.
>>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.') attributes. Attribute values need to be JSON-serializable. Values will
>>> trees = doc.print_tree() be added to an "_" key in the data, e.g. "_": {"foo": "bar"}.
>>> trees[1] RETURNS (dict): The data in spaCy's JSON format.
{'modifiers': [
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice',
'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP',
'lemma': 'Alice'},
{'modifiers': [
{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN',
'POS_fine': 'NN', 'lemma': 'pizza'},
{'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct',
'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}],
'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
'POS_fine': 'VBD', 'lemma': 'eat'}
""" """
return parse_tree(self, light=light, flat=flat) data = {'text': self.text}
data['ents'] = [{'start': ent.start_char, 'end': ent.end_char,
'label': ent.label_} for ent in self.ents]
sents = list(self.sents)
if sents:
data['sents'] = [{'start': sent.start_char, 'end': sent.end_char}
for sent in sents]
if self.cats:
data['cats'] = self.cats
data['tokens'] = []
for token in self:
token_data = {'id': token.i, 'start': token.idx, 'end': token.idx + len(token)}
if token.pos_:
token_data['pos'] = token.pos_
if token.tag_:
token_data['tag'] = token.tag_
if token.dep_:
token_data['dep'] = token.dep_
if token.head:
token_data['head'] = token.head.i
data['tokens'].append(token_data)
if underscore:
data['_'] = {}
for attr in underscore:
if not self.has_extension(attr):
raise ValueError(Errors.E106.format(attr=attr, opts=underscore))
value = self._.get(attr)
if not is_json_serializable(value):
raise ValueError(Errors.E107.format(attr=attr, value=repr(value)))
data['_'][attr] = value
return data
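
A brief usage sketch of the new `Doc.to_json` method, including the `underscore` option for custom extension attributes (the extension name and model are made up for illustration):

```python
import spacy
from spacy.tokens import Doc

# Register a custom extension with a JSON-serializable value
Doc.set_extension("source", default=None)

nlp = spacy.load("en_core_web_sm")   # assumes this model is installed
doc = nlp("Bob brought Alice the pizza.")
doc._.source = "example.txt"

data = doc.to_json(underscore=["source"])
# "text", "ents", "sents" and "tokens" follow the training format;
# custom attribute values end up under the "_" key
assert data["_"]["source"] == "example.txt"
```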
cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2: cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2:

View File

@ -1,74 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
from .doc import Doc
from ..symbols import HEAD, TAG, DEP, ENT_IOB, ENT_TYPE
def merge_ents(doc):
"""Helper: merge adjacent entities into single tokens; modifies the doc."""
for ent in doc.ents:
ent.merge(tag=ent.root.tag_, lemma=ent.text, ent_type=ent.label_)
return doc
def format_POS(token, light, flat):
"""Helper: form the POS output for a token."""
subtree = dict([
("word", token.text),
("lemma", token.lemma_), # trigger
("NE", token.ent_type_), # trigger
("POS_fine", token.tag_),
("POS_coarse", token.pos_),
("arc", token.dep_),
("modifiers", [])
])
if light:
subtree.pop("lemma")
subtree.pop("NE")
if flat:
subtree.pop("arc")
subtree.pop("modifiers")
return subtree
def POS_tree(root, light=False, flat=False):
"""Helper: generate a POS tree for a root token. The doc must have
`merge_ents(doc)` ran on it.
"""
subtree = format_POS(root, light=light, flat=flat)
for c in root.children:
subtree["modifiers"].append(POS_tree(c))
return subtree
def parse_tree(doc, light=False, flat=False):
"""Make a copy of the doc and construct a syntactic parse tree similar to
displaCy. Generates the POS tree for all sentences in a doc.
doc (Doc): The doc for parsing.
RETURNS (dict): The parse tree.
EXAMPLE:
>>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
>>> trees = doc.print_tree()
>>> trees[1]
{'modifiers': [
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
{'modifiers': [
{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN',
'POS_fine': 'NN', 'lemma': 'pizza'},
{'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct',
'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}],
'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
'POS_fine': 'VBD', 'lemma': 'eat'}
"""
doc_clone = Doc(doc.vocab, words=[w.text for w in doc])
doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE],
doc.to_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE]))
merge_ents(doc_clone) # merge the entities into single tokens first
return [POS_tree(sent.root, light=light, flat=flat)
for sent in doc_clone.sents]

View File

@ -7,8 +7,6 @@ import pkg_resources
import importlib import importlib
import regex as re import regex as re
from pathlib import Path from pathlib import Path
import sys
import textwrap
import random import random
from collections import OrderedDict from collections import OrderedDict
from thinc.neural._classes.model import Model from thinc.neural._classes.model import Model
@ -18,9 +16,10 @@ import cytoolz
import itertools import itertools
import numpy.random import numpy.random
from .symbols import ORTH from .symbols import ORTH
from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_ from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_
from .compat import import_file from .compat import import_file, json_dumps
from .errors import Errors from .errors import Errors
# Import these directly from Thinc, so that we're sure we always have the # Import these directly from Thinc, so that we're sure we always have the
@ -541,6 +540,16 @@ def read_json(location):
return ujson.load(f) return ujson.load(f)
def write_json(file_path, contents):
"""Create a .json file and dump contents.
file_path (unicode / Path): The path to the output file.
contents: The JSON-serializable contents to output.
"""
with Path(file_path).open("w", encoding="utf8") as f:
f.write(json_dumps(contents))
def read_jsonl(file_path): def read_jsonl(file_path):
"""Read a .jsonl file and yield its contents line by line. """Read a .jsonl file and yield its contents line by line.
@ -555,6 +564,29 @@ def read_jsonl(file_path):
continue continue
def write_jsonl(file_path, lines):
"""Create a .jsonl file and dump contents.
file_path (unicode / Path): The path to the output file.
lines (list): The JSON-serializable contents of each line.
"""
data = [json_dumps(line) for line in lines]
with Path(file_path).open("w", encoding="utf-8") as f:
f.write("\n".join(data))
def is_json_serializable(obj):
"""Check if a Python object is JSON-serializable."""
if hasattr(obj, "__call__"):
# Check this separately here to prevent infinite recursions
return False
try:
ujson.dumps(obj)
return True
except TypeError:
return False
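
A quick round-trip sketch of the new JSON helpers (the file paths are arbitrary):

```python
from spacy.util import write_jsonl, read_jsonl, write_json, is_json_serializable

lines = [{"text": "Hello world"}, {"text": "Hello", "label": "GREETING"}]
assert all(is_json_serializable(line) for line in lines)

write_jsonl("/tmp/data.jsonl", lines)            # one JSON object per line
assert list(read_jsonl("/tmp/data.jsonl")) == lines

write_json("/tmp/data.json", {"entries": lines})  # single JSON file
```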
def get_raw_input(description, default=False): def get_raw_input(description, default=False):
"""Get user input from the command line via raw_input / input. """Get user input from the command line via raw_input / input.
@ -602,21 +634,6 @@ def from_disk(path, readers, exclude):
return path return path
def print_table(data, title=None):
"""Print data in table format.
data (dict or list of tuples): Label/value pairs.
title (unicode or None): Title, will be printed above.
"""
if isinstance(data, dict):
data = list(data.items())
tpl_row = " {:<15}" * len(data[0])
table = "\n".join([tpl_row.format(l, unicode_(v)) for l, v in data])
if title:
print("\n \033[93m{}\033[0m".format(title))
print("\n{}\n".format(table))
def print_markdown(data, title=None): def print_markdown(data, title=None):
"""Print data in GitHub-flavoured Markdown format for issues etc. """Print data in GitHub-flavoured Markdown format for issues etc.
@ -638,44 +655,6 @@ def print_markdown(data, title=None):
print("\n{}\n".format("\n".join(markdown))) print("\n{}\n".format("\n".join(markdown)))
def prints(*texts, **kwargs):
"""Print formatted message (manual ANSI escape sequences to avoid
dependency)
*texts (unicode): Texts to print. Each argument is rendered as paragraph.
**kwargs: 'title' becomes coloured headline. exits=True performs sys exit.
"""
exits = kwargs.get("exits", None)
title = kwargs.get("title", None)
title = "\033[93m{}\033[0m\n".format(_wrap(title)) if title else ""
message = "\n\n".join([_wrap(text) for text in texts])
print("\n{}{}\n".format(title, message))
if exits is not None:
sys.exit(exits)
def _wrap(text, wrap_max=80, indent=4):
"""Wrap text at given width using textwrap module.
text (unicode): Text to wrap. If it's a Path, it's converted to string.
wrap_max (int): Maximum line length (indent is deducted).
indent (int): Number of spaces for indentation.
RETURNS (unicode): Wrapped text.
"""
indent = indent * " "
wrap_width = wrap_max - len(indent)
if isinstance(text, Path):
text = path2str(text)
return textwrap.fill(
text,
width=wrap_width,
initial_indent=indent,
subsequent_indent=indent,
break_long_words=False,
break_on_hyphens=False,
)
def minify_html(html): def minify_html(html):
"""Perform a template-specific, rudimentary HTML minification for displaCy. """Perform a template-specific, rudimentary HTML minification for displaCy.
Disclaimer: NOT a general-purpose solution, only removes indentation and Disclaimer: NOT a general-purpose solution, only removes indentation and

View File

@ -320,37 +320,6 @@ p
+cell dict +cell dict
+cell Combined tokenizer exceptions. +cell Combined tokenizer exceptions.
+h(3, "util.prints") util.prints
+tag function
+tag-new(2)
p
| Print a formatted, text-wrapped message with optional title. If a text
| argument is a #[code Path], it's converted to a string. Should only
| be used for interactive components like the command-line interface.
+aside-code("Example").
data_path = Path('/some/path')
if not path.exists():
util.prints("Can't find the path.", data_path,
title="Error", exits=1)
+table(["Name", "Type", "Description"])
+row
+cell #[code *texts]
+cell unicode
+cell Texts to print. Each argument is rendered as paragraph.
+row
+cell #[code **kwargs]
+cell -
+cell
| #[code title] is rendered as coloured headline. #[code exits]
| performs system exit after printing, using the value of the
| argument as the exit code, e.g. #[code exits=1].
+h(3, "util.minibatch") util.minibatch +h(3, "util.minibatch") util.minibatch
+tag function +tag function
+tag-new(2) +tag-new(2)

View File

@ -257,10 +257,19 @@ p
| to allow packaging the model using the | to allow packaging the model using the
| #[+api("cli#package") #[code package]] command. | #[+api("cli#package") #[code package]] command.
+infobox("Changed in v2.1", "⚠️")
| As of spaCy 2.1, the #[code --no-tagger], #[code --no-parser] and
| #[code --no-entities] flags have been replaced by a #[code --pipeline]
| option, which lets you define comma-separated names of pipeline
| components to train. For example, #[code --pipeline tagger,parser] will
| only train the tagger and parser.
+code(false, "bash", "$", false, false, true). +code(false, "bash", "$", false, false, true).
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] python -m spacy train [lang] [output_path] [train_path] [dev_path]
[--n-sents] [--use-gpu] [--meta-path] [--vectors] [--no-tagger] [--no-parser] [--base-model] [--pipeline] [--vectors] [--n-iter] [--n-examples] [--use-gpu]
[--no-entities] [--gold-preproc] [--verbose] [--version] [--meta-path] [--init-tok2vec] [--parser-multitasks]
[--entity-multitasks] [--gold-preproc] [--noise-level] [--learn-tokens]
[--verbose]
+table(["Argument", "Type", "Description"]) +table(["Argument", "Type", "Description"])
+row +row
@ -269,34 +278,34 @@ p
+cell Model language. +cell Model language.
+row +row
+cell #[code output_dir] +cell #[code output_path]
+cell positional +cell positional
+cell Directory to store model in. +cell Directory to store model in. Will be created if it doesn't exist.
+row +row
+cell #[code train_data] +cell #[code train_path]
+cell positional +cell positional
+cell Location of JSON-formatted training data. +cell Location of JSON-formatted training data.
+row +row
+cell #[code dev_data] +cell #[code dev_path]
+cell positional +cell positional
+cell Location of JSON-formatted development data for evaluation. +cell Location of JSON-formatted development data for evaluation.
+row +row
+cell #[code --n-iter], #[code -n] +cell #[code --base-model], #[code -b]
+cell option +cell option
+cell Number of iterations (default: #[code 30]). +cell
| Optional name of base model to update. Can be any loadable
| spaCy model.
+row +row
+cell #[code --n-sents], #[code -ns] +cell #[code --pipeline], #[code -p]
+tag-new("2.1.0")
+cell option +cell option
+cell Number of sentences (default: #[code 0]). +cell
| Comma-separated names of pipeline components to train. Defaults
+row | to #[code 'tagger,parser,ner'].
+cell #[code --use-gpu], #[code -g]
+cell option
+cell Use GPU.
+row +row
+cell #[code --vectors], #[code -v] +cell #[code --vectors], #[code -v]
@ -304,13 +313,21 @@ p
+cell Model to load vectors from. +cell Model to load vectors from.
+row +row
+cell #[code --meta-path], #[code -m] +cell #[code --n-iter], #[code -n]
+cell option
+cell Number of iterations (default: #[code 30]).
+row
+cell #[code --n-examples], #[code -ns]
+cell option
+cell Number of examples to use (defaults to #[code 0] for all examples).
+row
+cell #[code --use-gpu], #[code -g]
+cell option +cell option
+cell +cell
| #[+tag-new(2)] Optional path to model | Whether to use GPU. Can be either #[code 0], #[code 1] or
| #[+a("/usage/training#models-generating") #[code meta.json]]. | #[code -1].
| All relevant properties like #[code lang], #[code pipeline] and
| #[code spacy_version] will be overwritten.
+row +row
+cell #[code --version], #[code -V] +cell #[code --version], #[code -V]
@ -320,40 +337,69 @@ p
| #[code meta.json] after training. | #[code meta.json] after training.
+row +row
+cell #[code --no-tagger], #[code -T] +cell #[code --meta-path], #[code -m]
+cell flag +tag-new(2)
+cell Don't train tagger. +cell option
+cell
| Optional path to model
| #[+a("/usage/training#models-generating") #[code meta.json]].
| All relevant properties like #[code lang], #[code pipeline] and
| #[code spacy_version] will be overwritten.
+row +row
+cell #[code --no-parser], #[code -P] +cell #[code --init-tok2vec], #[code -t2v]
+cell flag +tag-new("2.1.0")
+cell Don't train parser. +cell option
+cell
| Path to pretrained weights for the token-to-vector parts of the
| models. See #[code spacy pretrain]. Experimental.
+row +row
+cell #[code --no-entities], #[code -N] +cell #[code --parser-multitasks], #[code -pt]
+cell flag +cell option
+cell Don't train NER. +cell
| Side objectives for parser CNN, e.g. #[code 'dep'] or
| #[code 'dep,tag']
+row
+cell #[code --entity-multitasks], #[code -et]
+cell option
+cell
| Side objectives for NER CNN, e.g. #[code 'dep'] or
| #[code 'dep,tag']
+row
+cell #[code --noise-level], #[code -nl]
+cell option
+cell Float indicating the amount of corruption for data augmentation.
+row +row
+cell #[code --gold-preproc], #[code -G] +cell #[code --gold-preproc], #[code -G]
+cell flag +cell flag
+cell Use gold preprocessing. +cell Use gold preprocessing.
+row
+cell #[code --learn-tokens], #[code -T]
+cell flag
+cell
| Make parser learn gold-standard tokenization by merging
| subtokens. Typically used for languages like Chinese.
+row
+cell #[code --verbose], #[code -VV]
+tag-new("2.0.13")
+cell flag
+cell Show more detailed messages during training.
+row +row
+cell #[code --help], #[code -h] +cell #[code --help], #[code -h]
+cell flag +cell flag
+cell Show help message and available arguments. +cell Show help message and available arguments.
+row
+cell #[code --verbose]
+tag-new("2.0.13")
+cell flag
+cell Show more detailed messages during training.
+row("foot") +row("foot")
+cell creates +cell creates
+cell model, pickle +cell model, pickle
+cell A spaCy model on each epoch, and a final #[code .pickle] file. +cell A spaCy model on each epoch.
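
For reference, a sketch of invoking the updated command with the new options, driven from Python via `subprocess`; all paths and the base model name are placeholders:

```python
import subprocess
import sys

# Train only the tagger and parser, starting from an existing base model.
subprocess.check_call([
    sys.executable, "-m", "spacy", "train", "en",
    "/tmp/model-out",      # output_path (created if it doesn't exist)
    "/tmp/train.json",     # train_path: JSON-formatted training data
    "/tmp/dev.json",       # dev_path: JSON-formatted development data
    "--base-model", "en_core_web_sm",
    "--pipeline", "tagger,parser",
    "--n-iter", "10",
])
```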
+h(4, "train-hyperparams") Environment variables for hyperparameters +h(4, "train-hyperparams") Environment variables for hyperparameters
+tag-new(2) +tag-new(2)