Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Matthew Honnibal 2017-05-22 05:39:18 -05:00
commit 70a8c531cd
17 changed files with 206 additions and 348 deletions

View File

@@ -3,127 +3,21 @@ from __future__ import print_function
 # NB! This breaks in plac on Python 2!!
 #from __future__ import unicode_literals
-import plac
-from spacy.cli import download as cli_download
-from spacy.cli import link as cli_link
-from spacy.cli import info as cli_info
-from spacy.cli import package as cli_package
-from spacy.cli import train as cli_train
-from spacy.cli import model as cli_model
-from spacy.cli import convert as cli_convert
-@plac.annotations(
-    model=("model to download (shortcut or model name)", "positional", None, str),
-    direct=("force direct download. Needs model name with version and won't "
-            "perform compatibility check", "flag", "d", bool)
-)
-def download(model, direct=False):
-    """
-    Download compatible model from default download path using pip. Model
-    can be shortcut, model name or, if --direct flag is set, full model name
-    with version.
-    """
-    cli_download(model, direct)
-@plac.annotations(
-    origin=("package name or local path to model", "positional", None, str),
-    link_name=("name of shortuct link to create", "positional", None, str),
-    force=("force overwriting of existing link", "flag", "f", bool)
-)
-def link(origin, link_name, force=False):
-    """
-    Create a symlink for models within the spacy/data directory. Accepts
-    either the name of a pip package, or the local path to the model data
-    directory. Linking models allows loading them via spacy.load(link_name).
-    """
-    cli_link(origin, link_name, force)
-@plac.annotations(
-    model=("optional: shortcut link of model", "positional", None, str),
-    markdown=("generate Markdown for GitHub issues", "flag", "md", str)
-)
-def info(model=None, markdown=False):
-    """
-    Print info about spaCy installation. If a model shortcut link is
-    speficied as an argument, print model information. Flag --markdown
-    prints details in Markdown for easy copy-pasting to GitHub issues.
-    """
-    cli_info(model, markdown)
-@plac.annotations(
-    input_dir=("directory with model data", "positional", None, str),
-    output_dir=("output parent directory", "positional", None, str),
-    meta=("path to meta.json", "option", "m", str),
-    force=("force overwriting of existing folder in output directory", "flag", "f", bool)
-)
-def package(input_dir, output_dir, meta=None, force=False):
-    """
-    Generate Python package for model data, including meta and required
-    installation files. A new directory will be created in the specified
-    output directory, and model data will be copied over.
-    """
-    cli_package(input_dir, output_dir, meta, force)
-@plac.annotations(
-    input_file=("input file", "positional", None, str),
-    output_dir=("output directory for converted file", "positional", None, str),
-    n_sents=("Number of sentences per doc", "option", "n", float),
-    morphology=("Enable appending morphology to tags", "flag", "m", bool)
-)
-def convert(input_file, output_dir, n_sents=10, morphology=False):
-    """
-    Convert files into JSON format for use with train command and other
-    experiment management functions.
-    """
-    cli_convert(input_file, output_dir, n_sents, morphology)
-@plac.annotations(
-    lang=("model language", "positional", None, str),
-    output_dir=("output directory to store model in", "positional", None, str),
-    train_data=("location of JSON-formatted training data", "positional", None, str),
-    dev_data=("location of JSON-formatted development data (optional)", "positional", None, str),
-    n_iter=("number of iterations", "option", "n", int),
-    nsents=("number of sentences", "option", None, int),
-    use_gpu=("Use GPU", "flag", "g", bool),
-    no_tagger=("Don't train tagger", "flag", "T", bool),
-    no_parser=("Don't train parser", "flag", "P", bool),
-    no_entities=("Don't train NER", "flag", "N", bool)
-)
-def train(lang, output_dir, train_data, dev_data=None, n_iter=15,
-          nsents=0, use_gpu=False,
-          no_tagger=False, no_parser=False, no_entities=False):
-    """
-    Train a model. Expects data in spaCy's JSON format.
-    """
-    nsents = nsents or None
-    cli_train(lang, output_dir, train_data, dev_data, n_iter, nsents,
-              use_gpu, no_tagger, no_parser, no_entities)
 if __name__ == '__main__':
     import plac
     import sys
-    commands = {
-        'train': train,
-        'convert': convert,
-        'download': download,
-        'link': link,
-        'info': info,
-        'package': package,
-    }
+    from spacy.cli import download, link, info, package, train, convert
+    from spacy.util import prints
+
+    commands = {'download': download, 'link': link, 'info': info, 'train': train,
+                'convert': convert, 'package': package}
     if len(sys.argv) == 1:
-        print("Available commands: %s" % ', '.join(sorted(commands)))
-        sys.exit(1)
+        prints(', '.join(commands), title="Available commands", exits=1)
     command = sys.argv.pop(1)
     sys.argv[0] = 'spacy %s' % command
    if command in commands:
        plac.call(commands[command])
    else:
-        print("Unknown command: %s. Available: %s" % (command, ', '.join(commands)))
-        sys.exit(1)
+        prints("Available: %s" % ', '.join(commands),
+               title="Unknown command: %s" % command, exits=1)
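For context, the new entry point above delegates all argument handling to plac: each CLI function carries @plac.annotations, and plac.call() builds the parser from them after the subcommand name has been popped off sys.argv. A minimal, self-contained sketch of the same dispatch pattern (the "greet" command and "demo" program name are illustrative, not part of this commit):

import sys
import plac

@plac.annotations(
    name=("name to greet", "positional", None, str),
    loud=("shout the greeting", "flag", "l", bool))
def greet(name, loud=False):
    """Print a greeting."""
    print(name.upper() if loud else name)

commands = {'greet': greet}
if len(sys.argv) == 1:
    print("Available commands: %s" % ', '.join(commands))
    sys.exit(1)
command = sys.argv.pop(1)          # e.g. `python demo.py greet World -l`
sys.argv[0] = 'demo %s' % command  # so --help reports "demo greet" as the program name
if command in commands:
    plac.call(commands[command])   # plac builds the argument parser from the annotations
else:
    print("Unknown command: %s" % command)
    sys.exit(1)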

View File

@@ -3,5 +3,4 @@ from .info import info
 from .link import link
 from .package import package
 from .train import train
-from .model import model
 from .convert import convert

View File

@@ -1,6 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
+import plac
 from pathlib import Path
 from .converters import conllu2json, iob2json
@@ -18,15 +19,24 @@ CONVERTERS = {
 }
-def convert(input_file, output_dir, *args):
+@plac.annotations(
+    input_file=("input file", "positional", None, str),
+    output_dir=("output directory for converted file", "positional", None, str),
+    n_sents=("Number of sentences per doc", "option", "n", float),
+    morphology=("Enable appending morphology to tags", "flag", "m", bool)
+)
+def convert(input_file, output_dir, n_sents, morphology):
+    """Convert files into JSON format for use with train command and other
+    experiment management functions.
+    """
     input_path = Path(input_file)
     output_path = Path(output_dir)
     if not input_path.exists():
-        prints(input_path, title="Input file not found", exits=True)
+        prints(input_path, title="Input file not found", exits=1)
     if not output_path.exists():
-        prints(output_path, title="Output directory not found", exits=True)
+        prints(output_path, title="Output directory not found", exits=1)
     file_ext = input_path.suffix
     if not file_ext in CONVERTERS:
         prints("Can't find converter for %s" % input_path.parts[-1],
-               title="Unknown format", exits=True)
+               title="Unknown format", exits=1)
     CONVERTERS[file_ext](input_path, output_path, *args)
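The CONVERTERS mapping above keys converter functions by file extension, so adding a format is a one-line registration. A stand-alone sketch of that dispatch (the stub converter below only illustrates the lookup; it is not spaCy's implementation):

from pathlib import Path

def conllu2json(input_path, output_path):
    # Stand-in for the real converter: just report what would be converted.
    print("converting %s -> %s" % (input_path, output_path))

CONVERTERS = {'.conllu': conllu2json}   # extension-to-converter registry

input_path = Path('train.conllu')
converter = CONVERTERS.get(input_path.suffix)
if converter is None:
    raise ValueError("Unknown format: %s" % input_path.parts[-1])
converter(input_path, Path('output'))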

View File

@@ -1,6 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
+import plac
 import requests
 import os
 import subprocess
@@ -11,7 +12,16 @@ from ..util import prints
 from .. import about
+@plac.annotations(
+    model=("model to download (shortcut or model name)", "positional", None, str),
+    direct=("force direct download. Needs model name with version and won't "
+            "perform compatibility check", "flag", "d", bool)
+)
 def download(model, direct=False):
+    """Download compatible model from default download path using pip. Model
+    can be shortcut, model name or, if --direct flag is set, full model name
+    with version.
+    """
     if direct:
         download_model('{m}/{m}.tar.gz'.format(m=model))
     else:
@@ -38,7 +48,7 @@ def get_json(url, desc):
     if r.status_code != 200:
         prints("Couldn't fetch %s. Please find a model for your spaCy installation "
                "(v%s), and download it manually." % (desc, about.__version__),
-               about.__docs_models__, title="Server error (%d)" % r.status_code, exits=True)
+               about.__docs_models__, title="Server error (%d)" % r.status_code, exits=1)
     return r.json()
@@ -48,7 +58,7 @@ def get_compatibility():
     comp = comp_table['spacy']
     if version not in comp:
         prints("No compatible models found for v%s of spaCy." % version,
-               title="Compatibility error", exits=True)
+               title="Compatibility error", exits=1)
     return comp[version]
@@ -56,7 +66,7 @@ def get_version(model, comp):
     if model not in comp:
         version = about.__version__
         prints("No compatible model found for '%s' (spaCy v%s)." % (model, version),
-               title="Compatibility error", exits=True)
+               title="Compatibility error", exits=1)
     return comp[model][0]
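The compatibility lookup above assumes a table fetched from the model server in which each model name maps to a list of compatible versions, newest first. A hedged sketch of that logic (the table contents are invented, and the error path is simplified to a raise instead of prints(..., exits=1)):

def get_version(model, comp):
    """Return the newest model version compatible with this spaCy install."""
    if model not in comp:
        raise ValueError("No compatible model found for %r" % model)
    return comp[model][0]   # assumed to be sorted newest-first

comp = {'en_core_web_sm': ['1.2.0', '1.1.0']}   # invented example data
assert get_version('en_core_web_sm', comp) == '1.2.0'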

View File

@@ -1,6 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
+import plac
 import platform
 from pathlib import Path
@@ -9,7 +10,15 @@ from .. import about
 from .. import util
+@plac.annotations(
+    model=("optional: shortcut link of model", "positional", None, str),
+    markdown=("generate Markdown for GitHub issues", "flag", "md", str)
+)
 def info(model=None, markdown=False):
+    """Print info about spaCy installation. If a model shortcut link is
+    speficied as an argument, print model information. Flag --markdown
+    prints details in Markdown for easy copy-pasting to GitHub issues.
+    """
     if model:
         model_path = util.resolve_model_path(model)
         meta = util.parse_package_meta(model_path)

View File

@@ -1,24 +1,35 @@
 # coding: utf8
 from __future__ import unicode_literals
+import plac
 from pathlib import Path
 from ..compat import symlink_to, path2str
 from ..util import prints
 from .. import util
+@plac.annotations(
+    origin=("package name or local path to model", "positional", None, str),
+    link_name=("name of shortuct link to create", "positional", None, str),
+    force=("force overwriting of existing link", "flag", "f", bool)
+)
 def link(origin, link_name, force=False):
+    """Create a symlink for models within the spacy/data directory. Accepts
+    either the name of a pip package, or the local path to the model data
+    directory. Linking models allows loading them via spacy.load(link_name).
+    """
     if util.is_package(origin):
         model_path = util.get_model_package_path(origin)
     else:
         model_path = Path(origin)
     if not model_path.exists():
         prints("The data should be located in %s" % path2str(model_path),
-               title="Can't locate model data", exits=True)
+               title="Can't locate model data", exits=1)
     link_path = util.get_data_path() / link_name
     if link_path.exists() and not force:
         prints("To overwrite an existing link, use the --force flag.",
-               title="Link %s already exists" % link_name, exits=True)
+               title="Link %s already exists" % link_name, exits=1)
     elif link_path.exists():
         link_path.unlink()
     try:

View File

@@ -1,122 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-import gzip
-import math
-from ast import literal_eval
-from preshed.counter import PreshCounter
-from ..vocab import write_binary_vectors
-from ..compat import fix_text, path2str
-from ..util import prints
-from .. import util
-def model(lang, model_dir, freqs_data, clusters_data, vectors_data):
-    model_path = util.ensure_path(model_dir)
-    freqs_path = util.ensure_path(freqs_data)
-    clusters_path = util.ensure_path(clusters_data)
-    vectors_path = util.ensure_path(vectors_data)
-    if not freqs_path.is_file():
-        prints(freqs_path, title="No frequencies file found", exits=True)
-    if clusters_path and not clusters_path.is_file():
-        prints(clusters_path, title="No Brown clusters file found", exits=True)
-    if vectors_path and not vectors_path.is_file():
-        prints(vectors_path, title="No word vectors file found", exits=True)
-    vocab = util.get_lang_class(lang).Defaults.create_vocab()
-    probs, oov_prob = read_probs(freqs_path)
-    clusters = read_clusters(clusters_path) if clusters_path else {}
-    populate_vocab(vocab, clusters, probs, oov_prob)
-    create_model(model_path, vectors_path, vocab, oov_prob)
-def create_model(model_path, vectors_path, vocab, oov_prob):
-    vocab_path = model_path / 'vocab'
-    lexemes_path = vocab_path / 'lexemes.bin'
-    strings_path = vocab_path / 'strings.json'
-    oov_path = vocab_path / 'oov_prob'
-    if not model_path.exists():
-        model_path.mkdir()
-    if not vocab_path.exists():
-        vocab_path.mkdir()
-    vocab.dump(path2str(lexemes_path))
-    with strings_path.open('w') as f:
-        vocab.strings.dump(f)
-    with oov_path.open('w') as f:
-        f.write('%f' % oov_prob)
-    if vectors_path:
-        vectors_dest = vocab_path / 'vec.bin'
-        write_binary_vectors(path2str(vectors_path), path2str(vectors_dest))
-def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
-    counts = PreshCounter()
-    total = 0
-    freqs_file = check_unzip(freqs_path)
-    for i, line in enumerate(freqs_file):
-        freq, doc_freq, key = line.rstrip().split('\t', 2)
-        freq = int(freq)
-        counts.inc(i+1, freq)
-        total += freq
-    counts.smooth()
-    log_total = math.log(total)
-    freqs_file = check_unzip(freqs_path)
-    probs = {}
-    for line in freqs_file:
-        freq, doc_freq, key = line.rstrip().split('\t', 2)
-        doc_freq = int(doc_freq)
-        freq = int(freq)
-        if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
-            word = literal_eval(key)
-            smooth_count = counts.smoother(int(freq))
-            probs[word] = math.log(smooth_count) - log_total
-    oov_prob = math.log(counts.smoother(0)) - log_total
-    return probs, oov_prob
-def read_clusters(clusters_path):
-    clusters = {}
-    with clusters_path.open() as f:
-        for line in f:
-            try:
-                cluster, word, freq = line.split()
-                word = fix_text(word)
-            except ValueError:
-                continue
-            # If the clusterer has only seen the word a few times, its
-            # cluster is unreliable.
-            if int(freq) >= 3:
-                clusters[word] = cluster
-            else:
-                clusters[word] = '0'
-    # Expand clusters with re-casing
-    for word, cluster in list(clusters.items()):
-        if word.lower() not in clusters:
-            clusters[word.lower()] = cluster
-        if word.title() not in clusters:
-            clusters[word.title()] = cluster
-        if word.upper() not in clusters:
-            clusters[word.upper()] = cluster
-    return clusters
-def populate_vocab(vocab, clusters, probs, oov_prob):
-    for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
-        lexeme = vocab[word]
-        lexeme.prob = prob
-        lexeme.is_oov = False
-        # Decode as a little-endian string, so that we can do & 15 to get
-        # the first 4 bits. See _parse_features.pyx
-        if word in clusters:
-            lexeme.cluster = int(clusters[word][::-1], 2)
-        else:
-            lexeme.cluster = 0
-def check_unzip(file_path):
-    file_path_str = path2str(file_path)
-    if file_path_str.endswith('gz'):
-        return gzip.open(file_path_str)
-    else:
-        return file_path.open()

View File

@@ -1,6 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
+import plac
 import shutil
 import requests
 from pathlib import Path
@@ -11,16 +12,26 @@ from .. import util
 from .. import about
-def package(input_dir, output_dir, meta_path, force):
+@plac.annotations(
+    input_dir=("directory with model data", "positional", None, str),
+    output_dir=("output parent directory", "positional", None, str),
+    meta=("path to meta.json", "option", "m", str),
+    force=("force overwriting of existing folder in output directory", "flag", "f", bool)
+)
+def package(input_dir, output_dir, meta, force):
+    """Generate Python package for model data, including meta and required
+    installation files. A new directory will be created in the specified
+    output directory, and model data will be copied over.
+    """
     input_path = util.ensure_path(input_dir)
     output_path = util.ensure_path(output_dir)
-    meta_path = util.ensure_path(meta_path)
+    meta_path = util.ensure_path(meta)
     if not input_path or not input_path.exists():
-        prints(input_path, title="Model directory not found", exits=True)
+        prints(input_path, title="Model directory not found", exits=1)
     if not output_path or not output_path.exists():
-        prints(output_path, title="Output directory not found", exits=True)
+        prints(output_path, title="Output directory not found", exits=1)
     if meta_path and not meta_path.exists():
-        prints(meta_path, title="meta.json not found", exits=True)
+        prints(meta_path, title="meta.json not found", exits=1)
     template_setup = get_template('setup.py')
     template_manifest = get_template('MANIFEST.in')
@@ -55,7 +66,7 @@ def create_dirs(package_path, force):
     else:
         prints(package_path, "Please delete the directory and try again, or "
                "use the --force flag to overwrite existing directories.",
-               title="Package directory already exists", exits=True)
+               title="Package directory already exists", exits=1)
     Path.mkdir(package_path, parents=True)
@@ -87,12 +98,12 @@ def validate_meta(meta, keys):
     for key in keys:
         if key not in meta or meta[key] == '':
             prints("This setting is required to build your package.",
-                   title='No "%s" setting found in meta.json' % key, exits=True)
+                   title='No "%s" setting found in meta.json' % key, exits=1)
 def get_template(filepath):
     r = requests.get(about.__model_files__ + filepath)
     if r.status_code != 200:
         prints("Couldn't fetch template files from GitHub.",
-               title="Server error (%d)" % r.status_code, exits=True)
+               title="Server error (%d)" % r.status_code, exits=1)
     return r.text

View File

@@ -1,6 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals, division, print_function
+import plac
 import json
 from collections import defaultdict
 import cytoolz
@@ -18,19 +19,33 @@ from .. import util
 from .. import displacy
-def train(lang_id, output_dir, train_data, dev_data, n_iter, n_sents,
+@plac.annotations(
+    lang=("model language", "positional", None, str),
+    output_dir=("output directory to store model in", "positional", None, str),
+    train_data=("location of JSON-formatted training data", "positional", None, str),
+    dev_data=("location of JSON-formatted development data (optional)", "positional", None, str),
+    n_iter=("number of iterations", "option", "n", int),
+    n_sents=("number of sentences", "option", "ns", int),
+    use_gpu=("Use GPU", "flag", "G", bool),
+    no_tagger=("Don't train tagger", "flag", "T", bool),
+    no_parser=("Don't train parser", "flag", "P", bool),
+    no_entities=("Don't train NER", "flag", "N", bool)
+)
+def train(lang, output_dir, train_data, dev_data, n_iter, n_sents,
           use_gpu, no_tagger, no_parser, no_entities):
+    """Train a model. Expects data in spaCy's JSON format."""
+    n_sents = n_sents or None
     output_path = util.ensure_path(output_dir)
     train_path = util.ensure_path(train_data)
     dev_path = util.ensure_path(dev_data)
     if not output_path.exists():
-        prints(output_path, title="Output directory not found", exits=True)
+        prints(output_path, title="Output directory not found", exits=1)
     if not train_path.exists():
-        prints(train_path, title="Training data not found", exits=True)
+        prints(train_path, title="Training data not found", exits=1)
     if dev_path and not dev_path.exists():
-        prints(dev_path, title="Development data not found", exits=True)
-    lang_class = util.get_lang_class(lang_id)
+        prints(dev_path, title="Development data not found", exits=1)
+    lang_class = util.get_lang_class(lang)
     pipeline = ['token_vectors', 'tags', 'dependencies', 'entities']
     if no_tagger and 'tags' in pipeline: pipeline.remove('tags')

View File

@@ -142,9 +142,14 @@ def _min_edit_path(cand_words, gold_words):
 class GoldCorpus(object):
-    '''An annotated corpus, using the JSON file format. Manages
-    annotations for tagging, dependency parsing, NER.'''
+    """An annotated corpus, using the JSON file format. Manages
+    annotations for tagging, dependency parsing and NER."""
     def __init__(self, train_path, dev_path):
+        """Create a GoldCorpus.
+
+        train_path (unicode or Path): File or directory of training data.
+        dev_path (unicode or Path): File or directory of development data.
+        """
         self.train_path = util.ensure_path(train_path)
         self.dev_path = util.ensure_path(dev_path)
         self.train_locs = self.walk_corpus(self.train_path)

View File

@@ -236,6 +236,12 @@ class Language(object):
         doc.tensor = None
     def preprocess_gold(self, docs_golds):
+        """Can be called before training to pre-process gold data. By default,
+        it handles nonprojectivity and adds missing tags to the tag map.
+
+        docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects.
+        YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects.
+        """
         for proc in self.pipeline:
             if hasattr(proc, 'preprocess_gold'):
                 docs_golds = proc.preprocess_gold(docs_golds)
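Going by the docstring added above, preprocess_gold is a lazy transformation over (Doc, GoldParse) pairs. A hypothetical call site might look like this (nlp, docs and golds are assumed to exist; the training loop itself is elided):

docs_golds = zip(docs, golds)                 # iterable of (Doc, GoldParse) tuples
docs_golds = nlp.preprocess_gold(docs_golds)  # each component may rewrite the stream lazily
for doc, gold in docs_golds:
    pass                                      # e.g. feed the preprocessed pairs into training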

View File

@@ -380,13 +380,13 @@ def prints(*texts, **kwargs):
     *texts (unicode): Texts to print. Each argument is rendered as paragraph.
     **kwargs: 'title' becomes coloured headline. 'exits'=True performs sys exit.
     """
-    exits = kwargs.get('exits', False)
+    exits = kwargs.get('exits', None)
     title = kwargs.get('title', None)
     title = '\033[93m{}\033[0m\n'.format(_wrap(title)) if title else ''
     message = '\n\n'.join([_wrap(text) for text in texts])
     print('\n{}{}\n'.format(title, message))
-    if exits:
-        sys.exit(0)
+    if exits is not None:
+        sys.exit(exits)
 def _wrap(text, wrap_max=80, indent=4):
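The net effect of the util.py change above: exits used to be a boolean that always exited with status 0, whereas it is now an exit code, so callers pass exits=1 to signal failure to the shell. A small sketch of the new behaviour (simplified, without the _wrap helper):

import sys

def prints(*texts, **kwargs):
    """Print paragraphs with an optional coloured title; exit if an exit code is given."""
    exits = kwargs.get('exits', None)
    title = kwargs.get('title', None)
    heading = '\033[93m{}\033[0m\n'.format(title) if title else ''
    print('\n{}{}\n'.format(heading, '\n\n'.join(texts)))
    if exits is not None:
        sys.exit(exits)   # exits=1 now produces a non-zero exit status

# prints("Training data not found", title="Error", exits=1)   # exits with status 1
# prints("All done")                                           # returns normally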

View File

@@ -23,7 +23,8 @@
         "Lexeme": "lexeme",
         "Vocab": "vocab",
         "StringStore": "stringstore",
-        "GoldParse": "goldparse"
+        "GoldParse": "goldparse",
+        "GoldCorpus": "goldcorpus"
     },
     "Other": {
         "Annotation Specs": "annotation",
@@ -135,6 +136,11 @@
         "tag": "class"
     },
+    "goldcorpus": {
+        "title": "GoldCorpus",
+        "tag": "class"
+    },
     "annotation": {
         "title": "Annotation Specifications"
     },

View File

@@ -5,16 +5,23 @@ include ../../_includes/_mixins
 p
     | As of v1.7.0, spaCy comes with new command line helpers to download and
     | link models and show useful debugging information. For a list of available
-    | commands, type #[code python -m spacy --help].
+    | commands, type #[code python -m spacy]. To make the command even more
+    | convenient, we recommend
+    | #[+a("https://askubuntu.com/questions/17536/how-do-i-create-a-permanent-bash-alias/17537#17537") creating an alias]
+    | mapping #[code python -m spacy] to #[code spacy].
 +aside("Why python -m?")
     | The problem with a global entry point is that it's resolved by looking up
     | entries in your #[code PATH] environment variable. This can give you
-    | unexpected results, like executing the wrong spaCy installation
-    | (especially when using #[code virtualenv]). #[code python -m] prevents
-    | fallbacks to system modules and makes sure the correct spaCy version is
-    | used. If you hate typing it every time, we recommend creating an
-    | #[code alias] instead.
+    | unexpected results, like executing the wrong spaCy installation.
+    | #[code python -m] prevents fallbacks to system modules.
+
++infobox("⚠️ Deprecation note")
+    | As of spaCy 2.0, the #[code model] command to initialise a model data
+    | directory is deprecated. The command was only necessary because previous
+    | versions of spaCy expected a model directory to already be set up. This
+    | has since been changed, so you can use the #[+api("cli#train") #[code train]]
+    | command straight away.
 +h(2, "download") Download
@@ -45,7 +52,7 @@ p
         +cell flag
         +cell Show help message and available arguments.
-+infobox("Important note")
++aside("Downloading best practices")
     | The #[code download] command is mostly intended as a convenient,
     | interactive wrapper it performs compatibility checks and prints
     | detailed messages in case things go wrong. It's #[strong not recommended]
@@ -116,7 +123,6 @@ p
         +cell Show help message and available arguments.
 +h(2, "convert") Convert
-    +tag experimental
 p
     | Convert files into spaCy's #[+a("/docs/api/annotation#json-input") JSON format]
@@ -153,49 +159,7 @@ p
         +cell flag
         +cell Show help message and available arguments.
-+h(2, "model") Model
-    +tag experimental
-
-p
-    | Initialise a new model and its data directory. For more info on this, see
-    | the documentation on #[+a("/docs/usage/adding-languages") adding languages].
-
-+code(false, "bash").
-    python -m spacy model [lang] [model_dir] [freqs_data] [clusters_data] [vectors_data]
-
-+table(["Argument", "Type", "Description"])
-    +row
-        +cell #[code lang]
-        +cell positional
-        +cell Model language.
-
-    +row
-        +cell #[code model_dir]
-        +cell positional
-        +cell Output directory to store the model in.
-
-    +row
-        +cell #[code freqs_data]
-        +cell positional
-        +cell Tab-separated frequencies file.
-
-    +row
-        +cell #[code clusters_data]
-        +cell positional
-        +cell Brown custers file (optional).
-
-    +row
-        +cell #[code vectors_data]
-        +cell positional
-        +cell Word vectors file (optional).
-
-    +row
-        +cell #[code --help], #[code -h]
-        +cell flag
-        +cell Show help message and available arguments.
-
 +h(2, "train") Train
-    +tag experimental
 p
     | Train a model. Expects data in spaCy's
@@ -231,7 +195,7 @@ p
         +cell Number of iterations (default: #[code 15]).
     +row
-        +cell #[code --nsents]
+        +cell #[code --n_sents], #[code -ns]
         +cell option
         +cell Number of sentences (default: #[code 0]).
@@ -241,7 +205,7 @@ p
         +cell L1 regularization penalty for parser (default: #[code 0.0]).
     +row
-        +cell #[code --use-gpu], #[code -g]
+        +cell #[code --use-gpu], #[code -G]
         +cell flag
         +cell Use GPU.
@@ -266,17 +230,16 @@ p
         +cell Show help message and available arguments.
 +h(2, "package") Package
-    +tag experimental
 p
     | Generate a #[+a("/docs/usage/saving-loading#generating") model Python package]
     | from an existing model data directory. All data files are copied over.
     | If the path to a meta.json is supplied, or a meta.json is found in the
     | input directory, this file is used. Otherwise, the data can be entered
-    | directly from the command line. While this feature is still experimental,
-    | the required file templates are downloaded from
-    | #[+src(gh("spacy-dev-resources", "templates/model")) GitHub]. This means
-    | you need to be connected to the internet to use this command.
+    | directly from the command line. The required file templates are downloaded
+    | from #[+src(gh("spacy-dev-resources", "templates/model")) GitHub] to make
+    | sure you're always using the latest versions. This means you need to be
+    | connected to the internet to use this command.
 +code(false, "bash").
     python -m spacy package [input_dir] [output_dir] [--meta] [--force]

View File

@@ -0,0 +1,23 @@
+//- 💫 DOCS > API > GOLDCORPUS
+
+include ../../_includes/_mixins
+
+p
+    | An annotated corpus, using the JSON file format. Manages annotations for
+    | tagging, dependency parsing and NER.
+
++h(2, "init") GoldCorpus.__init__
+    +tag method
+
+p Create a #[code GoldCorpus].
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code train_path]
+        +cell unicode or #[code Path]
+        +cell File or directory of training data.
+
+    +row
+        +cell #[code dev_path]
+        +cell unicode or #[code Path]
+        +cell File or directory of development data.
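Based on the signature documented above, constructing the new class is a one-liner; the paths are placeholders and the import location assumes spacy.gold, where GoldCorpus is defined in this commit:

from spacy.gold import GoldCorpus

corpus = GoldCorpus('train.json', 'dev.json')   # file or directory paths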

View File

@@ -7,7 +7,7 @@ p Collection for training annotations.
 +h(2, "init") GoldParse.__init__
     +tag method
-p Create a GoldParse.
+p Create a #[code GoldParse].
 +table(["Name", "Type", "Description"])
     +row

View File

@@ -82,6 +82,41 @@ p
         +cell #[code Doc]
         +cell A container for accessing the annotations.
++h(2, "pipe") Language.pipe
+    +tag method
+
+p
+    | Process texts as a stream, and yield #[code Doc] objects in order.
+    | Supports GIL-free multi-threading.
+
++aside-code("Example").
+    texts = [u'One document.', u'...', u'Lots of documents']
+    for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
+        assert doc.is_parsed
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code texts]
+        +cell -
+        +cell A sequence of unicode objects.
+
+    +row
+        +cell #[code n_threads]
+        +cell int
+        +cell
+            | The number of worker threads to use. If #[code -1], OpenMP will
+            | decide how many to use at run time. Default is #[code 2].
+
+    +row
+        +cell #[code batch_size]
+        +cell int
+        +cell The number of texts to buffer.
+
+    +footrow
+        +cell yields
+        +cell #[code Doc]
+        +cell Documents in the order of the original text.
+
 +h(2, "update") Language.update
     +tag method
@@ -172,40 +207,23 @@ p
         +cell -
         +cell Config parameters.
-+h(2, "pipe") Language.pipe
++h(2, "preprocess_gold") Language.preprocess_gold
     +tag method
 p
-    | Process texts as a stream, and yield #[code Doc] objects in order.
-    | Supports GIL-free multi-threading.
-
-+aside-code("Example").
-    texts = [u'One document.', u'...', u'Lots of documents']
-    for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
-        assert doc.is_parsed
+    | Can be called before training to pre-process gold data. By default, it
+    | handles nonprojectivity and adds missing tags to the tag map.
 +table(["Name", "Type", "Description"])
     +row
-        +cell #[code texts]
-        +cell -
-        +cell A sequence of unicode objects.
-
-    +row
-        +cell #[code n_threads]
-        +cell int
-        +cell
-            | The number of worker threads to use. If #[code -1], OpenMP will
-            | decide how many to use at run time. Default is #[code 2].
-
-    +row
-        +cell #[code batch_size]
-        +cell int
-        +cell The number of texts to buffer.
+        +cell #[code docs_golds]
+        +cell iterable
+        +cell Tuples of #[code Doc] and #[code GoldParse] objects.
     +footrow
         +cell yields
-        +cell #[code Doc]
-        +cell Documents in the order of the original text.
+        +cell tuple
+        +cell Tuples of #[code Doc] and #[code GoldParse] objects.
 +h(2, "to_disk") Language.to_disk
     +tag method