Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2017-05-22 05:39:18 -05:00
commit 70a8c531cd
17 changed files with 206 additions and 348 deletions

View File

@ -3,127 +3,21 @@ from __future__ import print_function
# NB! This breaks in plac on Python 2!!
#from __future__ import unicode_literals
import plac
from spacy.cli import download as cli_download
from spacy.cli import link as cli_link
from spacy.cli import info as cli_info
from spacy.cli import package as cli_package
from spacy.cli import train as cli_train
from spacy.cli import model as cli_model
from spacy.cli import convert as cli_convert
@plac.annotations(
model=("model to download (shortcut or model name)", "positional", None, str),
direct=("force direct download. Needs model name with version and won't "
"perform compatibility check", "flag", "d", bool)
)
def download(model, direct=False):
"""
Download compatible model from default download path using pip. Model
can be shortcut, model name or, if --direct flag is set, full model name
with version.
"""
cli_download(model, direct)
@plac.annotations(
origin=("package name or local path to model", "positional", None, str),
link_name=("name of shortuct link to create", "positional", None, str),
force=("force overwriting of existing link", "flag", "f", bool)
)
def link(origin, link_name, force=False):
"""
Create a symlink for models within the spacy/data directory. Accepts
either the name of a pip package, or the local path to the model data
directory. Linking models allows loading them via spacy.load(link_name).
"""
cli_link(origin, link_name, force)
@plac.annotations(
model=("optional: shortcut link of model", "positional", None, str),
markdown=("generate Markdown for GitHub issues", "flag", "md", str)
)
def info(model=None, markdown=False):
"""
Print info about spaCy installation. If a model shortcut link is
specified as an argument, print model information. Flag --markdown
prints details in Markdown for easy copy-pasting to GitHub issues.
"""
cli_info(model, markdown)
@plac.annotations(
input_dir=("directory with model data", "positional", None, str),
output_dir=("output parent directory", "positional", None, str),
meta=("path to meta.json", "option", "m", str),
force=("force overwriting of existing folder in output directory", "flag", "f", bool)
)
def package(input_dir, output_dir, meta=None, force=False):
"""
Generate Python package for model data, including meta and required
installation files. A new directory will be created in the specified
output directory, and model data will be copied over.
"""
cli_package(input_dir, output_dir, meta, force)
@plac.annotations(
input_file=("input file", "positional", None, str),
output_dir=("output directory for converted file", "positional", None, str),
n_sents=("Number of sentences per doc", "option", "n", float),
morphology=("Enable appending morphology to tags", "flag", "m", bool)
)
def convert(input_file, output_dir, n_sents=10, morphology=False):
"""
Convert files into JSON format for use with train command and other
experiment management functions.
"""
cli_convert(input_file, output_dir, n_sents, morphology)
@plac.annotations(
lang=("model language", "positional", None, str),
output_dir=("output directory to store model in", "positional", None, str),
train_data=("location of JSON-formatted training data", "positional", None, str),
dev_data=("location of JSON-formatted development data (optional)", "positional", None, str),
n_iter=("number of iterations", "option", "n", int),
nsents=("number of sentences", "option", None, int),
use_gpu=("Use GPU", "flag", "g", bool),
no_tagger=("Don't train tagger", "flag", "T", bool),
no_parser=("Don't train parser", "flag", "P", bool),
no_entities=("Don't train NER", "flag", "N", bool)
)
def train(lang, output_dir, train_data, dev_data=None, n_iter=15,
nsents=0, use_gpu=False,
no_tagger=False, no_parser=False, no_entities=False):
"""
Train a model. Expects data in spaCy's JSON format.
"""
nsents = nsents or None
cli_train(lang, output_dir, train_data, dev_data, n_iter, nsents,
use_gpu, no_tagger, no_parser, no_entities)
if __name__ == '__main__':
import plac
import sys
commands = {
'train': train,
'convert': convert,
'download': download,
'link': link,
'info': info,
'package': package,
}
from spacy.cli import download, link, info, package, train, convert
from spacy.util import prints
commands = {'download': download, 'link': link, 'info': info, 'train': train,
'convert': convert, 'package': package}
if len(sys.argv) == 1:
print("Available commands: %s" % ', '.join(sorted(commands)))
sys.exit(1)
prints(', '.join(commands), title="Available commands", exits=1)
command = sys.argv.pop(1)
sys.argv[0] = 'spacy %s' % command
if command in commands:
plac.call(commands[command])
else:
print("Unknown command: %s. Available: %s" % (command, ', '.join(commands)))
sys.exit(1)
prints("Available: %s" % ', '.join(commands),
title="Unknown command: %s" % command, exits=1)

View File

@ -3,5 +3,4 @@ from .info import info
from .link import link
from .package import package
from .train import train
from .model import model
from .convert import convert

View File

@ -1,6 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
import plac
from pathlib import Path
from .converters import conllu2json, iob2json
@ -18,15 +19,24 @@ CONVERTERS = {
}
def convert(input_file, output_dir, *args):
@plac.annotations(
input_file=("input file", "positional", None, str),
output_dir=("output directory for converted file", "positional", None, str),
n_sents=("Number of sentences per doc", "option", "n", float),
morphology=("Enable appending morphology to tags", "flag", "m", bool)
)
def convert(input_file, output_dir, n_sents, morphology):
"""Convert files into JSON format for use with train command and other
experiment management functions.
"""
input_path = Path(input_file)
output_path = Path(output_dir)
if not input_path.exists():
prints(input_path, title="Input file not found", exits=True)
prints(input_path, title="Input file not found", exits=1)
if not output_path.exists():
prints(output_path, title="Output directory not found", exits=True)
prints(output_path, title="Output directory not found", exits=1)
file_ext = input_path.suffix
if not file_ext in CONVERTERS:
prints("Can't find converter for %s" % input_path.parts[-1],
title="Unknown format", exits=True)
title="Unknown format", exits=1)
CONVERTERS[file_ext](input_path, output_path, *args)
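
A standalone sketch (not from the diff) of the suffix-based converter dispatch shown above; the stand-in conllu2json only prints what it would do:

from pathlib import Path

def conllu2json(input_path, output_path):
    # stand-in converter: the real one writes spaCy's JSON training format
    print("would convert %s into %s" % (input_path, output_path))

CONVERTERS = {'.conllu': conllu2json, '.conll': conllu2json}

input_path = Path('train.conllu')
converter = CONVERTERS.get(input_path.suffix)
if converter is None:
    raise SystemExit("Unknown format: %s" % input_path.name)
converter(input_path, Path('converted'))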

View File

@ -1,6 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
import plac
import requests
import os
import subprocess
@ -11,7 +12,16 @@ from ..util import prints
from .. import about
@plac.annotations(
model=("model to download (shortcut or model name)", "positional", None, str),
direct=("force direct download. Needs model name with version and won't "
"perform compatibility check", "flag", "d", bool)
)
def download(model, direct=False):
"""Download compatible model from default download path using pip. Model
can be shortcut, model name or, if --direct flag is set, full model name
with version.
"""
if direct:
download_model('{m}/{m}.tar.gz'.format(m=model))
else:
@ -38,7 +48,7 @@ def get_json(url, desc):
if r.status_code != 200:
prints("Couldn't fetch %s. Please find a model for your spaCy installation "
"(v%s), and download it manually." % (desc, about.__version__),
about.__docs_models__, title="Server error (%d)" % r.status_code, exits=True)
about.__docs_models__, title="Server error (%d)" % r.status_code, exits=1)
return r.json()
@ -48,7 +58,7 @@ def get_compatibility():
comp = comp_table['spacy']
if version not in comp:
prints("No compatible models found for v%s of spaCy." % version,
title="Compatibility error", exits=True)
title="Compatibility error", exits=1)
return comp[version]
@ -56,7 +66,7 @@ def get_version(model, comp):
if model not in comp:
version = about.__version__
prints("No compatible model found for '%s' (spaCy v%s)." % (model, version),
title="Compatibility error", exits=True)
title="Compatibility error", exits=1)
return comp[model][0]
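
A hedged sketch of the compatibility lookup performed by get_compatibility() and get_version() above; the table below is invented for illustration, while the real one is fetched from spaCy's compatibility JSON:

comp_table = {'spacy': {'1.8.0': {'en_core_web_sm': ['1.2.0', '1.1.0']}}}
spacy_version = '1.8.0'
comp = comp_table['spacy']
if spacy_version not in comp:
    raise SystemExit("No compatible models found for v%s of spaCy." % spacy_version)
model = 'en_core_web_sm'
if model not in comp[spacy_version]:
    raise SystemExit("No compatible model found for '%s'." % model)
print(comp[spacy_version][model][0])   # '1.2.0', the newest compatible version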

View File

@ -1,6 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
import plac
import platform
from pathlib import Path
@ -9,7 +10,15 @@ from .. import about
from .. import util
@plac.annotations(
model=("optional: shortcut link of model", "positional", None, str),
markdown=("generate Markdown for GitHub issues", "flag", "md", str)
)
def info(model=None, markdown=False):
"""Print info about spaCy installation. If a model shortcut link is
specified as an argument, print model information. Flag --markdown
prints details in Markdown for easy copy-pasting to GitHub issues.
"""
if model:
model_path = util.resolve_model_path(model)
meta = util.parse_package_meta(model_path)

View File

@ -1,24 +1,35 @@
# coding: utf8
from __future__ import unicode_literals
import plac
from pathlib import Path
from ..compat import symlink_to, path2str
from ..util import prints
from .. import util
@plac.annotations(
origin=("package name or local path to model", "positional", None, str),
link_name=("name of shortuct link to create", "positional", None, str),
force=("force overwriting of existing link", "flag", "f", bool)
)
def link(origin, link_name, force=False):
"""Create a symlink for models within the spacy/data directory. Accepts
either the name of a pip package, or the local path to the model data
directory. Linking models allows loading them via spacy.load(link_name).
"""
if util.is_package(origin):
model_path = util.get_model_package_path(origin)
else:
model_path = Path(origin)
if not model_path.exists():
prints("The data should be located in %s" % path2str(model_path),
title="Can't locate model data", exits=True)
title="Can't locate model data", exits=1)
link_path = util.get_data_path() / link_name
if link_path.exists() and not force:
prints("To overwrite an existing link, use the --force flag.",
title="Link %s already exists" % link_name, exits=True)
title="Link %s already exists" % link_name, exits=1)
elif link_path.exists():
link_path.unlink()
try:

View File

@ -1,122 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
import gzip
import math
from ast import literal_eval
from preshed.counter import PreshCounter
from ..vocab import write_binary_vectors
from ..compat import fix_text, path2str
from ..util import prints
from .. import util
def model(lang, model_dir, freqs_data, clusters_data, vectors_data):
model_path = util.ensure_path(model_dir)
freqs_path = util.ensure_path(freqs_data)
clusters_path = util.ensure_path(clusters_data)
vectors_path = util.ensure_path(vectors_data)
if not freqs_path.is_file():
prints(freqs_path, title="No frequencies file found", exits=True)
if clusters_path and not clusters_path.is_file():
prints(clusters_path, title="No Brown clusters file found", exits=True)
if vectors_path and not vectors_path.is_file():
prints(vectors_path, title="No word vectors file found", exits=True)
vocab = util.get_lang_class(lang).Defaults.create_vocab()
probs, oov_prob = read_probs(freqs_path)
clusters = read_clusters(clusters_path) if clusters_path else {}
populate_vocab(vocab, clusters, probs, oov_prob)
create_model(model_path, vectors_path, vocab, oov_prob)
def create_model(model_path, vectors_path, vocab, oov_prob):
vocab_path = model_path / 'vocab'
lexemes_path = vocab_path / 'lexemes.bin'
strings_path = vocab_path / 'strings.json'
oov_path = vocab_path / 'oov_prob'
if not model_path.exists():
model_path.mkdir()
if not vocab_path.exists():
vocab_path.mkdir()
vocab.dump(path2str(lexemes_path))
with strings_path.open('w') as f:
vocab.strings.dump(f)
with oov_path.open('w') as f:
f.write('%f' % oov_prob)
if vectors_path:
vectors_dest = vocab_path / 'vec.bin'
write_binary_vectors(path2str(vectors_path), path2str(vectors_dest))
def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
counts = PreshCounter()
total = 0
freqs_file = check_unzip(freqs_path)
for i, line in enumerate(freqs_file):
freq, doc_freq, key = line.rstrip().split('\t', 2)
freq = int(freq)
counts.inc(i+1, freq)
total += freq
counts.smooth()
log_total = math.log(total)
freqs_file = check_unzip(freqs_path)
probs = {}
for line in freqs_file:
freq, doc_freq, key = line.rstrip().split('\t', 2)
doc_freq = int(doc_freq)
freq = int(freq)
if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
word = literal_eval(key)
smooth_count = counts.smoother(int(freq))
probs[word] = math.log(smooth_count) - log_total
oov_prob = math.log(counts.smoother(0)) - log_total
return probs, oov_prob
def read_clusters(clusters_path):
clusters = {}
with clusters_path.open() as f:
for line in f:
try:
cluster, word, freq = line.split()
word = fix_text(word)
except ValueError:
continue
# If the clusterer has only seen the word a few times, its
# cluster is unreliable.
if int(freq) >= 3:
clusters[word] = cluster
else:
clusters[word] = '0'
# Expand clusters with re-casing
for word, cluster in list(clusters.items()):
if word.lower() not in clusters:
clusters[word.lower()] = cluster
if word.title() not in clusters:
clusters[word.title()] = cluster
if word.upper() not in clusters:
clusters[word.upper()] = cluster
return clusters
def populate_vocab(vocab, clusters, probs, oov_prob):
for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
lexeme = vocab[word]
lexeme.prob = prob
lexeme.is_oov = False
# Decode as a little-endian string, so that we can do & 15 to get
# the first 4 bits. See _parse_features.pyx
if word in clusters:
lexeme.cluster = int(clusters[word][::-1], 2)
else:
lexeme.cluster = 0
def check_unzip(file_path):
file_path_str = path2str(file_path)
if file_path_str.endswith('gz'):
return gzip.open(file_path_str)
else:
return file_path.open()
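
The little-endian comment in populate_vocab() above is easiest to see with a toy value (the bit string is invented):

cluster = '1011000'            # Brown cluster path as a bit string
code = int(cluster[::-1], 2)   # reverse so the string's first bit becomes the lowest bit
print(code & 15)               # 13, i.e. the first four characters '1011' read little-endian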

View File

@ -1,6 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
import plac
import shutil
import requests
from pathlib import Path
@ -11,16 +12,26 @@ from .. import util
from .. import about
def package(input_dir, output_dir, meta_path, force):
@plac.annotations(
input_dir=("directory with model data", "positional", None, str),
output_dir=("output parent directory", "positional", None, str),
meta=("path to meta.json", "option", "m", str),
force=("force overwriting of existing folder in output directory", "flag", "f", bool)
)
def package(input_dir, output_dir, meta, force):
"""Generate Python package for model data, including meta and required
installation files. A new directory will be created in the specified
output directory, and model data will be copied over.
"""
input_path = util.ensure_path(input_dir)
output_path = util.ensure_path(output_dir)
meta_path = util.ensure_path(meta_path)
meta_path = util.ensure_path(meta)
if not input_path or not input_path.exists():
prints(input_path, title="Model directory not found", exits=True)
prints(input_path, title="Model directory not found", exits=1)
if not output_path or not output_path.exists():
prints(output_path, title="Output directory not found", exits=True)
prints(output_path, title="Output directory not found", exits=1)
if meta_path and not meta_path.exists():
prints(meta_path, title="meta.json not found", exits=True)
prints(meta_path, title="meta.json not found", exits=1)
template_setup = get_template('setup.py')
template_manifest = get_template('MANIFEST.in')
@ -55,7 +66,7 @@ def create_dirs(package_path, force):
else:
prints(package_path, "Please delete the directory and try again, or "
"use the --force flag to overwrite existing directories.",
title="Package directory already exists", exits=True)
title="Package directory already exists", exits=1)
Path.mkdir(package_path, parents=True)
@ -87,12 +98,12 @@ def validate_meta(meta, keys):
for key in keys:
if key not in meta or meta[key] == '':
prints("This setting is required to build your package.",
title='No "%s" setting found in meta.json' % key, exits=True)
title='No "%s" setting found in meta.json' % key, exits=1)
def get_template(filepath):
r = requests.get(about.__model_files__ + filepath)
if r.status_code != 200:
prints("Couldn't fetch template files from GitHub.",
title="Server error (%d)" % r.status_code, exits=True)
title="Server error (%d)" % r.status_code, exits=1)
return r.text
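
A hedged sketch of the template download in get_template() above; base_url is a placeholder for about.__model_files__, not the real value:

import requests

def get_template(base_url, filepath):
    r = requests.get(base_url + filepath)
    if r.status_code != 200:
        raise SystemExit("Server error (%d)" % r.status_code)
    return r.text

# setup_py = get_template('https://example.com/templates/model/', 'setup.py')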

View File

@ -1,6 +1,7 @@
# coding: utf8
from __future__ import unicode_literals, division, print_function
import plac
import json
from collections import defaultdict
import cytoolz
@ -18,19 +19,33 @@ from .. import util
from .. import displacy
def train(lang_id, output_dir, train_data, dev_data, n_iter, n_sents,
@plac.annotations(
lang=("model language", "positional", None, str),
output_dir=("output directory to store model in", "positional", None, str),
train_data=("location of JSON-formatted training data", "positional", None, str),
dev_data=("location of JSON-formatted development data (optional)", "positional", None, str),
n_iter=("number of iterations", "option", "n", int),
n_sents=("number of sentences", "option", "ns", int),
use_gpu=("Use GPU", "flag", "G", bool),
no_tagger=("Don't train tagger", "flag", "T", bool),
no_parser=("Don't train parser", "flag", "P", bool),
no_entities=("Don't train NER", "flag", "N", bool)
)
def train(lang, output_dir, train_data, dev_data, n_iter, n_sents,
use_gpu, no_tagger, no_parser, no_entities):
"""Train a model. Expects data in spaCy's JSON format."""
n_sents = n_sents or None
output_path = util.ensure_path(output_dir)
train_path = util.ensure_path(train_data)
dev_path = util.ensure_path(dev_data)
if not output_path.exists():
prints(output_path, title="Output directory not found", exits=True)
prints(output_path, title="Output directory not found", exits=1)
if not train_path.exists():
prints(train_path, title="Training data not found", exits=True)
prints(train_path, title="Training data not found", exits=1)
if dev_path and not dev_path.exists():
prints(dev_path, title="Development data not found", exits=True)
prints(dev_path, title="Development data not found", exits=1)
lang_class = util.get_lang_class(lang_id)
lang_class = util.get_lang_class(lang)
pipeline = ['token_vectors', 'tags', 'dependencies', 'entities']
if no_tagger and 'tags' in pipeline: pipeline.remove('tags')
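
A standalone sketch of the flag-driven pipeline pruning in train() above; the flag values are arbitrary, and the remaining if-statements mirror the one shown for the tagger:

no_tagger, no_parser, no_entities = False, True, False
pipeline = ['token_vectors', 'tags', 'dependencies', 'entities']
if no_tagger and 'tags' in pipeline: pipeline.remove('tags')
if no_parser and 'dependencies' in pipeline: pipeline.remove('dependencies')
if no_entities and 'entities' in pipeline: pipeline.remove('entities')
print(pipeline)   # ['token_vectors', 'tags', 'entities']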

View File

@ -142,9 +142,14 @@ def _min_edit_path(cand_words, gold_words):
class GoldCorpus(object):
'''An annotated corpus, using the JSON file format. Manages
annotations for tagging, dependency parsing, NER.'''
"""An annotated corpus, using the JSON file format. Manages
annotations for tagging, dependency parsing and NER."""
def __init__(self, train_path, dev_path):
"""Create a GoldCorpus.
train_path (unicode or Path): File or directory of training data.
dev_path (unicode or Path): File or directory of development data.
"""
self.train_path = util.ensure_path(train_path)
self.dev_path = util.ensure_path(dev_path)
self.train_locs = self.walk_corpus(self.train_path)
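
A usage sketch based on the new docstring; the file names are placeholders for real JSON-formatted training and development data:

from spacy.gold import GoldCorpus

corpus = GoldCorpus('train.json', 'dev.json')   # files or directories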

View File

@ -236,6 +236,12 @@ class Language(object):
doc.tensor = None
def preprocess_gold(self, docs_golds):
"""Can be called before training to pre-process gold data. By default,
it handles nonprojectivity and adds missing tags to the tag map.
docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects.
YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects.
"""
for proc in self.pipeline:
if hasattr(proc, 'preprocess_gold'):
docs_golds = proc.preprocess_gold(docs_golds)
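
A hedged usage sketch for preprocess_gold(); nlp is assumed to be a loaded Language object and docs_golds an iterable of (Doc, GoldParse) tuples prepared elsewhere:

docs_golds = nlp.preprocess_gold(docs_golds)
for doc, gold in docs_golds:
    pass   # train on the pre-processed (Doc, GoldParse) pairs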

View File

@ -380,13 +380,13 @@ def prints(*texts, **kwargs):
*texts (unicode): Texts to print. Each argument is rendered as paragraph.
**kwargs: 'title' becomes coloured headline. 'exits' performs sys exit with the given code.
"""
exits = kwargs.get('exits', False)
exits = kwargs.get('exits', None)
title = kwargs.get('title', None)
title = '\033[93m{}\033[0m\n'.format(_wrap(title)) if title else ''
message = '\n\n'.join([_wrap(text) for text in texts])
print('\n{}{}\n'.format(title, message))
if exits:
sys.exit(0)
if exits is not None:
sys.exit(exits)
def _wrap(text, wrap_max=80, indent=4):
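
With this change, exits carries an exit code instead of a boolean; a usage sketch (the message text is illustrative):

from spacy.util import prints

prints("The directory does not exist.",
       title="Output directory not found", exits=1)   # prints, then sys.exit(1)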

View File

@ -23,7 +23,8 @@
"Lexeme": "lexeme",
"Vocab": "vocab",
"StringStore": "stringstore",
"GoldParse": "goldparse"
"GoldParse": "goldparse",
"GoldCorpus": "goldcorpus"
},
"Other": {
"Annotation Specs": "annotation",
@ -135,6 +136,11 @@
"tag": "class"
},
"goldcorpus": {
"title": "GoldCorpus",
"tag": "class"
},
"annotation": {
"title": "Annotation Specifications"
},

View File

@ -5,16 +5,23 @@ include ../../_includes/_mixins
p
| As of v1.7.0, spaCy comes with new command line helpers to download and
| link models and show useful debugging information. For a list of available
| commands, type #[code python -m spacy --help].
| commands, type #[code python -m spacy]. To make the command even more
| convenient, we recommend
| #[+a("https://askubuntu.com/questions/17536/how-do-i-create-a-permanent-bash-alias/17537#17537") creating an alias]
| mapping #[code python -m spacy] to #[code spacy].
+aside("Why python -m?")
| The problem with a global entry point is that it's resolved by looking up
| entries in your #[code PATH] environment variable. This can give you
| unexpected results, like executing the wrong spaCy installation
| (especially when using #[code virtualenv]). #[code python -m] prevents
| fallbacks to system modules and makes sure the correct spaCy version is
| used. If you hate typing it every time, we recommend creating an
| #[code alias] instead.
| unexpected results, like executing the wrong spaCy installation.
| #[code python -m] prevents fallbacks to system modules.
+infobox("⚠️ Deprecation note")
| As of spaCy 2.0, the #[code model] command to initialise a model data
| directory is deprecated. The command was only necessary because previous
| versions of spaCy expected a model directory to already be set up. This
| has since been changed, so you can use the #[+api("cli#train") #[code train]]
| command straight away.
+h(2, "download") Download
@ -45,7 +52,7 @@ p
+cell flag
+cell Show help message and available arguments.
+infobox("Important note")
+aside("Downloading best practices")
| The #[code download] command is mostly intended as a convenient,
| interactive wrapper: it performs compatibility checks and prints
| detailed messages in case things go wrong. It's #[strong not recommended]
@ -116,7 +123,6 @@ p
+cell Show help message and available arguments.
+h(2, "convert") Convert
+tag experimental
p
| Convert files into spaCy's #[+a("/docs/api/annotation#json-input") JSON format]
@ -153,49 +159,7 @@ p
+cell flag
+cell Show help message and available arguments.
+h(2, "model") Model
+tag experimental
p
| Initialise a new model and its data directory. For more info on this, see
| the documentation on #[+a("/docs/usage/adding-languages") adding languages].
+code(false, "bash").
python -m spacy model [lang] [model_dir] [freqs_data] [clusters_data] [vectors_data]
+table(["Argument", "Type", "Description"])
+row
+cell #[code lang]
+cell positional
+cell Model language.
+row
+cell #[code model_dir]
+cell positional
+cell Output directory to store the model in.
+row
+cell #[code freqs_data]
+cell positional
+cell Tab-separated frequencies file.
+row
+cell #[code clusters_data]
+cell positional
+cell Brown clusters file (optional).
+row
+cell #[code vectors_data]
+cell positional
+cell Word vectors file (optional).
+row
+cell #[code --help], #[code -h]
+cell flag
+cell Show help message and available arguments.
+h(2, "train") Train
+tag experimental
p
| Train a model. Expects data in spaCy's
@ -231,7 +195,7 @@ p
+cell Number of iterations (default: #[code 15]).
+row
+cell #[code --nsents]
+cell #[code --n_sents], #[code -ns]
+cell option
+cell Number of sentences (default: #[code 0]).
@ -241,7 +205,7 @@ p
+cell L1 regularization penalty for parser (default: #[code 0.0]).
+row
+cell #[code --use-gpu], #[code -g]
+cell #[code --use-gpu], #[code -G]
+cell flag
+cell Use GPU.
@ -266,17 +230,16 @@ p
+cell Show help message and available arguments.
+h(2, "package") Package
+tag experimental
p
| Generate a #[+a("/docs/usage/saving-loading#generating") model Python package]
| from an existing model data directory. All data files are copied over.
| If the path to a meta.json is supplied, or a meta.json is found in the
| input directory, this file is used. Otherwise, the data can be entered
| directly from the command line. While this feature is still experimental,
| the required file templates are downloaded from
| #[+src(gh("spacy-dev-resources", "templates/model")) GitHub]. This means
| you need to be connected to the internet to use this command.
| directly from the command line. The required file templates are downloaded
| from #[+src(gh("spacy-dev-resources", "templates/model")) GitHub] to make
| sure you're always using the latest versions. This means you need to be
| connected to the internet to use this command.
+code(false, "bash").
python -m spacy package [input_dir] [output_dir] [--meta] [--force]

View File

@ -0,0 +1,23 @@
//- 💫 DOCS > API > GOLDCORPUS
include ../../_includes/_mixins
p
| An annotated corpus, using the JSON file format. Manages annotations for
| tagging, dependency parsing and NER.
+h(2, "init") GoldCorpus.__init__
+tag method
p Create a #[code GoldCorpus].
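
//- Illustrative example; the file names are placeholders
+aside-code("Example").
    from spacy.gold import GoldCorpus
    corpus = GoldCorpus('train.json', 'dev.json')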
+table(["Name", "Type", "Description"])
+row
+cell #[code train_path]
+cell unicode or #[code Path]
+cell File or directory of training data.
+row
+cell #[code dev_path]
+cell unicode or #[code Path]
+cell File or directory of development data.

View File

@ -7,7 +7,7 @@ p Collection for training annotations.
+h(2, "init") GoldParse.__init__
+tag method
p Create a GoldParse.
p Create a #[code GoldParse].
+table(["Name", "Type", "Description"])
+row

View File

@ -82,6 +82,41 @@ p
+cell #[code Doc]
+cell A container for accessing the annotations.
+h(2, "pipe") Language.pipe
+tag method
p
| Process texts as a stream, and yield #[code Doc] objects in order.
| Supports GIL-free multi-threading.
+aside-code("Example").
texts = [u'One document.', u'...', u'Lots of documents']
for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
assert doc.is_parsed
+table(["Name", "Type", "Description"])
+row
+cell #[code texts]
+cell -
+cell A sequence of unicode objects.
+row
+cell #[code n_threads]
+cell int
+cell
| The number of worker threads to use. If #[code -1], OpenMP will
| decide how many to use at run time. Default is #[code 2].
+row
+cell #[code batch_size]
+cell int
+cell The number of texts to buffer.
+footrow
+cell yields
+cell #[code Doc]
+cell Documents in the order of the original text.
+h(2, "update") Language.update
+tag method
@ -172,40 +207,23 @@ p
+cell -
+cell Config parameters.
+h(2, "pipe") Language.pipe
+tag method
+h(2, "preprocess_gold") Language.preprocess_gold
p
| Process texts as a stream, and yield #[code Doc] objects in order.
| Supports GIL-free multi-threading.
| Can be called before training to pre-process gold data. By default, it
| handles nonprojectivity and adds missing tags to the tag map.
+aside-code("Example").
texts = [u'One document.', u'...', u'Lots of documents']
for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
assert doc.is_parsed
+table(["Name", "Type", "Description"])
+row
+cell #[code texts]
+cell -
+cell A sequence of unicode objects.
+row
+cell #[code n_threads]
+cell int
+cell
| The number of worker threads to use. If #[code -1], OpenMP will
| decide how many to use at run time. Default is #[code 2].
+row
+cell #[code batch_size]
+cell int
+cell The number of texts to buffer.
+cell #[code docs_golds]
+cell iterable
+cell Tuples of #[code Doc] and #[code GoldParse] objects.
+footrow
+cell yields
+cell #[code Doc]
+cell Documents in the order of the original text.
+cell tuple
+cell Tuples of #[code Doc] and #[code GoldParse] objects.
+h(2, "to_disk") Language.to_disk
+tag method