diff --git a/spacy/__main__.py b/spacy/__main__.py
index 69672c4b3..214a7b617 100644
--- a/spacy/__main__.py
+++ b/spacy/__main__.py
@@ -3,127 +3,21 @@
 from __future__ import print_function
 # NB! This breaks in plac on Python 2!!
 #from __future__ import unicode_literals
 
-import plac
-from spacy.cli import download as cli_download
-from spacy.cli import link as cli_link
-from spacy.cli import info as cli_info
-from spacy.cli import package as cli_package
-from spacy.cli import train as cli_train
-from spacy.cli import model as cli_model
-from spacy.cli import convert as cli_convert
-
-
-@plac.annotations(
-    model=("model to download (shortcut or model name)", "positional", None, str),
-    direct=("force direct download. Needs model name with version and won't "
-            "perform compatibility check", "flag", "d", bool)
-)
-def download(model, direct=False):
-    """
-    Download compatible model from default download path using pip. Model
-    can be shortcut, model name or, if --direct flag is set, full model name
-    with version.
-    """
-    cli_download(model, direct)
-
-
-@plac.annotations(
-    origin=("package name or local path to model", "positional", None, str),
-    link_name=("name of shortuct link to create", "positional", None, str),
-    force=("force overwriting of existing link", "flag", "f", bool)
-)
-def link(origin, link_name, force=False):
-    """
-    Create a symlink for models within the spacy/data directory. Accepts
-    either the name of a pip package, or the local path to the model data
-    directory. Linking models allows loading them via spacy.load(link_name).
-    """
-    cli_link(origin, link_name, force)
-
-
-@plac.annotations(
-    model=("optional: shortcut link of model", "positional", None, str),
-    markdown=("generate Markdown for GitHub issues", "flag", "md", str)
-)
-def info(model=None, markdown=False):
-    """
-    Print info about spaCy installation. If a model shortcut link is
-    speficied as an argument, print model information. Flag --markdown
-    prints details in Markdown for easy copy-pasting to GitHub issues.
-    """
-    cli_info(model, markdown)
-
-
-@plac.annotations(
-    input_dir=("directory with model data", "positional", None, str),
-    output_dir=("output parent directory", "positional", None, str),
-    meta=("path to meta.json", "option", "m", str),
-    force=("force overwriting of existing folder in output directory", "flag", "f", bool)
-)
-def package(input_dir, output_dir, meta=None, force=False):
-    """
-    Generate Python package for model data, including meta and required
-    installation files. A new directory will be created in the specified
-    output directory, and model data will be copied over.
-    """
-    cli_package(input_dir, output_dir, meta, force)
-
-
-@plac.annotations(
-    input_file=("input file", "positional", None, str),
-    output_dir=("output directory for converted file", "positional", None, str),
-    n_sents=("Number of sentences per doc", "option", "n", float),
-    morphology=("Enable appending morphology to tags", "flag", "m", bool)
-)
-def convert(input_file, output_dir, n_sents=10, morphology=False):
-    """
-    Convert files into JSON format for use with train command and other
-    experiment management functions.
-    """
-    cli_convert(input_file, output_dir, n_sents, morphology)
-
-
-@plac.annotations(
-    lang=("model language", "positional", None, str),
-    output_dir=("output directory to store model in", "positional", None, str),
-    train_data=("location of JSON-formatted training data", "positional", None, str),
-    dev_data=("location of JSON-formatted development data (optional)", "positional", None, str),
-    n_iter=("number of iterations", "option", "n", int),
-    nsents=("number of sentences", "option", None, int),
-    use_gpu=("Use GPU", "flag", "g", bool),
-    no_tagger=("Don't train tagger", "flag", "T", bool),
-    no_parser=("Don't train parser", "flag", "P", bool),
-    no_entities=("Don't train NER", "flag", "N", bool)
-)
-def train(lang, output_dir, train_data, dev_data=None, n_iter=15,
-          nsents=0, use_gpu=False,
-          no_tagger=False, no_parser=False, no_entities=False):
-    """
-    Train a model. Expects data in spaCy's JSON format.
-    """
-    nsents = nsents or None
-    cli_train(lang, output_dir, train_data, dev_data, n_iter, nsents,
-              use_gpu, no_tagger, no_parser, no_entities)
-
 
 if __name__ == '__main__':
     import plac
     import sys
-    commands = {
-        'train': train,
-        'convert': convert,
-        'download': download,
-        'link': link,
-        'info': info,
-        'package': package,
-    }
+    from spacy.cli import download, link, info, package, train, convert
+    from spacy.util import prints
+
+    commands = {'download': download, 'link': link, 'info': info, 'train': train,
+                'convert': convert, 'package': package}
     if len(sys.argv) == 1:
-        print("Available commands: %s" % ', '.join(sorted(commands)))
-        sys.exit(1)
+        prints(', '.join(commands), title="Available commands", exits=1)
     command = sys.argv.pop(1)
     sys.argv[0] = 'spacy %s' % command
     if command in commands:
         plac.call(commands[command])
     else:
-        print("Unknown command: %s. Available: %s" % (command, ', '.join(commands)))
-        sys.exit(1)
+        prints("Available: %s" % ', '.join(commands),
+               title="Unknown command: %s" % command, exits=1)
diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index 4ec6fe678..2b4f98a88 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -3,5 +3,4 @@ from .info import info
 from .link import link
 from .package import package
 from .train import train
-from .model import model
 from .convert import convert
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index 0b2800205..c7730ab9e 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -1,6 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
 
+import plac
 from pathlib import Path
 
 from .converters import conllu2json, iob2json
@@ -18,15 +19,24 @@ CONVERTERS = {
 }
 
 
-def convert(input_file, output_dir, *args):
+@plac.annotations(
+    input_file=("input file", "positional", None, str),
+    output_dir=("output directory for converted file", "positional", None, str),
+    n_sents=("Number of sentences per doc", "option", "n", int),
+    morphology=("Enable appending morphology to tags", "flag", "m", bool)
+)
+def convert(input_file, output_dir, n_sents=10, morphology=False):
+    """Convert files into JSON format for use with train command and other
+    experiment management functions.
+    """
    input_path = Path(input_file)
    output_path = Path(output_dir)
    if not input_path.exists():
-        prints(input_path, title="Input file not found", exits=True)
+        prints(input_path, title="Input file not found", exits=1)
    if not output_path.exists():
-        prints(output_path, title="Output directory not found", exits=True)
+        prints(output_path, title="Output directory not found", exits=1)
    file_ext = input_path.suffix
    if not file_ext in CONVERTERS:
        prints("Can't find converter for %s" % input_path.parts[-1],
-               title="Unknown format", exits=True)
-    CONVERTERS[file_ext](input_path, output_path, *args)
+               title="Unknown format", exits=1)
+    CONVERTERS[file_ext](input_path, output_path, n_sents, morphology)
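
For readers unfamiliar with plac, the decorator pattern that now appears in each CLI module above is easy to try in isolation. A minimal, hypothetical sketch — the `greet` command and its flag are invented for illustration and are not part of this patch:

    import plac

    @plac.annotations(
        # each value is a (help text, kind, abbreviation, type) tuple
        name=("name to greet", "positional", None, str),
        shout=("print the greeting in upper case", "flag", "s", bool)
    )
    def greet(name, shout=False):
        """Greet someone, e.g. `python greet.py World -s`."""
        message = "Hello, %s!" % name
        print(message.upper() if shout else message)

    if __name__ == '__main__':
        plac.call(greet)  # parses sys.argv against the annotated signature

This is the same mechanism `spacy/__main__.py` relies on after this change: the dispatcher only has to look up the annotated function and hand it to `plac.call`.
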
- """ - cli_convert(input_file, output_dir, n_sents, morphology) - - -@plac.annotations( - lang=("model language", "positional", None, str), - output_dir=("output directory to store model in", "positional", None, str), - train_data=("location of JSON-formatted training data", "positional", None, str), - dev_data=("location of JSON-formatted development data (optional)", "positional", None, str), - n_iter=("number of iterations", "option", "n", int), - nsents=("number of sentences", "option", None, int), - use_gpu=("Use GPU", "flag", "g", bool), - no_tagger=("Don't train tagger", "flag", "T", bool), - no_parser=("Don't train parser", "flag", "P", bool), - no_entities=("Don't train NER", "flag", "N", bool) -) -def train(lang, output_dir, train_data, dev_data=None, n_iter=15, - nsents=0, use_gpu=False, - no_tagger=False, no_parser=False, no_entities=False): - """ - Train a model. Expects data in spaCy's JSON format. - """ - nsents = nsents or None - cli_train(lang, output_dir, train_data, dev_data, n_iter, nsents, - use_gpu, no_tagger, no_parser, no_entities) - if __name__ == '__main__': import plac import sys - commands = { - 'train': train, - 'convert': convert, - 'download': download, - 'link': link, - 'info': info, - 'package': package, - } + from spacy.cli import download, link, info, package, train, convert + from spacy.util import prints + + commands = {'download': download, 'link': link, 'info': info, 'train': train, + 'convert': convert, 'package': package} if len(sys.argv) == 1: - print("Available commands: %s" % ', '.join(sorted(commands))) - sys.exit(1) + prints(', '.join(commands), title="Available commands", exits=1) command = sys.argv.pop(1) sys.argv[0] = 'spacy %s' % command if command in commands: plac.call(commands[command]) else: - print("Unknown command: %s. Available: %s" % (command, ', '.join(commands))) - sys.exit(1) + prints("Available: %s" % ', '.join(commands), + title="Unknown command: %s" % command, exits=1) diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 4ec6fe678..2b4f98a88 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -3,5 +3,4 @@ from .info import info from .link import link from .package import package from .train import train -from .model import model from .convert import convert diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index 0b2800205..c7730ab9e 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -1,6 +1,7 @@ # coding: utf8 from __future__ import unicode_literals +import plac from pathlib import Path from .converters import conllu2json, iob2json @@ -18,15 +19,24 @@ CONVERTERS = { } -def convert(input_file, output_dir, *args): +@plac.annotations( + input_file=("input file", "positional", None, str), + output_dir=("output directory for converted file", "positional", None, str), + n_sents=("Number of sentences per doc", "option", "n", float), + morphology=("Enable appending morphology to tags", "flag", "m", bool) +) +def convert(input_file, output_dir, n_sents, morphology): + """Convert files into JSON format for use with train command and other + experiment management functions. 
+ """ input_path = Path(input_file) output_path = Path(output_dir) if not input_path.exists(): - prints(input_path, title="Input file not found", exits=True) + prints(input_path, title="Input file not found", exits=1) if not output_path.exists(): - prints(output_path, title="Output directory not found", exits=True) + prints(output_path, title="Output directory not found", exits=1) file_ext = input_path.suffix if not file_ext in CONVERTERS: prints("Can't find converter for %s" % input_path.parts[-1], - title="Unknown format", exits=True) + title="Unknown format", exits=1) CONVERTERS[file_ext](input_path, output_path, *args) diff --git a/spacy/cli/download.py b/spacy/cli/download.py index d6f151c93..fdcacb891 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -1,6 +1,7 @@ # coding: utf8 from __future__ import unicode_literals +import plac import requests import os import subprocess @@ -11,7 +12,16 @@ from ..util import prints from .. import about +@plac.annotations( + model=("model to download (shortcut or model name)", "positional", None, str), + direct=("force direct download. Needs model name with version and won't " + "perform compatibility check", "flag", "d", bool) +) def download(model, direct=False): + """Download compatible model from default download path using pip. Model + can be shortcut, model name or, if --direct flag is set, full model name + with version. + """ if direct: download_model('{m}/{m}.tar.gz'.format(m=model)) else: @@ -38,7 +48,7 @@ def get_json(url, desc): if r.status_code != 200: prints("Couldn't fetch %s. Please find a model for your spaCy installation " "(v%s), and download it manually." % (desc, about.__version__), - about.__docs_models__, title="Server error (%d)" % r.status_code, exits=True) + about.__docs_models__, title="Server error (%d)" % r.status_code, exits=1) return r.json() @@ -48,7 +58,7 @@ def get_compatibility(): comp = comp_table['spacy'] if version not in comp: prints("No compatible models found for v%s of spaCy." % version, - title="Compatibility error", exits=True) + title="Compatibility error", exits=1) return comp[version] @@ -56,7 +66,7 @@ def get_version(model, comp): if model not in comp: version = about.__version__ prints("No compatible model found for '%s' (spaCy v%s)." % (model, version), - title="Compatibility error", exits=True) + title="Compatibility error", exits=1) return comp[model][0] diff --git a/spacy/cli/info.py b/spacy/cli/info.py index f55d76a2c..6f7467521 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -1,6 +1,7 @@ # coding: utf8 from __future__ import unicode_literals +import plac import platform from pathlib import Path @@ -9,7 +10,15 @@ from .. import about from .. import util +@plac.annotations( + model=("optional: shortcut link of model", "positional", None, str), + markdown=("generate Markdown for GitHub issues", "flag", "md", str) +) def info(model=None, markdown=False): + """Print info about spaCy installation. If a model shortcut link is + speficied as an argument, print model information. Flag --markdown + prints details in Markdown for easy copy-pasting to GitHub issues. 
+ """ if model: model_path = util.resolve_model_path(model) meta = util.parse_package_meta(model_path) diff --git a/spacy/cli/link.py b/spacy/cli/link.py index 20d0473a3..1feef8bce 100644 --- a/spacy/cli/link.py +++ b/spacy/cli/link.py @@ -1,24 +1,35 @@ # coding: utf8 from __future__ import unicode_literals +import plac from pathlib import Path + from ..compat import symlink_to, path2str from ..util import prints from .. import util +@plac.annotations( + origin=("package name or local path to model", "positional", None, str), + link_name=("name of shortuct link to create", "positional", None, str), + force=("force overwriting of existing link", "flag", "f", bool) +) def link(origin, link_name, force=False): + """Create a symlink for models within the spacy/data directory. Accepts + either the name of a pip package, or the local path to the model data + directory. Linking models allows loading them via spacy.load(link_name). + """ if util.is_package(origin): model_path = util.get_model_package_path(origin) else: model_path = Path(origin) if not model_path.exists(): prints("The data should be located in %s" % path2str(model_path), - title="Can't locate model data", exits=True) + title="Can't locate model data", exits=1) link_path = util.get_data_path() / link_name if link_path.exists() and not force: prints("To overwrite an existing link, use the --force flag.", - title="Link %s already exists" % link_name, exits=True) + title="Link %s already exists" % link_name, exits=1) elif link_path.exists(): link_path.unlink() try: diff --git a/spacy/cli/model.py b/spacy/cli/model.py deleted file mode 100644 index c69499f50..000000000 --- a/spacy/cli/model.py +++ /dev/null @@ -1,122 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -import gzip -import math -from ast import literal_eval -from preshed.counter import PreshCounter - -from ..vocab import write_binary_vectors -from ..compat import fix_text, path2str -from ..util import prints -from .. 
import util - - -def model(lang, model_dir, freqs_data, clusters_data, vectors_data): - model_path = util.ensure_path(model_dir) - freqs_path = util.ensure_path(freqs_data) - clusters_path = util.ensure_path(clusters_data) - vectors_path = util.ensure_path(vectors_data) - if not freqs_path.is_file(): - prints(freqs_path, title="No frequencies file found", exits=True) - if clusters_path and not clusters_path.is_file(): - prints(clusters_path, title="No Brown clusters file found", exits=True) - if vectors_path and not vectors_path.is_file(): - prints(vectors_path, title="No word vectors file found", exits=True) - vocab = util.get_lang_class(lang).Defaults.create_vocab() - probs, oov_prob = read_probs(freqs_path) - clusters = read_clusters(clusters_path) if clusters_path else {} - populate_vocab(vocab, clusters, probs, oov_prob) - create_model(model_path, vectors_path, vocab, oov_prob) - - -def create_model(model_path, vectors_path, vocab, oov_prob): - vocab_path = model_path / 'vocab' - lexemes_path = vocab_path / 'lexemes.bin' - strings_path = vocab_path / 'strings.json' - oov_path = vocab_path / 'oov_prob' - - if not model_path.exists(): - model_path.mkdir() - if not vocab_path.exists(): - vocab_path.mkdir() - vocab.dump(path2str(lexemes_path)) - with strings_path.open('w') as f: - vocab.strings.dump(f) - with oov_path.open('w') as f: - f.write('%f' % oov_prob) - if vectors_path: - vectors_dest = vocab_path / 'vec.bin' - write_binary_vectors(path2str(vectors_path), path2str(vectors_dest)) - - -def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200): - counts = PreshCounter() - total = 0 - freqs_file = check_unzip(freqs_path) - for i, line in enumerate(freqs_file): - freq, doc_freq, key = line.rstrip().split('\t', 2) - freq = int(freq) - counts.inc(i+1, freq) - total += freq - counts.smooth() - log_total = math.log(total) - freqs_file = check_unzip(freqs_path) - probs = {} - for line in freqs_file: - freq, doc_freq, key = line.rstrip().split('\t', 2) - doc_freq = int(doc_freq) - freq = int(freq) - if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length: - word = literal_eval(key) - smooth_count = counts.smoother(int(freq)) - probs[word] = math.log(smooth_count) - log_total - oov_prob = math.log(counts.smoother(0)) - log_total - return probs, oov_prob - - -def read_clusters(clusters_path): - clusters = {} - with clusters_path.open() as f: - for line in f: - try: - cluster, word, freq = line.split() - word = fix_text(word) - except ValueError: - continue - # If the clusterer has only seen the word a few times, its - # cluster is unreliable. - if int(freq) >= 3: - clusters[word] = cluster - else: - clusters[word] = '0' - # Expand clusters with re-casing - for word, cluster in list(clusters.items()): - if word.lower() not in clusters: - clusters[word.lower()] = cluster - if word.title() not in clusters: - clusters[word.title()] = cluster - if word.upper() not in clusters: - clusters[word.upper()] = cluster - return clusters - - -def populate_vocab(vocab, clusters, probs, oov_prob): - for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])): - lexeme = vocab[word] - lexeme.prob = prob - lexeme.is_oov = False - # Decode as a little-endian string, so that we can do & 15 to get - # the first 4 bits. 
See _parse_features.pyx - if word in clusters: - lexeme.cluster = int(clusters[word][::-1], 2) - else: - lexeme.cluster = 0 - - -def check_unzip(file_path): - file_path_str = path2str(file_path) - if file_path_str.endswith('gz'): - return gzip.open(file_path_str) - else: - return file_path.open() diff --git a/spacy/cli/package.py b/spacy/cli/package.py index e6366c44e..9acd0a2fa 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -1,6 +1,7 @@ # coding: utf8 from __future__ import unicode_literals +import plac import shutil import requests from pathlib import Path @@ -11,16 +12,26 @@ from .. import util from .. import about -def package(input_dir, output_dir, meta_path, force): +@plac.annotations( + input_dir=("directory with model data", "positional", None, str), + output_dir=("output parent directory", "positional", None, str), + meta=("path to meta.json", "option", "m", str), + force=("force overwriting of existing folder in output directory", "flag", "f", bool) +) +def package(input_dir, output_dir, meta, force): + """Generate Python package for model data, including meta and required + installation files. A new directory will be created in the specified + output directory, and model data will be copied over. + """ input_path = util.ensure_path(input_dir) output_path = util.ensure_path(output_dir) - meta_path = util.ensure_path(meta_path) + meta_path = util.ensure_path(meta) if not input_path or not input_path.exists(): - prints(input_path, title="Model directory not found", exits=True) + prints(input_path, title="Model directory not found", exits=1) if not output_path or not output_path.exists(): - prints(output_path, title="Output directory not found", exits=True) + prints(output_path, title="Output directory not found", exits=1) if meta_path and not meta_path.exists(): - prints(meta_path, title="meta.json not found", exits=True) + prints(meta_path, title="meta.json not found", exits=1) template_setup = get_template('setup.py') template_manifest = get_template('MANIFEST.in') @@ -55,7 +66,7 @@ def create_dirs(package_path, force): else: prints(package_path, "Please delete the directory and try again, or " "use the --force flag to overwrite existing directories.", - title="Package directory already exists", exits=True) + title="Package directory already exists", exits=1) Path.mkdir(package_path, parents=True) @@ -87,12 +98,12 @@ def validate_meta(meta, keys): for key in keys: if key not in meta or meta[key] == '': prints("This setting is required to build your package.", - title='No "%s" setting found in meta.json' % key, exits=True) + title='No "%s" setting found in meta.json' % key, exits=1) def get_template(filepath): r = requests.get(about.__model_files__ + filepath) if r.status_code != 200: prints("Couldn't fetch template files from GitHub.", - title="Server error (%d)" % r.status_code, exits=True) + title="Server error (%d)" % r.status_code, exits=1) return r.text diff --git a/spacy/cli/train.py b/spacy/cli/train.py index a25a7f252..a9a5cd536 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -1,6 +1,7 @@ # coding: utf8 from __future__ import unicode_literals, division, print_function +import plac import json from collections import defaultdict import cytoolz @@ -18,19 +19,33 @@ from .. import util from .. 
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 45b95b379..bc34290f4 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -142,9 +142,14 @@ def _min_edit_path(cand_words, gold_words):
 
 
 class GoldCorpus(object):
-    '''An annotated corpus, using the JSON file format. Manages
-    annotations for tagging, dependency parsing, NER.'''
+    """An annotated corpus, using the JSON file format. Manages
+    annotations for tagging, dependency parsing and NER."""
     def __init__(self, train_path, dev_path):
+        """Create a GoldCorpus.
+
+        train_path (unicode or Path): File or directory of training data.
+        dev_path (unicode or Path): File or directory of development data.
+        """
         self.train_path = util.ensure_path(train_path)
         self.dev_path = util.ensure_path(dev_path)
         self.train_locs = self.walk_corpus(self.train_path)
diff --git a/spacy/language.py b/spacy/language.py
index 58cee80ac..37f7ae207 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -236,6 +236,12 @@ class Language(object):
             doc.tensor = None
 
     def preprocess_gold(self, docs_golds):
+        """Can be called before training to pre-process gold data. By default,
+        it handles nonprojectivity and adds missing tags to the tag map.
+
+        docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects.
+        YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects.
+        """
         for proc in self.pipeline:
             if hasattr(proc, 'preprocess_gold'):
                 docs_golds = proc.preprocess_gold(docs_golds)
diff --git a/spacy/util.py b/spacy/util.py
index 6d406a36a..f27df54a8 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -380,13 +380,13 @@ def prints(*texts, **kwargs):
     *texts (unicode): Texts to print. Each argument is rendered as paragraph.
-    **kwargs: 'title' becomes coloured headline. 'exits'=True performs sys exit.
+    **kwargs: 'title' becomes coloured headline. 'exits' performs sys exit with given code.
     """
-    exits = kwargs.get('exits', False)
+    exits = kwargs.get('exits', None)
     title = kwargs.get('title', None)
     title = '\033[93m{}\033[0m\n'.format(_wrap(title)) if title else ''
     message = '\n\n'.join([_wrap(text) for text in texts])
     print('\n{}{}\n'.format(title, message))
-    if exits:
-        sys.exit(0)
+    if exits is not None:
+        sys.exit(exits)
 
 
 def _wrap(text, wrap_max=80, indent=4):
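
The `prints` change above is what lets all the CLI error paths exit with a meaningful status: `exits` now carries the exit code itself instead of a boolean, so failures can return 1 rather than always exiting with 0. A quick illustrative call (the message text is hypothetical):

    from spacy.util import prints

    # prints a coloured headline plus wrapped body, then calls sys.exit(1);
    # omit `exits` entirely to just print without terminating
    prints("Some helpful details.", title="Something went wrong", exits=1)
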
""" - exits = kwargs.get('exits', False) + exits = kwargs.get('exits', None) title = kwargs.get('title', None) title = '\033[93m{}\033[0m\n'.format(_wrap(title)) if title else '' message = '\n\n'.join([_wrap(text) for text in texts]) print('\n{}{}\n'.format(title, message)) - if exits: - sys.exit(0) + if exits is not None: + sys.exit(exits) def _wrap(text, wrap_max=80, indent=4): diff --git a/website/docs/api/_data.json b/website/docs/api/_data.json index 900a42553..443ee9a67 100644 --- a/website/docs/api/_data.json +++ b/website/docs/api/_data.json @@ -23,7 +23,8 @@ "Lexeme": "lexeme", "Vocab": "vocab", "StringStore": "stringstore", - "GoldParse": "goldparse" + "GoldParse": "goldparse", + "GoldCorpus": "goldcorpus" }, "Other": { "Annotation Specs": "annotation", @@ -135,6 +136,11 @@ "tag": "class" }, + "goldcorpus": { + "title": "GoldCorpus", + "tag": "class" + }, + "annotation": { "title": "Annotation Specifications" }, diff --git a/website/docs/api/cli.jade b/website/docs/api/cli.jade index d600bf5f0..b78d4b7c9 100644 --- a/website/docs/api/cli.jade +++ b/website/docs/api/cli.jade @@ -5,16 +5,23 @@ include ../../_includes/_mixins p | As of v1.7.0, spaCy comes with new command line helpers to download and | link models and show useful debugging information. For a list of available - | commands, type #[code python -m spacy --help]. + | commands, type #[code python -m spacy]. To make the command even more + | convenient, we recommend + | #[+a("https://askubuntu.com/questions/17536/how-do-i-create-a-permanent-bash-alias/17537#17537") creating an alias] + | mapping #[code python -m spacy] to #[code spacy]. +aside("Why python -m?") | The problem with a global entry point is that it's resolved by looking up | entries in your #[code PATH] environment variable. This can give you - | unexpected results, like executing the wrong spaCy installation - | (especially when using #[code virtualenv]). #[code python -m] prevents - | fallbacks to system modules and makes sure the correct spaCy version is - | used. If you hate typing it every time, we recommend creating an - | #[code alias] instead. + | unexpected results, like executing the wrong spaCy installation. + | #[code python -m] prevents fallbacks to system modules. + ++infobox("⚠️ Deprecation note") + | As of spaCy 2.0, the #[code model] command to initialise a model data + | directory is deprecated. The command was only necessary because previous + | versions of spaCy expected a model directory to already be set up. This + | has since been changed, so you can use the #[+api("cli#train") #[code train]] + | command straight away. +h(2, "download") Download @@ -45,7 +52,7 @@ p +cell flag +cell Show help message and available arguments. -+infobox("Important note") ++aside("Downloading best practices") | The #[code download] command is mostly intended as a convenient, | interactive wrapper – it performs compatibility checks and prints | detailed messages in case things go wrong. It's #[strong not recommended] @@ -116,7 +123,6 @@ p +cell Show help message and available arguments. +h(2, "convert") Convert - +tag experimental p | Convert files into spaCy's #[+a("/docs/api/annotation#json-input") JSON format] @@ -153,49 +159,7 @@ p +cell flag +cell Show help message and available arguments. -+h(2, "model") Model - +tag experimental - -p - | Initialise a new model and its data directory. For more info on this, see - | the documentation on #[+a("/docs/usage/adding-languages") adding languages]. - -+code(false, "bash"). 
diff --git a/website/docs/api/goldcorpus.jade b/website/docs/api/goldcorpus.jade
new file mode 100644
index 000000000..bfff92ad5
--- /dev/null
+++ b/website/docs/api/goldcorpus.jade
@@ -0,0 +1,23 @@
+//- 💫 DOCS > API > GOLDCORPUS
+
+include ../../_includes/_mixins
+
+p
+    | An annotated corpus, using the JSON file format. Manages annotations for
+    | tagging, dependency parsing and NER.
+
++h(2, "init") GoldCorpus.__init__
+    +tag method
+
+p Create a #[code GoldCorpus].
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code train_path]
+        +cell unicode or #[code Path]
+        +cell File or directory of training data.
+
+    +row
+        +cell #[code dev_path]
+        +cell unicode or #[code Path]
+        +cell File or directory of development data.
diff --git a/website/docs/api/goldparse.jade b/website/docs/api/goldparse.jade
index f39558b35..7818912c3 100644
--- a/website/docs/api/goldparse.jade
+++ b/website/docs/api/goldparse.jade
@@ -7,7 +7,7 @@ p Collection for training annotations.
 +h(2, "init") GoldParse.__init__
     +tag method
 
-p Create a GoldParse.
+p Create a #[code GoldParse].
 
 +table(["Name", "Type", "Description"])
     +row
diff --git a/website/docs/api/language.jade b/website/docs/api/language.jade
index 7f6e0829d..455165bca 100644
--- a/website/docs/api/language.jade
+++ b/website/docs/api/language.jade
@@ -82,6 +82,41 @@ p
         +cell #[code Doc]
         +cell A container for accessing the annotations.
 
++h(2, "pipe") Language.pipe
+    +tag method
+
+p
+    | Process texts as a stream, and yield #[code Doc] objects in order.
+    | Supports GIL-free multi-threading.
+
++aside-code("Example").
+    texts = [u'One document.', u'...', u'Lots of documents']
+    for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
+        assert doc.is_parsed
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code texts]
+        +cell -
+        +cell A sequence of unicode objects.
+
+    +row
+        +cell #[code n_threads]
+        +cell int
+        +cell
+            | The number of worker threads to use. If #[code -1], OpenMP will
+            | decide how many to use at run time. Default is #[code 2].
+
+    +row
+        +cell #[code batch_size]
+        +cell int
+        +cell The number of texts to buffer.
+
+    +footrow
+        +cell yields
+        +cell #[code Doc]
+        +cell Documents in the order of the original text.
+
 +h(2, "update") Language.update
     +tag method
 
@@ -172,40 +207,23 @@ p
         +cell -
         +cell Config parameters.
 
-+h(2, "pipe") Language.pipe
-    +tag method
++h(2, "preprocess_gold") Language.preprocess_gold
 
 p
-    | Process texts as a stream, and yield #[code Doc] objects in order.
-    | Supports GIL-free multi-threading.
+    | Can be called before training to pre-process gold data. By default, it
+    | handles nonprojectivity and adds missing tags to the tag map.
 
-+aside-code("Example").
-    texts = [u'One document.', u'...', u'Lots of documents']
-    for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
-        assert doc.is_parsed
-
 +table(["Name", "Type", "Description"])
     +row
-        +cell #[code texts]
-        +cell -
-        +cell A sequence of unicode objects.
-
-    +row
-        +cell #[code n_threads]
-        +cell int
-        +cell
-            | The number of worker threads to use. If #[code -1], OpenMP will
-            | decide how many to use at run time. Default is #[code 2].
-
-    +row
-        +cell #[code batch_size]
-        +cell int
-        +cell The number of texts to buffer.
+        +cell #[code docs_golds]
+        +cell iterable
+        +cell Tuples of #[code Doc] and #[code GoldParse] objects.
 
     +footrow
         +cell yields
-        +cell #[code Doc]
-        +cell Documents in the order of the original text.
+        +cell tuple
+        +cell Tuples of #[code Doc] and #[code GoldParse] objects.
 
 +h(2, "to_disk") Language.to_disk
     +tag method