mirror of https://github.com/explosion/spaCy.git
synced 2025-11-04 01:48:04 +03:00

Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in: commit 89ebc5c3cd
@@ -3,127 +3,21 @@ from __future__ import print_function
 # NB! This breaks in plac on Python 2!!
 #from __future__ import unicode_literals
 
-import plac
-from spacy.cli import download as cli_download
-from spacy.cli import link as cli_link
-from spacy.cli import info as cli_info
-from spacy.cli import package as cli_package
-from spacy.cli import train as cli_train
-from spacy.cli import model as cli_model
-from spacy.cli import convert as cli_convert
-
-
-@plac.annotations(
-    model=("model to download (shortcut or model name)", "positional", None, str),
-    direct=("force direct download. Needs model name with version and won't "
-            "perform compatibility check", "flag", "d", bool)
-)
-def download(model, direct=False):
-    """
-    Download compatible model from default download path using pip. Model
-    can be shortcut, model name or, if --direct flag is set, full model name
-    with version.
-    """
-    cli_download(model, direct)
-
-
-@plac.annotations(
-    origin=("package name or local path to model", "positional", None, str),
-    link_name=("name of shortuct link to create", "positional", None, str),
-    force=("force overwriting of existing link", "flag", "f", bool)
-)
-def link(origin, link_name, force=False):
-    """
-    Create a symlink for models within the spacy/data directory. Accepts
-    either the name of a pip package, or the local path to the model data
-    directory. Linking models allows loading them via spacy.load(link_name).
-    """
-    cli_link(origin, link_name, force)
-
-
-@plac.annotations(
-    model=("optional: shortcut link of model", "positional", None, str),
-    markdown=("generate Markdown for GitHub issues", "flag", "md", str)
-)
-def info(model=None, markdown=False):
-    """
-    Print info about spaCy installation. If a model shortcut link is
-    speficied as an argument, print model information. Flag --markdown
-    prints details in Markdown for easy copy-pasting to GitHub issues.
-    """
-    cli_info(model, markdown)
-
-
-@plac.annotations(
-    input_dir=("directory with model data", "positional", None, str),
-    output_dir=("output parent directory", "positional", None, str),
-    meta=("path to meta.json", "option", "m", str),
-    force=("force overwriting of existing folder in output directory", "flag", "f", bool)
-)
-def package(input_dir, output_dir, meta=None, force=False):
-    """
-    Generate Python package for model data, including meta and required
-    installation files. A new directory will be created in the specified
-    output directory, and model data will be copied over.
-    """
-    cli_package(input_dir, output_dir, meta, force)
-
-
-@plac.annotations(
-    input_file=("input file", "positional", None, str),
-    output_dir=("output directory for converted file", "positional", None, str),
-    n_sents=("Number of sentences per doc", "option", "n", float),
-    morphology=("Enable appending morphology to tags", "flag", "m", bool)
-)
-def convert(input_file, output_dir, n_sents=10, morphology=False):
-    """
-    Convert files into JSON format for use with train command and other
-    experiment management functions.
-    """
-    cli_convert(input_file, output_dir, n_sents, morphology)
-
-
-@plac.annotations(
-    lang=("model language", "positional", None, str),
-    output_dir=("output directory to store model in", "positional", None, str),
-    train_data=("location of JSON-formatted training data", "positional", None, str),
-    dev_data=("location of JSON-formatted development data (optional)", "positional", None, str),
-    n_iter=("number of iterations", "option", "n", int),
-    nsents=("number of sentences", "option", None, int),
-    use_gpu=("Use GPU", "flag", "g", bool),
-    no_tagger=("Don't train tagger", "flag", "T", bool),
-    no_parser=("Don't train parser", "flag", "P", bool),
-    no_entities=("Don't train NER", "flag", "N", bool)
-)
-def train(lang, output_dir, train_data, dev_data=None, n_iter=15,
-          nsents=0, use_gpu=False,
-          no_tagger=False, no_parser=False, no_entities=False):
-    """
-    Train a model. Expects data in spaCy's JSON format.
-    """
-    nsents = nsents or None
-    cli_train(lang, output_dir, train_data, dev_data, n_iter, nsents,
-              use_gpu, no_tagger, no_parser, no_entities)
-
-
 if __name__ == '__main__':
     import plac
     import sys
-    commands = {
-        'train': train,
-        'convert': convert,
-        'download': download,
-        'link': link,
-        'info': info,
-        'package': package,
-    }
+    from spacy.cli import download, link, info, package, train, convert
+    from spacy.util import prints
+
+    commands = {'download': download, 'link': link, 'info': info, 'train': train,
+                'convert': convert, 'package': package}
     if len(sys.argv) == 1:
-        print("Available commands: %s" % ', '.join(sorted(commands)))
-        sys.exit(1)
+        prints(', '.join(commands), title="Available commands", exits=1)
     command = sys.argv.pop(1)
     sys.argv[0] = 'spacy %s' % command
     if command in commands:
        plac.call(commands[command])
     else:
-        print("Unknown command: %s. Available: %s" % (command, ', '.join(commands)))
-        sys.exit(1)
+        prints("Available: %s" % ', '.join(commands),
+               title="Unknown command: %s" % command, exits=1)
@@ -3,5 +3,4 @@ from .info import info
 from .link import link
 from .package import package
 from .train import train
-from .model import model
 from .convert import convert
@@ -1,6 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
 
+import plac
 from pathlib import Path
 
 from .converters import conllu2json, iob2json
@@ -18,15 +19,24 @@ CONVERTERS = {
 }
 
 
-def convert(input_file, output_dir, *args):
+@plac.annotations(
+    input_file=("input file", "positional", None, str),
+    output_dir=("output directory for converted file", "positional", None, str),
+    n_sents=("Number of sentences per doc", "option", "n", float),
+    morphology=("Enable appending morphology to tags", "flag", "m", bool)
+)
+def convert(input_file, output_dir, n_sents, morphology):
     """Convert files into JSON format for use with train command and other
     experiment management functions.
     """
     input_path = Path(input_file)
     output_path = Path(output_dir)
     if not input_path.exists():
-        prints(input_path, title="Input file not found", exits=True)
+        prints(input_path, title="Input file not found", exits=1)
     if not output_path.exists():
-        prints(output_path, title="Output directory not found", exits=True)
+        prints(output_path, title="Output directory not found", exits=1)
     file_ext = input_path.suffix
     if not file_ext in CONVERTERS:
         prints("Can't find converter for %s" % input_path.parts[-1],
-               title="Unknown format", exits=True)
+               title="Unknown format", exits=1)
     CONVERTERS[file_ext](input_path, output_path, *args)
@@ -1,6 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
 
+import plac
 import requests
 import os
 import subprocess
@@ -11,7 +12,16 @@ from ..util import prints
 from .. import about
 
 
+@plac.annotations(
+    model=("model to download (shortcut or model name)", "positional", None, str),
+    direct=("force direct download. Needs model name with version and won't "
+            "perform compatibility check", "flag", "d", bool)
+)
 def download(model, direct=False):
+    """Download compatible model from default download path using pip. Model
+    can be shortcut, model name or, if --direct flag is set, full model name
+    with version.
+    """
     if direct:
         download_model('{m}/{m}.tar.gz'.format(m=model))
     else:
@@ -38,7 +48,7 @@ def get_json(url, desc):
     if r.status_code != 200:
         prints("Couldn't fetch %s. Please find a model for your spaCy installation "
                "(v%s), and download it manually." % (desc, about.__version__),
-               about.__docs_models__, title="Server error (%d)" % r.status_code, exits=True)
+               about.__docs_models__, title="Server error (%d)" % r.status_code, exits=1)
     return r.json()
@@ -48,7 +58,7 @@ def get_compatibility():
     comp = comp_table['spacy']
     if version not in comp:
         prints("No compatible models found for v%s of spaCy." % version,
-               title="Compatibility error", exits=True)
+               title="Compatibility error", exits=1)
     return comp[version]
@@ -56,7 +66,7 @@ def get_version(model, comp):
     if model not in comp:
         version = about.__version__
         prints("No compatible model found for '%s' (spaCy v%s)." % (model, version),
-               title="Compatibility error", exits=True)
+               title="Compatibility error", exits=1)
     return comp[model][0]
@@ -1,6 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
 
+import plac
 import platform
 from pathlib import Path
 
@@ -9,7 +10,15 @@ from .. import about
 from .. import util
 
 
+@plac.annotations(
+    model=("optional: shortcut link of model", "positional", None, str),
+    markdown=("generate Markdown for GitHub issues", "flag", "md", str)
+)
 def info(model=None, markdown=False):
+    """Print info about spaCy installation. If a model shortcut link is
+    specified as an argument, print model information. Flag --markdown
+    prints details in Markdown for easy copy-pasting to GitHub issues.
+    """
     if model:
         model_path = util.resolve_model_path(model)
         meta = util.parse_package_meta(model_path)
@@ -1,24 +1,35 @@
 # coding: utf8
 from __future__ import unicode_literals
 
+import plac
 from pathlib import Path
 
 from ..compat import symlink_to, path2str
 from ..util import prints
 from .. import util
 
 
+@plac.annotations(
+    origin=("package name or local path to model", "positional", None, str),
+    link_name=("name of shortcut link to create", "positional", None, str),
+    force=("force overwriting of existing link", "flag", "f", bool)
+)
 def link(origin, link_name, force=False):
+    """Create a symlink for models within the spacy/data directory. Accepts
+    either the name of a pip package, or the local path to the model data
+    directory. Linking models allows loading them via spacy.load(link_name).
+    """
     if util.is_package(origin):
         model_path = util.get_model_package_path(origin)
     else:
         model_path = Path(origin)
     if not model_path.exists():
         prints("The data should be located in %s" % path2str(model_path),
-               title="Can't locate model data", exits=True)
+               title="Can't locate model data", exits=1)
     link_path = util.get_data_path() / link_name
     if link_path.exists() and not force:
         prints("To overwrite an existing link, use the --force flag.",
-               title="Link %s already exists" % link_name, exits=True)
+               title="Link %s already exists" % link_name, exits=1)
     elif link_path.exists():
         link_path.unlink()
     try:
@@ -1,122 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-import gzip
-import math
-from ast import literal_eval
-from preshed.counter import PreshCounter
-
-from ..vocab import write_binary_vectors
-from ..compat import fix_text, path2str
-from ..util import prints
-from .. import util
-
-
-def model(lang, model_dir, freqs_data, clusters_data, vectors_data):
-    model_path = util.ensure_path(model_dir)
-    freqs_path = util.ensure_path(freqs_data)
-    clusters_path = util.ensure_path(clusters_data)
-    vectors_path = util.ensure_path(vectors_data)
-    if not freqs_path.is_file():
-        prints(freqs_path, title="No frequencies file found", exits=True)
-    if clusters_path and not clusters_path.is_file():
-        prints(clusters_path, title="No Brown clusters file found", exits=True)
-    if vectors_path and not vectors_path.is_file():
-        prints(vectors_path, title="No word vectors file found", exits=True)
-    vocab = util.get_lang_class(lang).Defaults.create_vocab()
-    probs, oov_prob = read_probs(freqs_path)
-    clusters = read_clusters(clusters_path) if clusters_path else {}
-    populate_vocab(vocab, clusters, probs, oov_prob)
-    create_model(model_path, vectors_path, vocab, oov_prob)
-
-
-def create_model(model_path, vectors_path, vocab, oov_prob):
-    vocab_path = model_path / 'vocab'
-    lexemes_path = vocab_path / 'lexemes.bin'
-    strings_path = vocab_path / 'strings.json'
-    oov_path = vocab_path / 'oov_prob'
-
-    if not model_path.exists():
-        model_path.mkdir()
-    if not vocab_path.exists():
-        vocab_path.mkdir()
-    vocab.dump(path2str(lexemes_path))
-    with strings_path.open('w') as f:
-        vocab.strings.dump(f)
-    with oov_path.open('w') as f:
-        f.write('%f' % oov_prob)
-    if vectors_path:
-        vectors_dest = vocab_path / 'vec.bin'
-        write_binary_vectors(path2str(vectors_path), path2str(vectors_dest))
-
-
-def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200):
-    counts = PreshCounter()
-    total = 0
-    freqs_file = check_unzip(freqs_path)
-    for i, line in enumerate(freqs_file):
-        freq, doc_freq, key = line.rstrip().split('\t', 2)
-        freq = int(freq)
-        counts.inc(i+1, freq)
-        total += freq
-    counts.smooth()
-    log_total = math.log(total)
-    freqs_file = check_unzip(freqs_path)
-    probs = {}
-    for line in freqs_file:
-        freq, doc_freq, key = line.rstrip().split('\t', 2)
-        doc_freq = int(doc_freq)
-        freq = int(freq)
-        if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
-            word = literal_eval(key)
-            smooth_count = counts.smoother(int(freq))
-            probs[word] = math.log(smooth_count) - log_total
-    oov_prob = math.log(counts.smoother(0)) - log_total
-    return probs, oov_prob
-
-
-def read_clusters(clusters_path):
-    clusters = {}
-    with clusters_path.open() as f:
-        for line in f:
-            try:
-                cluster, word, freq = line.split()
-                word = fix_text(word)
-            except ValueError:
-                continue
-            # If the clusterer has only seen the word a few times, its
-            # cluster is unreliable.
-            if int(freq) >= 3:
-                clusters[word] = cluster
-            else:
-                clusters[word] = '0'
-    # Expand clusters with re-casing
-    for word, cluster in list(clusters.items()):
-        if word.lower() not in clusters:
-            clusters[word.lower()] = cluster
-        if word.title() not in clusters:
-            clusters[word.title()] = cluster
-        if word.upper() not in clusters:
-            clusters[word.upper()] = cluster
-    return clusters
-
-
-def populate_vocab(vocab, clusters, probs, oov_prob):
-    for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
-        lexeme = vocab[word]
-        lexeme.prob = prob
-        lexeme.is_oov = False
-        # Decode as a little-endian string, so that we can do & 15 to get
-        # the first 4 bits. See _parse_features.pyx
-        if word in clusters:
-            lexeme.cluster = int(clusters[word][::-1], 2)
-        else:
-            lexeme.cluster = 0
-
-
-def check_unzip(file_path):
-    file_path_str = path2str(file_path)
-    if file_path_str.endswith('gz'):
-        return gzip.open(file_path_str)
-    else:
-        return file_path.open()
@@ -1,6 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals
 
+import plac
 import shutil
 import requests
 from pathlib import Path
@@ -11,16 +12,26 @@ from .. import util
 from .. import about
 
 
-def package(input_dir, output_dir, meta_path, force):
+@plac.annotations(
+    input_dir=("directory with model data", "positional", None, str),
+    output_dir=("output parent directory", "positional", None, str),
+    meta=("path to meta.json", "option", "m", str),
+    force=("force overwriting of existing folder in output directory", "flag", "f", bool)
+)
+def package(input_dir, output_dir, meta, force):
+    """Generate Python package for model data, including meta and required
+    installation files. A new directory will be created in the specified
+    output directory, and model data will be copied over.
+    """
     input_path = util.ensure_path(input_dir)
     output_path = util.ensure_path(output_dir)
-    meta_path = util.ensure_path(meta_path)
+    meta_path = util.ensure_path(meta)
     if not input_path or not input_path.exists():
-        prints(input_path, title="Model directory not found", exits=True)
+        prints(input_path, title="Model directory not found", exits=1)
     if not output_path or not output_path.exists():
-        prints(output_path, title="Output directory not found", exits=True)
+        prints(output_path, title="Output directory not found", exits=1)
     if meta_path and not meta_path.exists():
-        prints(meta_path, title="meta.json not found", exits=True)
+        prints(meta_path, title="meta.json not found", exits=1)
 
     template_setup = get_template('setup.py')
     template_manifest = get_template('MANIFEST.in')
@@ -55,7 +66,7 @@ def create_dirs(package_path, force):
         else:
             prints(package_path, "Please delete the directory and try again, or "
                    "use the --force flag to overwrite existing directories.",
-                   title="Package directory already exists", exits=True)
+                   title="Package directory already exists", exits=1)
     Path.mkdir(package_path, parents=True)
@@ -87,12 +98,12 @@ def validate_meta(meta, keys):
     for key in keys:
         if key not in meta or meta[key] == '':
             prints("This setting is required to build your package.",
-                   title='No "%s" setting found in meta.json' % key, exits=True)
+                   title='No "%s" setting found in meta.json' % key, exits=1)
 
 
 def get_template(filepath):
     r = requests.get(about.__model_files__ + filepath)
     if r.status_code != 200:
         prints("Couldn't fetch template files from GitHub.",
-               title="Server error (%d)" % r.status_code, exits=True)
+               title="Server error (%d)" % r.status_code, exits=1)
     return r.text
@@ -1,6 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals, division, print_function
 
+import plac
 import json
 from collections import defaultdict
 import cytoolz
@@ -18,19 +19,33 @@ from .. import util
 from .. import displacy
 
 
-def train(lang_id, output_dir, train_data, dev_data, n_iter, n_sents,
+@plac.annotations(
+    lang=("model language", "positional", None, str),
+    output_dir=("output directory to store model in", "positional", None, str),
+    train_data=("location of JSON-formatted training data", "positional", None, str),
+    dev_data=("location of JSON-formatted development data (optional)", "positional", None, str),
+    n_iter=("number of iterations", "option", "n", int),
+    n_sents=("number of sentences", "option", "ns", int),
+    use_gpu=("Use GPU", "flag", "G", bool),
+    no_tagger=("Don't train tagger", "flag", "T", bool),
+    no_parser=("Don't train parser", "flag", "P", bool),
+    no_entities=("Don't train NER", "flag", "N", bool)
+)
+def train(lang, output_dir, train_data, dev_data, n_iter, n_sents,
           use_gpu, no_tagger, no_parser, no_entities):
+    """Train a model. Expects data in spaCy's JSON format."""
+    n_sents = n_sents or None
     output_path = util.ensure_path(output_dir)
     train_path = util.ensure_path(train_data)
     dev_path = util.ensure_path(dev_data)
     if not output_path.exists():
-        prints(output_path, title="Output directory not found", exits=True)
+        prints(output_path, title="Output directory not found", exits=1)
     if not train_path.exists():
-        prints(train_path, title="Training data not found", exits=True)
+        prints(train_path, title="Training data not found", exits=1)
     if dev_path and not dev_path.exists():
-        prints(dev_path, title="Development data not found", exits=True)
+        prints(dev_path, title="Development data not found", exits=1)
 
-    lang_class = util.get_lang_class(lang_id)
+    lang_class = util.get_lang_class(lang)
 
     pipeline = ['token_vectors', 'tags', 'dependencies', 'entities']
     if no_tagger and 'tags' in pipeline: pipeline.remove('tags')
@@ -142,9 +142,14 @@ def _min_edit_path(cand_words, gold_words):
 
 
 class GoldCorpus(object):
-    '''An annotated corpus, using the JSON file format. Manages
-    annotations for tagging, dependency parsing, NER.'''
+    """An annotated corpus, using the JSON file format. Manages
+    annotations for tagging, dependency parsing and NER."""
     def __init__(self, train_path, dev_path):
+        """Create a GoldCorpus.
+
+        train_path (unicode or Path): File or directory of training data.
+        dev_path (unicode or Path): File or directory of development data.
+        """
         self.train_path = util.ensure_path(train_path)
         self.dev_path = util.ensure_path(dev_path)
         self.train_locs = self.walk_corpus(self.train_path)
@@ -236,6 +236,12 @@ class Language(object):
             doc.tensor = None
 
     def preprocess_gold(self, docs_golds):
+        """Can be called before training to pre-process gold data. By default,
+        it handles nonprojectivity and adds missing tags to the tag map.
+
+        docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects.
+        YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects.
+        """
         for proc in self.pipeline:
             if hasattr(proc, 'preprocess_gold'):
                 docs_golds = proc.preprocess_gold(docs_golds)
@@ -380,13 +380,13 @@ def prints(*texts, **kwargs):
     *texts (unicode): Texts to print. Each argument is rendered as paragraph.
     **kwargs: 'title' becomes coloured headline. 'exits'=True performs sys exit.
     """
-    exits = kwargs.get('exits', False)
+    exits = kwargs.get('exits', None)
     title = kwargs.get('title', None)
     title = '\033[93m{}\033[0m\n'.format(_wrap(title)) if title else ''
     message = '\n\n'.join([_wrap(text) for text in texts])
     print('\n{}{}\n'.format(title, message))
-    if exits:
-        sys.exit(0)
+    if exits is not None:
+        sys.exit(exits)
 
 
 def _wrap(text, wrap_max=80, indent=4):
@@ -23,7 +23,8 @@
             "Lexeme": "lexeme",
             "Vocab": "vocab",
             "StringStore": "stringstore",
-            "GoldParse": "goldparse"
+            "GoldParse": "goldparse",
+            "GoldCorpus": "goldcorpus"
         },
         "Other": {
             "Annotation Specs": "annotation",
@@ -135,6 +136,11 @@
         "tag": "class"
     },
 
+    "goldcorpus": {
+        "title": "GoldCorpus",
+        "tag": "class"
+    },
+
     "annotation": {
         "title": "Annotation Specifications"
     },
@@ -5,16 +5,23 @@ include ../../_includes/_mixins
 p
     |  As of v1.7.0, spaCy comes with new command line helpers to download and
     |  link models and show useful debugging information. For a list of available
-    |  commands, type #[code python -m spacy --help].
+    |  commands, type #[code python -m spacy]. To make the command even more
+    |  convenient, we recommend
+    |  #[+a("https://askubuntu.com/questions/17536/how-do-i-create-a-permanent-bash-alias/17537#17537") creating an alias]
+    |  mapping #[code python -m spacy] to #[code spacy].
 
 +aside("Why python -m?")
     |  The problem with a global entry point is that it's resolved by looking up
     |  entries in your #[code PATH] environment variable. This can give you
-    |  unexpected results, like executing the wrong spaCy installation
-    |  (especially when using #[code virtualenv]). #[code python -m] prevents
-    |  fallbacks to system modules and makes sure the correct spaCy version is
-    |  used. If you hate typing it every time, we recommend creating an
-    |  #[code alias] instead.
+    |  unexpected results, like executing the wrong spaCy installation.
+    |  #[code python -m] prevents fallbacks to system modules.
+
++infobox("⚠️ Deprecation note")
+    |  As of spaCy 2.0, the #[code model] command to initialise a model data
+    |  directory is deprecated. The command was only necessary because previous
+    |  versions of spaCy expected a model directory to already be set up. This
+    |  has since been changed, so you can use the #[+api("cli#train") #[code train]]
+    |  command straight away.
 
 +h(2, "download") Download
@@ -45,7 +52,7 @@ p
         +cell flag
         +cell Show help message and available arguments.
 
-+infobox("Important note")
++aside("Downloading best practices")
     |  The #[code download] command is mostly intended as a convenient,
     |  interactive wrapper – it performs compatibility checks and prints
     |  detailed messages in case things go wrong. It's #[strong not recommended]
@@ -116,7 +123,6 @@ p
         +cell Show help message and available arguments.
 
 +h(2, "convert") Convert
-    +tag experimental
 
 p
     |  Convert files into spaCy's #[+a("/docs/api/annotation#json-input") JSON format]
@@ -153,49 +159,7 @@ p
         +cell flag
         +cell Show help message and available arguments.
 
-+h(2, "model") Model
-    +tag experimental
-
-p
-    |  Initialise a new model and its data directory. For more info on this, see
-    |  the documentation on #[+a("/docs/usage/adding-languages") adding languages].
-
-+code(false, "bash").
-    python -m spacy model [lang] [model_dir] [freqs_data] [clusters_data] [vectors_data]
-
-+table(["Argument", "Type", "Description"])
-    +row
-        +cell #[code lang]
-        +cell positional
-        +cell Model language.
-
-    +row
-        +cell #[code model_dir]
-        +cell positional
-        +cell Output directory to store the model in.
-
-    +row
-        +cell #[code freqs_data]
-        +cell positional
-        +cell Tab-separated frequencies file.
-
-    +row
-        +cell #[code clusters_data]
-        +cell positional
-        +cell Brown custers file (optional).
-
-    +row
-        +cell #[code vectors_data]
-        +cell positional
-        +cell Word vectors file (optional).
-
-    +row
-        +cell #[code --help], #[code -h]
-        +cell flag
-        +cell Show help message and available arguments.
-
 +h(2, "train") Train
-    +tag experimental
 
 p
     |  Train a model. Expects data in spaCy's
@@ -231,7 +195,7 @@ p
         +cell Number of iterations (default: #[code 15]).
 
     +row
-        +cell #[code --nsents]
+        +cell #[code --n_sents], #[code -ns]
         +cell option
         +cell Number of sentences (default: #[code 0]).
@@ -241,7 +205,7 @@ p
         +cell L1 regularization penalty for parser (default: #[code 0.0]).
 
     +row
-        +cell #[code --use-gpu], #[code -g]
+        +cell #[code --use-gpu], #[code -G]
         +cell flag
         +cell Use GPU.
@@ -266,17 +230,16 @@ p
         +cell Show help message and available arguments.
 
 +h(2, "package") Package
-    +tag experimental
 
 p
     |  Generate a #[+a("/docs/usage/saving-loading#generating") model Python package]
     |  from an existing model data directory. All data files are copied over.
     |  If the path to a meta.json is supplied, or a meta.json is found in the
     |  input directory, this file is used. Otherwise, the data can be entered
-    |  directly from the command line. While this feature is still experimental,
-    |  the required file templates are downloaded from
-    |  #[+src(gh("spacy-dev-resources", "templates/model")) GitHub]. This means
-    |  you need to be connected to the internet to use this command.
+    |  directly from the command line. The required file templates are downloaded
+    |  from #[+src(gh("spacy-dev-resources", "templates/model")) GitHub] to make
+    |  sure you're always using the latest versions. This means you need to be
+    |  connected to the internet to use this command.
 
 +code(false, "bash").
     python -m spacy package [input_dir] [output_dir] [--meta] [--force]
website/docs/api/goldcorpus.jade (new file, 23 lines)
@@ -0,0 +1,23 @@
+//- 💫 DOCS > API > GOLDCORPUS
+
+include ../../_includes/_mixins
+
+p
+    |  An annotated corpus, using the JSON file format. Manages annotations for
+    |  tagging, dependency parsing and NER.
+
++h(2, "init") GoldCorpus.__init__
+    +tag method
+
+p Create a #[code GoldCorpus].
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code train_path]
+        +cell unicode or #[code Path]
+        +cell File or directory of training data.
+
+    +row
+        +cell #[code dev_path]
+        +cell unicode or #[code Path]
+        +cell File or directory of development data.
@@ -7,7 +7,7 @@ p Collection for training annotations.
 +h(2, "init") GoldParse.__init__
     +tag method
 
-p Create a GoldParse.
+p Create a #[code GoldParse].
 
 +table(["Name", "Type", "Description"])
     +row
@@ -82,6 +82,41 @@ p
         +cell #[code Doc]
         +cell A container for accessing the annotations.
 
++h(2, "pipe") Language.pipe
+    +tag method
+
+p
+    |  Process texts as a stream, and yield #[code Doc] objects in order.
+    |  Supports GIL-free multi-threading.
+
++aside-code("Example").
+    texts = [u'One document.', u'...', u'Lots of documents']
+    for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
+        assert doc.is_parsed
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code texts]
+        +cell -
+        +cell A sequence of unicode objects.
+
+    +row
+        +cell #[code n_threads]
+        +cell int
+        +cell
+            |  The number of worker threads to use. If #[code -1], OpenMP will
+            |  decide how many to use at run time. Default is #[code 2].
+
+    +row
+        +cell #[code batch_size]
+        +cell int
+        +cell The number of texts to buffer.
+
+    +footrow
+        +cell yields
+        +cell #[code Doc]
+        +cell Documents in the order of the original text.
+
 +h(2, "update") Language.update
     +tag method
@@ -172,40 +207,23 @@ p
         +cell -
         +cell Config parameters.
 
-+h(2, "pipe") Language.pipe
-    +tag method
++h(2, "preprocess_gold") Language.preprocess_gold
 
 p
-    |  Process texts as a stream, and yield #[code Doc] objects in order.
-    |  Supports GIL-free multi-threading.
+    |  Can be called before training to pre-process gold data. By default, it
+    |  handles nonprojectivity and adds missing tags to the tag map.
 
-+aside-code("Example").
-    texts = [u'One document.', u'...', u'Lots of documents']
-    for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
-        assert doc.is_parsed
-
 +table(["Name", "Type", "Description"])
     +row
-        +cell #[code texts]
-        +cell -
-        +cell A sequence of unicode objects.
-
-    +row
-        +cell #[code n_threads]
-        +cell int
-        +cell
-            |  The number of worker threads to use. If #[code -1], OpenMP will
-            |  decide how many to use at run time. Default is #[code 2].
-
-    +row
-        +cell #[code batch_size]
-        +cell int
-        +cell The number of texts to buffer.
+        +cell #[code docs_golds]
+        +cell iterable
+        +cell Tuples of #[code Doc] and #[code GoldParse] objects.
 
     +footrow
         +cell yields
-        +cell #[code Doc]
-        +cell Documents in the order of the original text.
+        +cell tuple
+        +cell Tuples of #[code Doc] and #[code GoldParse] objects.
 
 +h(2, "to_disk") Language.to_disk
     +tag method