Merge remote-tracking branch 'origin/develop'

2025-11-10 12:58:01 +03:00 · 2017-04-07 17:20:09 +02:00 · 2017-04-07 17:20:09 +02:00 · 4a6204dbad
commit 4a6204dbad
parent 0513c43bf0 1f501af602
8 changed files with 248 additions and 16 deletions
--- a/spacy/main.py
+++ b/spacy/main.py
@ -10,12 +10,13 @@ from spacy.cli import info as cli_info
 from spacy.cli import package as cli_package
 from spacy.cli import train as cli_train
 from spacy.cli import model as cli_model
 from spacy.cli import convert as cli_convert
 class CLI(object):
    """Command-line interface for spaCy"""
-    commands = ('download', 'link', 'info', 'package', 'train', 'model')
+    commands = ('download', 'link', 'info', 'package', 'train', 'model', 'convert')
    @plac.annotations(
        model=("model to download (shortcut or model name)", "positional", None, str),
@ -110,6 +111,20 @@ class CLI(object):
        cli_model(lang, model_dir, freqs_data, clusters_data, vectors_data)
    @plac.annotations(
        input_file=("input file", "positional", None, str),
        output_dir=("output directory for converted file", "positional", None, str),
        n_sents=("Number of sentences per doc", "option", "n", int),
        morphology=("Enable appending morphology to tags", "flag", "m", bool)
    )
    def convert(self, input_file, output_dir, n_sents=10, morphology=False):
        """
        Convert files into JSON format for use with train command and other
        experiment management functions.

        input_file: path of the file to convert.
        output_dir: directory the converted file is written to.
        n_sents: number of sentences per doc. It is a count, so it is parsed
            from the command line as an integer.
        morphology: enable appending morphology to tags.
        """
        # All arguments are forwarded positionally to the converter entry point.
        cli_convert(input_file, output_dir, n_sents, morphology)
    def __missing__(self, name):
        print("\n   Command %r does not exist."
@ -119,6 +134,5 @@ class CLI(object):
 # Script entry point: runs only when this module is executed directly.
 if __name__ == '__main__':
    import plac
    import sys
    # NOTE(review): `cli` is not referenced again in this block; the CLI class
    # itself (not this instance) is what is handed to plac below - confirm
    # whether the instance is still needed.
    cli = CLI()
    # Overwrite the program name in argv; presumably so that usage/help output
    # shows 'spacy' rather than the script path - TODO confirm.
    sys.argv[0] = 'spacy'
    plac.Interpreter.call(CLI)
--- a/spacy/cli/init.py
+++ b/spacy/cli/init.py
@ -4,3 +4,4 @@ from .link import link
 from .package import package
 from .train import train, train_config
 from .model import model
 from .convert import convert
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@ -0,0 +1,36 @@
 # coding: utf8
 from __future__ import unicode_literals, division, print_function
 import io
 from pathlib import Path, PurePosixPath
 from .converters import conllu2json
 from .. import util
 # Registry of converters, matched by file extension. Keys are compared against
 # the input path's suffix, so they include the leading dot. To add a converter,
 # add a new entry to this dict with the file extension mapped to the converter
 # function imported from /converters.
 CONVERTERS = {
    '.conllu': conllu2json
 }
 def convert(input_file, output_dir, *args):
    """Convert input_file with the converter registered for its extension.

    input_file: path of the file to convert.
    output_dir: path of the directory handed to the converter for its output.
    *args: extra positional arguments passed through to the converter.

    Both paths are validated with check_dirs first. If no converter is
    registered for the file's suffix, the failure is reported via
    util.sys_exit and no conversion is attempted.
    """
    source = Path(input_file)
    target_dir = Path(output_dir)
    check_dirs(source, target_dir)
    extension = source.suffix
    if extension not in CONVERTERS:
        message = "Can't find converter for {}".format(source.parts[-1])
        util.sys_exit(message, title="Unknown format")
        return
    converter = CONVERTERS[extension]
    converter(source, target_dir, *args)
 def check_dirs(input_file, output_path):
    """Report a missing input file or output directory via util.sys_exit.

    input_file: path object of the file to convert.
    output_path: path object of the output directory.

    The input file is checked first, then the output directory. Nothing is
    returned when both exist.
    """
    checks = (
        (input_file, "Input file not found"),
        (output_path, "Output directory not found"),
    )
    for path, title in checks:
        if not path.exists():
            util.sys_exit(path.as_posix(), title=title)
--- a/spacy/cli/converters/init.py
+++ b/spacy/cli/converters/init.py
@ -0,0 +1 @@
 from .conllu2json import conllu2json
--- a/spacy/cli/converters/conllu2json.py
+++ b/spacy/cli/converters/conllu2json.py
@ -0,0 +1,90 @@
 # coding: utf8
 from __future__ import unicode_literals, division, print_function
 import json
 from ...gold import read_json_file, merge_sents
 from ... import util
 def conllu2json(input_path, output_path, n_sents=10, use_morphology=False):
    """Convert conllu files into JSON format for use with train cli.
    use_morphology parameter enables appending morphology to tags, which is
    useful for languages such as Spanish, where UD tags are not so rich.

    input_path: path object of the .conllu file to read.
    output_path: path object of the directory the JSON file is written to.
        The file name is the input file name with ".conllu" replaced by
        ".json".
    n_sents: number of sentences grouped into each document. Sentences left
        over at the end of the input form a final, shorter document instead
        of being discarded.
    use_morphology: passed on to read_conllx.
    """
    # by @dvsrepo, via #11 explosion/spacy-dev-resources
    docs = []
    sentences = []
    last_index = 0
    conll_tuples = read_conllx(input_path, use_morphology=use_morphology)
    for i, (raw_text, tokens) in enumerate(conll_tuples):
        sentence, brackets = tokens[0]
        sentences.append(generate_sentence(sentence))
        last_index = i
        # Real-sized documents could be extracted using the comments on the
        # conllu document
        if len(sentences) % n_sents == 0:
            docs.append(create_doc(sentences, i))
            sentences = []
    # Flush the trailing sentences that did not fill a complete document. As
    # for full documents, the id is the index of the last sentence it holds.
    if sentences:
        docs.append(create_doc(sentences, last_index))

    output_filename = input_path.parts[-1].replace(".conllu", ".json")
    output_file = output_path / output_filename
    # Use a context manager so the handle is flushed and closed before the
    # success message is printed.
    with output_file.open('w', encoding='utf-8') as file_:
        json.dump(docs, file_, indent=2)
    util.print_msg("Created {} documents".format(len(docs)),
                   title="Generated output file {}".format(output_file))
 def read_conllx(input_path, use_morphology=False, n=0):
    """Read a CoNLL-U file and yield one entry per sentence.

    input_path: path object of the file to read (opened as UTF-8).
    use_morphology: if True, each tag is pos + '__' + morph, otherwise just
        the pos column.
    n: maximum number of sentences to yield; values below 1 mean no limit.

    Sentences are blocks separated by a blank line. Leading '#' comment lines
    of a block are dropped, and blocks with no content left (empty file,
    extra blank lines, comment-only blocks) are skipped. Each line is split on
    tabs into ten columns (id, word, lemma, pos, tag, morph, head, dep and two
    unused ones). Lines whose id contains '-' or '.' are ignored.

    Yields tuples of the form (None, [[tuples, []]]), where tuples is a list
    of six parallel lists: zero-based ids, words, tags, heads, deps and a
    constant 'O' per token. Heads are zero-based token indices; a token whose
    head column is '0' points to itself, and the dep 'root' becomes 'ROOT'.

    Raises ValueError for a line that does not have exactly ten columns or
    whose id/head is not an integer; the offending line is printed first.
    """
    # Read the whole file up front so the handle is closed before the first
    # sentence is yielded, however far the caller consumes the generator.
    with input_path.open('r', encoding='utf-8') as file_:
        text = file_.read()
    i = 0
    for sent in text.strip().split('\n\n'):
        lines = sent.strip().split('\n')
        # Guard on `lines` so a comment-only block cannot index an empty list.
        while lines and lines[0].startswith('#'):
            lines.pop(0)
        # ''.split('\n') is [''], so check the content rather than the length.
        if not any(line.strip() for line in lines):
            continue
        tokens = []
        for line in lines:
            try:
                id_, word, lemma, pos, tag, morph, head, dep, _1, \
                _2 = line.split('\t')
                if '-' in id_ or '.' in id_:
                    continue
                id_ = int(id_) - 1
                head = (int(head) - 1) if head != '0' else id_
                dep = 'ROOT' if dep == 'root' else dep
                tag = pos + '__' + morph if use_morphology else pos
                tokens.append((id_, word, tag, head, dep, 'O'))
            except Exception:
                # Show the offending line, then let the error propagate.
                print(line)
                raise
        # Transpose per-token tuples into per-column lists.
        tuples = [list(t) for t in zip(*tokens)]
        yield (None, [[tuples, []]])
        i += 1
        if n >= 1 and i >= n:
            break
 def generate_sentence(sent):
    """Build a sentence dict from six parallel per-token lists.

    sent: sequence of exactly six lists (ids, words, tags, heads, deps and one
        ignored list). The id values are used as indices into the other lists.

    Returns {"tokens": [...]} with one dict per id, holding "orth", "tag",
    "head" and "dep". "head" is the head value minus the token's position in
    the id list, i.e. an offset relative to the token.
    """
    ids, words, tags, heads, deps, _ = sent
    tokens = [
        {
            "orth": words[token_id],
            "tag": tags[token_id],
            "head": heads[token_id] - position,
            "dep": deps[token_id],
        }
        for position, token_id in enumerate(ids)
    ]
    return {"tokens": tokens}
 def create_doc(sentences, id):
    """Wrap sentences in a document dict with a single paragraph.

    sentences: list of sentence dicts; stored as-is, not copied.
    id: value stored under the document's "id" key.

    Returns {"id": id, "paragraphs": [{"sentences": sentences}]}.
    """
    paragraph = {"sentences": sentences}
    return {"id": id, "paragraphs": [paragraph]}
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals, division, print_function
 import json
 from pathlib import Path
--- a/website/docs/usage/cli.jade
+++ b/website/docs/usage/cli.jade
@ -104,35 +104,77 @@ p
        +cell flag
        +cell Show help message and available arguments.
-+h(2, "package") Package
+h(2, "convert") Convert
    +tag experimental
 p
-    |  Generate a #[+a("/docs/usage/models#own-models") model Python package]
+    |  Convert files into spaCy's #[+a("/docs/api/annotation#json-input") JSON format]
-    |  from an existing model data directory. All data files are copied over,
+    |  for use with the #[code train] command and other experiment management
-    |  and the meta data can be entered directly from the command line. While
+    |  functions. The right converter is chosen based on the file extension of
-    |  this feature is still experimental, the required file templates are
+    |  the input file. Currently only supports #[code .conllu].
    |  downloaded from #[+src(gh("spacy-dev-resources", "templates/model")) GitHub].
    |  This means you need to be connected to the internet to use this command.
 +code(false, "bash").
-    python -m spacy package [input_dir] [output_dir] [--force]
+    python -m spacy convert [input_file] [output_dir] [--n_sents] [--morphology]
 +table(["Argument", "Type", "Description"])
    +row
-        +cell #[code input_dir]
+        +cell #[code input_file]
        +cell positional
-        +cell Path to directory containing model data.
+        +cell Input file.
    +row
        +cell #[code output_dir]
        +cell positional
-        +cell Directory to create package folder in.
+        +cell Output directory for converted JSON file.
    +row
-        +cell #[code --force], #[code -f]
+        +cell #[code --n_sents], #[code -n]
        +cell option
        +cell Number of sentences per document.
    +row
        +cell #[code --morphology], #[code -m]
        +cell flag
        +cell Enable appending morphology to tags.
    +row
        +cell #[code --help], #[code -h]
        +cell flag
-        +cell Force overwriting of existing folder in output directory.
+        +cell Show help message and available arguments.
 +h(2, "model") Model
    +tag experimental
 p Initialise a new model and its data directory.
 +code(false, "bash").
    python -m spacy model [lang] [model_dir] [freqs_data] [clusters_data] [vectors_data]
 +table(["Argument", "Type", "Description"])
    +row
        +cell #[code lang]
        +cell positional
        +cell Model language.
    +row
        +cell #[code model_dir]
        +cell positional
        +cell Output directory to store the model in.
    +row
        +cell #[code freqs_data]
        +cell positional
        +cell Tab-separated frequencies file.
    +row
        +cell #[code clusters_data]
        +cell positional
        +cell Brown clusters file (optional).
    +row
        +cell #[code vectors_data]
        +cell positional
        +cell Word vectors file (optional).
    +row
        +cell #[code --help], #[code -h]
@ -199,3 +241,38 @@ p
        +cell #[code --help], #[code -h]
        +cell flag
        +cell Show help message and available arguments.
 +h(2, "package") Package
    +tag experimental
 p
    |  Generate a #[+a("/docs/usage/models#own-models") model Python package]
    |  from an existing model data directory. All data files are copied over,
    |  and the meta data can be entered directly from the command line. While
    |  this feature is still experimental, the required file templates are
    |  downloaded from #[+src(gh("spacy-dev-resources", "templates/model")) GitHub].
    |  This means you need to be connected to the internet to use this command.
 +code(false, "bash").
    python -m spacy package [input_dir] [output_dir] [--force]
 +table(["Argument", "Type", "Description"])
    +row
        +cell #[code input_dir]
        +cell positional
        +cell Path to directory containing model data.
    +row
        +cell #[code output_dir]
        +cell positional
        +cell Directory to create package folder in.
    +row
        +cell #[code --force], #[code -f]
        +cell flag
        +cell Force overwriting of existing folder in output directory.
    +row
        +cell #[code --help], #[code -h]
        +cell flag
        +cell Show help message and available arguments.
--- a/website/docs/usage/troubleshooting.jade
+++ b/website/docs/usage/troubleshooting.jade
@ -137,6 +137,20 @@ p
    |  #[code python -m spacy download en]. For more info on this, see the
    |  #[+a("/docs/usage/cli") CLI documentation].
 +h(3, "module-load") 'module' object has no attribute 'load'
 +code(false, "text").
    AttributeError: 'module' object has no attribute 'load'
 p
    |  While this could technically have many causes, including spaCy being
    |  broken, the most likely one is that your script's file or directory name
    |  is "shadowing" the module – e.g. your file is called #[code spacy.py],
    |  or a directory you're importing from is called #[code spacy].
 +infobox("Solution")
    |  When using spaCy, never call anything else #[code spacy].
 +h(2, "usage") Using spaCy
 +h(3, "pos-lemma-number") POS tag or lemma is returned as number