Merge remote-tracking branch 'origin/develop'

This commit is contained in:
Matthew Honnibal 2017-04-07 17:20:09 +02:00
commit 4a6204dbad
8 changed files with 248 additions and 16 deletions

View File

@ -10,12 +10,13 @@ from spacy.cli import info as cli_info
from spacy.cli import package as cli_package
from spacy.cli import train as cli_train
from spacy.cli import model as cli_model
from spacy.cli import convert as cli_convert
class CLI(object):
"""Command-line interface for spaCy"""
commands = ('download', 'link', 'info', 'package', 'train', 'model')
commands = ('download', 'link', 'info', 'package', 'train', 'model', 'convert')
@plac.annotations(
model=("model to download (shortcut or model name)", "positional", None, str),
@ -110,6 +111,20 @@ class CLI(object):
cli_model(lang, model_dir, freqs_data, clusters_data, vectors_data)
@plac.annotations(
    input_file=("input file", "positional", None, str),
    output_dir=("output directory for converted file", "positional", None, str),
    # n_sents is a whole-number count of sentences per output doc; the
    # original annotation coerced it with float, yielding fractional counts.
    n_sents=("Number of sentences per doc", "option", "n", int),
    morphology=("Enable appending morphology to tags", "flag", "m", bool)
)
def convert(self, input_file, output_dir, n_sents=10, morphology=False):
    """
    Convert files into JSON format for use with train command and other
    experiment management functions.
    """
    cli_convert(input_file, output_dir, n_sents, morphology)
def __missing__(self, name):
print("\n Command %r does not exist."
@ -119,6 +134,5 @@ class CLI(object):
if __name__ == '__main__':
import plac
import sys
cli = CLI()
sys.argv[0] = 'spacy'
plac.Interpreter.call(CLI)

View File

@ -4,3 +4,4 @@ from .link import link
from .package import package
from .train import train, train_config
from .model import model
from .convert import convert

36
spacy/cli/convert.py Normal file
View File

@ -0,0 +1,36 @@
# coding: utf8
from __future__ import unicode_literals, division, print_function
import io
from pathlib import Path, PurePosixPath
from .converters import conllu2json
from .. import util
# Converters are matched by file extension. To add a converter, add a new entry
# to this dict with the file extension mapped to the converter function imported
# from /converters.
# NOTE: keys keep the leading dot so they compare equal to pathlib's
# Path.suffix (see convert() below).
CONVERTERS = {
    '.conllu': conllu2json
}
def convert(input_file, output_dir, *args):
    """Dispatch *input_file* to the converter registered for its extension.

    Extra positional *args are forwarded untouched to the converter.
    """
    source = Path(input_file)
    destination = Path(output_dir)
    check_dirs(source, destination)
    converter = CONVERTERS.get(source.suffix)
    if converter is not None:
        converter(source, destination, *args)
    else:
        util.sys_exit("Can't find converter for {}".format(source.parts[-1]),
                      title="Unknown format")
def check_dirs(input_file, output_path):
    """Bail out with an error message when either path is missing on disk."""
    checks = (
        (input_file, "Input file not found"),
        (output_path, "Output directory not found"),
    )
    for path, title in checks:
        if not path.exists():
            util.sys_exit(path.as_posix(), title=title)

View File

@ -0,0 +1 @@
from .conllu2json import conllu2json

View File

@ -0,0 +1,90 @@
# coding: utf8
from __future__ import unicode_literals, division, print_function
import json
from ...gold import read_json_file, merge_sents
from ... import util
def conllu2json(input_path, output_path, n_sents=10, use_morphology=False):
    """Convert conllu files into JSON format for use with train cli.

    input_path: Path to a .conllu input file.
    output_path: Path to an existing output directory.
    n_sents: number of sentences grouped into each output document.
    use_morphology: enables appending morphology to tags, which is
        useful for languages such as Spanish, where UD tags are not so rich.
    """
    # by @dvsrepo, via #11 explosion/spacy-dev-resources
    docs = []
    sentences = []
    conll_tuples = read_conllx(input_path, use_morphology=use_morphology)
    i = -1
    for i, (raw_text, tokens) in enumerate(conll_tuples):
        sentence, brackets = tokens[0]
        sentences.append(generate_sentence(sentence))
        # Real-sized documents could be extracted using the comments on the
        # conllu document
        if len(sentences) % n_sents == 0:
            doc = create_doc(sentences, i)
            docs.append(doc)
            sentences = []
    # Flush the remainder: the original silently dropped trailing sentences
    # when the total was not a multiple of n_sents.
    if sentences:
        docs.append(create_doc(sentences, i))
    output_filename = input_path.parts[-1].replace(".conllu", ".json")
    output_file = output_path / output_filename
    # Context manager closes the handle even if serialisation fails
    # (the original leaked the open file object).
    with output_file.open('w', encoding='utf-8') as file_:
        json.dump(docs, file_, indent=2)
    util.print_msg("Created {} documents".format(len(docs)),
                   title="Generated output file {}".format(output_file))
def read_conllx(input_path, use_morphology=False, n=0):
    """Generator over a .conllu file, yielding one (raw_text, sents) pair
    per sentence, where raw_text is always None and sents is
    [[column_tuples, brackets]] in the shape the gold-parse code expects.

    input_path: Path to the .conllu file.
    use_morphology: append the morphology column to the POS tag as
        "POS__MORPH".
    n: stop after n sentences; 0 means no limit.
    """
    # Read inside a context manager so the handle is closed promptly
    # (the original left the file object open).
    with input_path.open('r', encoding='utf-8') as file_:
        text = file_.read()
    i = 0
    for sent in text.strip().split('\n\n'):
        lines = sent.strip().split('\n')
        if lines:
            # Drop leading comment lines (e.g. "# sent_id = ...").  The
            # "lines and" guard prevents an IndexError on an all-comment
            # block, which the original would crash on.
            while lines and lines[0].startswith('#'):
                lines.pop(0)
            tokens = []
            for line in lines:
                id_, word, lemma, pos, tag, morph, head, dep, _1, \
                    _2 = line.split('\t')
                # Skip multi-word tokens ("1-2") and empty nodes ("1.1").
                if '-' in id_ or '.' in id_:
                    continue
                try:
                    # Convert to 0-based ids; a head of '0' (root) points
                    # at the token itself.
                    id_ = int(id_) - 1
                    head = (int(head) - 1) if head != '0' else id_
                    dep = 'ROOT' if dep == 'root' else dep
                    tag = pos + '__' + morph if use_morphology else pos
                    tokens.append((id_, word, tag, head, dep, 'O'))
                except Exception:
                    # Narrowed from a bare except so Ctrl-C still works;
                    # show the offending line, then re-raise.
                    print(line)
                    raise
            # Transpose rows into parallel columns.
            tuples = [list(t) for t in zip(*tokens)]
            yield (None, [[tuples, []]])
            i += 1
            if n >= 1 and i >= n:
                break
def generate_sentence(sent):
    """Turn parallel column lists into a {"tokens": [...]} sentence dict.

    Heads are rewritten from absolute token ids to offsets relative to the
    token's position.
    """
    (id_, word, tag, head, dep, _) = sent
    return {
        "tokens": [
            {
                "orth": word[token_id],
                "tag": tag[token_id],
                "head": head[token_id] - position,
                "dep": dep[token_id],
            }
            for position, token_id in enumerate(id_)
        ]
    }
def create_doc(sentences, id):
    """Wrap a batch of sentence dicts in a single-paragraph document dict."""
    return {"id": id, "paragraphs": [{"sentences": sentences}]}

View File

@ -1,7 +1,6 @@
# coding: utf8
from __future__ import unicode_literals, division, print_function
import json
from pathlib import Path

View File

@ -104,35 +104,77 @@ p
+cell flag
+cell Show help message and available arguments.
+h(2, "package") Package
+h(2, "convert") Convert
+tag experimental
p
| Generate a #[+a("/docs/usage/models#own-models") model Python package]
| from an existing model data directory. All data files are copied over,
| and the meta data can be entered directly from the command line. While
| this feature is still experimental, the required file templates are
| downloaded from #[+src(gh("spacy-dev-resources", "templates/model")) GitHub].
| This means you need to be connected to the internet to use this command.
| Convert files into spaCy's #[+a("/docs/api/annotation#json-input") JSON format]
| for use with the #[code train] command and other experiment management
| functions. The right converter is chosen based on the file extension of
| the input file. Currently only supports #[code .conllu].
+code(false, "bash").
python -m spacy package [input_dir] [output_dir] [--force]
python -m spacy convert [input_file] [output_dir] [--n_sents] [--morphology]
+table(["Argument", "Type", "Description"])
+row
+cell #[code input_dir]
+cell #[code input_file]
+cell positional
+cell Path to directory containing model data.
+cell Input file.
+row
+cell #[code output_dir]
+cell positional
+cell Directory to create package folder in.
+cell Output directory for converted JSON file.
+row
+cell #[code --force], #[code -f]
+cell #[code --n_sents], #[code -n]
+cell option
+cell Number of sentences per document.
+row
+cell #[code --morphology], #[code -m]
+cell option
+cell Enable appending morphology to tags.
+row
+cell #[code --help], #[code -h]
+cell flag
+cell Force overwriting of existing folder in output directory.
+cell Show help message and available arguments.
+h(2, "model") Model
+tag experimental
p Initialise a new model and its data directory.
+code(false, "bash").
python -m spacy model [lang] [model_dir] [freqs_data] [clusters_data] [vectors_data]
+table(["Argument", "Type", "Description"])
+row
+cell #[code lang]
+cell positional
+cell Model language.
+row
+cell #[code model_dir]
+cell positional
+cell Output directory to store the model in.
+row
+cell #[code freqs_data]
+cell positional
+cell Tab-separated frequencies file.
+row
+cell #[code clusters_data]
+cell positional
+cell Brown clusters file (optional).
+row
+cell #[code vectors_data]
+cell positional
+cell Word vectors file (optional).
+row
+cell #[code --help], #[code -h]
@ -199,3 +241,38 @@ p
+cell #[code --help], #[code -h]
+cell flag
+cell Show help message and available arguments.
+h(2, "package") Package
+tag experimental
p
| Generate a #[+a("/docs/usage/models#own-models") model Python package]
| from an existing model data directory. All data files are copied over,
| and the meta data can be entered directly from the command line. While
| this feature is still experimental, the required file templates are
| downloaded from #[+src(gh("spacy-dev-resources", "templates/model")) GitHub].
| This means you need to be connected to the internet to use this command.
+code(false, "bash").
python -m spacy package [input_dir] [output_dir] [--force]
+table(["Argument", "Type", "Description"])
+row
+cell #[code input_dir]
+cell positional
+cell Path to directory containing model data.
+row
+cell #[code output_dir]
+cell positional
+cell Directory to create package folder in.
+row
+cell #[code --force], #[code -f]
+cell flag
+cell Force overwriting of existing folder in output directory.
+row
+cell #[code --help], #[code -h]
+cell flag
+cell Show help message and available arguments.

View File

@ -137,6 +137,20 @@ p
| #[code python -m spacy download en]. For more info on this, see the
| #[+a("/docs/usage/cli") CLI documentation].
+h(3, "module-load") 'module' object has no attribute 'load'
+code(false, "text").
AttributeError: 'module' object has no attribute 'load'
p
| While this could technically have many causes, including spaCy being
| broken, the most likely one is that your script's file or directory name
| is "shadowing" the module e.g. your file is called #[code spacy.py],
| or a directory you're importing from is called #[code spacy].
+infobox("Solution")
| When using spaCy, never call anything else #[code spacy].
+h(2, "usage") Using spaCy
+h(3, "pos-lemma-number") POS tag or lemma is returned as number