@plac.annotations(
    input_file=("input file", "positional", None, str),
    output_dir=("output directory for converted file", "positional", None, str),
    # A sentence count is a whole number: parse it as int, not float, so
    # that values such as "2.5" are rejected at the command line instead of
    # silently producing odd document sizes further down.
    n_sents=("Number of sentences per doc", "option", "n", int),
    morphology=("Enable appending morphology to tags", "flag", "m", bool)
)
def convert(self, input_file, output_dir, n_sents=10, morphology=False):
    """
    Convert files into JSON format for use with train command and other
    experiment management functions.

    input_file (str): Path of the file to convert.
    output_dir (str): Directory the converted file is written to.
    n_sents (int): Number of sentences grouped into one document.
    morphology (bool): Append morphology to the tags.
    """
    cli_convert(input_file, output_dir, n_sents, morphology)
@@ -119,6 +134,5 @@ class CLI(object): if __name__ == '__main__': import plac import sys - cli = CLI() sys.argv[0] = 'spacy' plac.Interpreter.call(CLI) diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index b97279dec..d529096ef 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -4,3 +4,4 @@ from .link import link from .package import package from .train import train, train_config from .model import model +from .convert import convert diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py new file mode 100644 index 000000000..d9a08b385 --- /dev/null +++ b/spacy/cli/convert.py @@ -0,0 +1,36 @@ +# coding: utf8 +from __future__ import unicode_literals, division, print_function + +import io +from pathlib import Path, PurePosixPath + +from .converters import conllu2json +from .. import util + + +# Converters are matched by file extension. To add a converter, add a new entry +# to this dict with the file extension mapped to the converter function imported +# from /converters. 
# ---- spacy/cli/converters/conllu2json.py -----------------------------------

def conllu2json(input_path, output_path, n_sents=10, use_morphology=False):
    """Convert conllu files into JSON format for use with train cli.

    use_morphology parameter enables appending morphology to tags, which is
    useful for languages such as Spanish, where UD tags are not so rich.

    input_path (Path): The .conllu file to read.
    output_path (Path): Directory the .json file is written to.
    n_sents (int): Number of sentences grouped into one document. Values
        below 1 are treated as 1.
    use_morphology (bool): Append the morphology column to each tag.
    """
    # by @dvsrepo, via #11 explosion/spacy-dev-resources

    # The CLI may hand over a float; a modulus of 0 would raise.
    n_sents = max(1, int(n_sents))
    docs = []
    sentences = []
    conll_tuples = read_conllx(input_path, use_morphology=use_morphology)

    for i, (raw_text, tokens) in enumerate(conll_tuples):
        sentence, brackets = tokens[0]
        sentences.append(generate_sentence(sentence))
        # Real-sized documents could be extracted using the comments on the
        # conllu document
        if len(sentences) % n_sents == 0:
            docs.append(create_doc(sentences, i))
            sentences = []
    # Flush the remainder: without this, up to n_sents - 1 sentences at the
    # end of the file would be silently dropped.
    if sentences:
        docs.append(create_doc(sentences, i))

    # Swap only the real suffix; str.replace would also rewrite ".conllu"
    # occurring in the middle of the file name.
    output_file = output_path / input_path.with_suffix('.json').name
    with output_file.open('w', encoding='utf-8') as file_:
        json.dump(docs, file_, indent=2)
    util.print_msg("Created {} documents".format(len(docs)),
                   title="Generated output file {}".format(output_file))


def read_conllx(input_path, use_morphology=False, n=0):
    """Read a CoNLL-U file and yield one entry per sentence.

    input_path (Path): The file to read (UTF-8).
    use_morphology (bool): Use "<pos>__<morph>" as the tag instead of <pos>.
    n (int): Stop after this many sentences; values below 1 mean no limit.
    YIELDS (tuple): (None, [[columns, []]]) where columns is the list
        [ids, words, tags, heads, deps, ents], each a list with one item per
        token. Ids and heads are 0-based; the root token is its own head.
    RAISES (ValueError): If a token line does not have 10 tab-separated
        columns or a non-numeric id/head.
    """
    with input_path.open('r', encoding='utf-8') as file_:
        text = file_.read()
    i = 0
    for sent in text.strip().split('\n\n'):
        # Comment lines start with '#'; token lines always start with an id,
        # so dropping every '#' line cannot remove a token.
        lines = [line for line in sent.strip().split('\n')
                 if line.strip() and not line.startswith('#')]
        tokens = []
        for line in lines:
            columns = line.split('\t')
            if len(columns) != 10:
                raise ValueError("Expected 10 tab-separated columns, found "
                                 "{}: {!r}".format(len(columns), line))
            id_, word, lemma, pos, tag, morph, head, dep, _1, _2 = columns
            # Skip multiword-token ranges ("1-2") and empty nodes ("1.1").
            if '-' in id_ or '.' in id_:
                continue
            try:
                id_ = int(id_) - 1
                head = (int(head) - 1) if head != '0' else id_
            except ValueError:
                # Show the offending line before propagating the error.
                print(line)
                raise
            dep = 'ROOT' if dep == 'root' else dep
            tag = pos + '__' + morph if use_morphology else pos
            tokens.append((id_, word, tag, head, dep, 'O'))
        if not tokens:
            # Empty input or a comment-only block: nothing to yield.
            continue
        tuples = [list(t) for t in zip(*tokens)]
        yield (None, [[tuples, []]])
        i += 1
        if n >= 1 and i >= n:
            break


def generate_sentence(sent):
    """Build the JSON dict of one sentence from its column lists.

    sent (list): [ids, words, tags, heads, deps, ents] as produced by
        read_conllx, or an empty list for a sentence without tokens.
    RETURNS (dict): {"tokens": [...]}; each token has "orth", "tag", "head"
        (head index minus the token's own position) and "dep".
    """
    if not sent:
        return {"tokens": []}
    ids, words, tags, heads, deps, _ = sent
    tokens = []
    # Walk the columns by position instead of indexing them with the token
    # id, which breaks as soon as the ids are not exactly 0..n-1.
    for i, (word, tag, head, dep) in enumerate(zip(words, tags, heads, deps)):
        tokens.append({
            "orth": word,
            "tag": tag,
            "head": head - i,
            "dep": dep,
        })
    return {"tokens": tokens}


def create_doc(sentences, id):
    """Wrap sentences into a document dict holding a single paragraph.

    sentences (list): Sentence dicts as returned by generate_sentence.
    id: Identifier stored under the "id" key. The parameter name shadows the
        builtin, but is kept so that keyword callers keep working.
    RETURNS (dict): {"id": id, "paragraphs": [{"sentences": sentences}]}
    """
    return {"id": id, "paragraphs": [{"sentences": sentences}]}


# ---- spacy/cli/convert.py --------------------------------------------------

# Converters are matched by file extension. To add a converter, add a new entry
# to this dict with the file extension mapped to the converter function.
CONVERTERS = {
    '.conllu': conllu2json
}


def convert(input_file, output_dir, *args):
    """Convert input_file with the converter registered for its extension.

    input_file (str): Path of the file to convert.
    output_dir (str): Directory the converted file is written to.
    *args: Passed on unchanged to the converter function.
    """
    input_path = Path(input_file)
    output_path = Path(output_dir)
    check_dirs(input_path, output_path)
    converter = CONVERTERS.get(input_path.suffix)
    if converter is not None:
        converter(input_path, output_path, *args)
    else:
        # .name instead of .parts[-1]: parts is empty for Path('.') and
        # indexing it would raise instead of reporting the unknown format.
        util.sys_exit("Can't find converter for {}".format(input_path.name),
                      title="Unknown format")


def check_dirs(input_file, output_path):
    """Report a missing input file or output directory via util.sys_exit.

    input_file (Path): Must exist.
    output_path (Path): Must be an existing directory.
    """
    if not input_file.exists():
        util.sys_exit(input_file.as_posix(), title="Input file not found")
    # is_dir rather than exists: a regular file at this path can never
    # receive the converted output.
    if not output_path.is_dir():
        util.sys_exit(output_path.as_posix(), title="Output directory not found")
+ | Convert files into spaCy's #[+a("/docs/api/annotation#json-input") JSON format] + | for use with the #[code train] command and other experiment management + | functions. The right converter is chosen based on the file extension of + | the input file. Currently only supports #[code .conllu]. +code(false, "bash"). - python -m spacy package [input_dir] [output_dir] [--force] + python -m spacy convert [input_file] [output_dir] [--n_sents] [--morphology] +table(["Argument", "Type", "Description"]) +row - +cell #[code input_dir] + +cell #[code input_file] +cell positional - +cell Path to directory containing model data. + +cell Input file. +row +cell #[code output_dir] +cell positional - +cell Directory to create package folder in. + +cell Output directory for converted JSON file. +row - +cell #[code --force], #[code -f] + +cell #[code --n_sents], #[code -n] + +cell option + +cell Number of sentences per document. + + +row + +cell #[code --morphology], #[code -m] + +cell flag + +cell Enable appending morphology to tags. + + +row + +cell #[code --help], #[code -h] +cell flag - +cell Force overwriting of existing folder in output directory. + +cell Show help message and available arguments. + ++h(2, "model") Model + +tag experimental + +p Initialise a new model and its data directory. + ++code(false, "bash"). + python -m spacy model [lang] [model_dir] [freqs_data] [clusters_data] [vectors_data] + ++table(["Argument", "Type", "Description"]) + +row + +cell #[code lang] + +cell positional + +cell Model language. + + +row + +cell #[code model_dir] + +cell positional + +cell Output directory to store the model in. + + +row + +cell #[code freqs_data] + +cell positional + +cell Tab-separated frequencies file. + + +row + +cell #[code clusters_data] + +cell positional + +cell Brown clusters file (optional). + + +row + +cell #[code vectors_data] + +cell positional + +cell Word vectors file (optional). 
+row +cell #[code --help], #[code -h] @@ -199,3 +241,38 @@ p +cell #[code --help], #[code -h] +cell flag +cell Show help message and available arguments. + ++h(2, "package") Package + +tag experimental + +p + | Generate a #[+a("/docs/usage/models#own-models") model Python package] + | from an existing model data directory. All data files are copied over, + | and the meta data can be entered directly from the command line. While + | this feature is still experimental, the required file templates are + | downloaded from #[+src(gh("spacy-dev-resources", "templates/model")) GitHub]. + | This means you need to be connected to the internet to use this command. + ++code(false, "bash"). + python -m spacy package [input_dir] [output_dir] [--force] + ++table(["Argument", "Type", "Description"]) + +row + +cell #[code input_dir] + +cell positional + +cell Path to directory containing model data. + + +row + +cell #[code output_dir] + +cell positional + +cell Directory to create package folder in. + + +row + +cell #[code --force], #[code -f] + +cell flag + +cell Force overwriting of existing folder in output directory. + + +row + +cell #[code --help], #[code -h] + +cell flag + +cell Show help message and available arguments. diff --git a/website/docs/usage/troubleshooting.jade b/website/docs/usage/troubleshooting.jade index cb8271343..501a250c8 100644 --- a/website/docs/usage/troubleshooting.jade +++ b/website/docs/usage/troubleshooting.jade @@ -137,6 +137,20 @@ p | #[code python -m spacy download en]. For more info on this, see the | #[+a("/docs/usage/cli") CLI documentation]. ++h(3, "module-load") 'module' object has no attribute 'load' + ++code(false, "text"). + AttributeError: 'module' object has no attribute 'load' + +p + | While this could technically have many causes, including spaCy being + | broken, the most likely one is that your script's file or directory name + | is "shadowing" the module – e.g. 
your file is called #[code spacy.py], + | or a directory you're importing from is called #[code spacy]. + ++infobox("Solution") + | When using spaCy, never call anything else #[code spacy]. + +h(2, "usage") Using spaCy +h(3, "pos-lemma-number") POS tag or lemma is returned as number