mirror of
https://github.com/explosion/spaCy.git
synced 2025-06-29 09:23:12 +03:00
Merge remote-tracking branch 'origin/develop'
This commit is contained in:
commit
4a6204dbad
|
@ -10,12 +10,13 @@ from spacy.cli import info as cli_info
|
||||||
from spacy.cli import package as cli_package
|
from spacy.cli import package as cli_package
|
||||||
from spacy.cli import train as cli_train
|
from spacy.cli import train as cli_train
|
||||||
from spacy.cli import model as cli_model
|
from spacy.cli import model as cli_model
|
||||||
|
from spacy.cli import convert as cli_convert
|
||||||
|
|
||||||
|
|
||||||
class CLI(object):
|
class CLI(object):
|
||||||
"""Command-line interface for spaCy"""
|
"""Command-line interface for spaCy"""
|
||||||
|
|
||||||
commands = ('download', 'link', 'info', 'package', 'train', 'model')
|
commands = ('download', 'link', 'info', 'package', 'train', 'model', 'convert')
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
model=("model to download (shortcut or model name)", "positional", None, str),
|
model=("model to download (shortcut or model name)", "positional", None, str),
|
||||||
|
@ -110,6 +111,20 @@ class CLI(object):
|
||||||
|
|
||||||
cli_model(lang, model_dir, freqs_data, clusters_data, vectors_data)
|
cli_model(lang, model_dir, freqs_data, clusters_data, vectors_data)
|
||||||
|
|
||||||
|
@plac.annotations(
    input_file=("input file", "positional", None, str),
    output_dir=("output directory for converted file", "positional", None, str),
    # A sentence count is a whole number, so parse it as int rather than float.
    n_sents=("Number of sentences per doc", "option", "n", int),
    morphology=("Enable appending morphology to tags", "flag", "m", bool)
)
def convert(self, input_file, output_dir, n_sents=10, morphology=False):
    """
    Convert files into JSON format for use with train command and other
    experiment management functions.

    input_file: Path of the file to convert.
    output_dir: Directory the converted file is written to.
    n_sents: Number of sentences per doc (default 10).
    morphology: Enable appending morphology to tags.
    """
    # All arguments are forwarded positionally to the converter entry point.
    cli_convert(input_file, output_dir, n_sents, morphology)
|
||||||
|
|
||||||
|
|
||||||
def __missing__(self, name):
|
def __missing__(self, name):
|
||||||
print("\n Command %r does not exist."
|
print("\n Command %r does not exist."
|
||||||
|
@ -119,6 +134,5 @@ class CLI(object):
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
import plac
|
import plac
|
||||||
import sys
|
import sys
|
||||||
cli = CLI()
|
|
||||||
sys.argv[0] = 'spacy'
|
sys.argv[0] = 'spacy'
|
||||||
plac.Interpreter.call(CLI)
|
plac.Interpreter.call(CLI)
|
||||||
|
|
|
@ -4,3 +4,4 @@ from .link import link
|
||||||
from .package import package
|
from .package import package
|
||||||
from .train import train, train_config
|
from .train import train, train_config
|
||||||
from .model import model
|
from .model import model
|
||||||
|
from .convert import convert
|
||||||
|
|
36
spacy/cli/convert.py
Normal file
36
spacy/cli/convert.py
Normal file
|
@ -0,0 +1,36 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals, division, print_function
|
||||||
|
|
||||||
|
import io
|
||||||
|
from pathlib import Path, PurePosixPath
|
||||||
|
|
||||||
|
from .converters import conllu2json
|
||||||
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
|
# The converter is selected by the input file's extension. To support another
# format, import its converter function from /converters and register it here
# under the matching extension.
CONVERTERS = {'.conllu': conllu2json}
|
||||||
|
|
||||||
|
|
||||||
|
def convert(input_file, output_dir, *args):
    """Convert input_file with the converter registered for its extension.

    input_file: Path of the file to convert.
    output_dir: Directory handed to the converter for its output.
    *args: Extra positional arguments passed through to the converter.
    """
    input_path = Path(input_file)
    output_path = Path(output_dir)
    check_dirs(input_path, output_path)
    # Look the converter up once by the file suffix, e.g. '.conllu'.
    converter = CONVERTERS.get(input_path.suffix)
    if converter is not None:
        converter(input_path, output_path, *args)
    else:
        message = "Can't find converter for {}".format(input_path.parts[-1])
        util.sys_exit(message, title="Unknown format")
|
||||||
|
|
||||||
|
|
||||||
|
def check_dirs(input_file, output_path):
    """Report via util.sys_exit when the input file or output directory is missing.

    input_file (Path): File that is going to be converted.
    output_path (Path): Directory the converted file will be written to.
    """
    # Checked in this order: the input file first, then the output directory.
    checks = (
        (input_file, "Input file not found"),
        (output_path, "Output directory not found"),
    )
    for path, title in checks:
        if not path.exists():
            util.sys_exit(path.as_posix(), title=title)
|
1
spacy/cli/converters/__init__.py
Normal file
1
spacy/cli/converters/__init__.py
Normal file
|
@ -0,0 +1 @@
|
||||||
|
from .conllu2json import conllu2json
|
90
spacy/cli/converters/conllu2json.py
Normal file
90
spacy/cli/converters/conllu2json.py
Normal file
|
@ -0,0 +1,90 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals, division, print_function
|
||||||
|
|
||||||
|
import json
|
||||||
|
from ...gold import read_json_file, merge_sents
|
||||||
|
from ... import util
|
||||||
|
|
||||||
|
|
||||||
|
def conllu2json(input_path, output_path, n_sents=10, use_morphology=False):
    """Convert conllu files into JSON format for use with train cli.
    use_morphology parameter enables appending morphology to tags, which is
    useful for languages such as Spanish, where UD tags are not so rich.

    input_path (Path): The .conllu file to read.
    output_path (Path): Directory the .json file is written into; the file
        name is the input name with ".conllu" replaced by ".json".
    n_sents (int): Number of sentences grouped into each document.
    use_morphology (bool): Passed on to read_conllx.
    """
    # by @dvsrepo, via #11 explosion/spacy-dev-resources

    docs = []
    sentences = []
    conll_tuples = read_conllx(input_path, use_morphology=use_morphology)

    for i, (raw_text, tokens) in enumerate(conll_tuples):
        sentence, brackets = tokens[0]
        sentences.append(generate_sentence(sentence))
        # Real-sized documents could be extracted using the comments on the
        # conllu document
        if len(sentences) % n_sents == 0:
            docs.append(create_doc(sentences, i))
            sentences = []
    # Keep the trailing sentences that did not fill a complete batch instead
    # of silently dropping them; the doc id is the index of the last sentence.
    if sentences:
        docs.append(create_doc(sentences, i))

    output_filename = input_path.parts[-1].replace(".conllu", ".json")
    output_file = output_path / output_filename
    # Context manager so the output handle is flushed and closed reliably.
    with output_file.open('w', encoding='utf-8') as file_:
        json.dump(docs, file_, indent=2)
    util.print_msg("Created {} documents".format(len(docs)),
                   title="Generated output file {}".format(output_file))
|
||||||
|
|
||||||
|
|
||||||
|
def read_conllx(input_path, use_morphology=False, n=0):
    """Read a CoNLL-U file and yield one annotation tuple per sentence.

    input_path (Path): File to read; opened as UTF-8 text.
    use_morphology (bool): If True, each tag is pos + '__' + morph,
        otherwise the tag is the pos column alone.
    n (int): Stop after yielding n sentences; values below 1 mean no limit.
    YIELDS (tuple): (None, [[columns, []]]) where columns holds six parallel
        lists: ids, words, tags, heads, deps and a constant 'O' per token.
        Ids and heads are zero-based; a head of '0' maps to the token's own id.
    """
    # Read everything up front and close the handle straight away.
    with input_path.open('r', encoding='utf-8') as file_:
        text = file_.read()
    i = 0
    for sent in text.strip().split('\n\n'):
        # Drop blank lines and '#' comment lines; token lines begin with an id.
        lines = [line for line in sent.strip().split('\n')
                 if line and not line.startswith('#')]
        tokens = []
        for line in lines:
            try:
                id_, word, lemma, pos, tag, morph, head, dep, _1, \
                    _2 = line.split('\t')
                # Skip multiword-token ranges ("1-2") and empty nodes ("1.1").
                if '-' in id_ or '.' in id_:
                    continue
                id_ = int(id_) - 1
                head = (int(head) - 1) if head != '0' else id_
                dep = 'ROOT' if dep == 'root' else dep
                tag = pos+'__'+morph if use_morphology else pos
                tokens.append((id_, word, tag, head, dep, 'O'))
            except Exception:
                # Show the offending line for diagnosis, then propagate.
                print(line)
                raise
        # An empty file or a comment-only block has no tokens to report.
        if not tokens:
            continue
        tuples = [list(t) for t in zip(*tokens)]
        yield (None, [[tuples, []]])
        i += 1
        if n >= 1 and i >= n:
            break
|
||||||
|
|
||||||
|
|
||||||
|
def generate_sentence(sent):
    """Build the JSON-ready dict for one sentence.

    sent (sequence): Six parallel sequences: ids, words, tags, heads, deps
        and a sixth column that is ignored here.
    RETURNS (dict): {"tokens": [...]}, one dict per token with the keys
        "orth", "tag", "head" and "dep". "head" is stored as an offset: the
        head value minus the token's position in the sentence.
    """
    ids, words, tags, heads, deps, _ = sent
    # Named token_id rather than id so the builtin is not shadowed.
    tokens = [
        {
            "orth": words[token_id],
            "tag": tags[token_id],
            "head": heads[token_id] - position,
            "dep": deps[token_id],
        }
        for position, token_id in enumerate(ids)
    ]
    return {"tokens": tokens}
|
||||||
|
|
||||||
|
|
||||||
|
def create_doc(sentences, id):
    """Wrap a list of sentences in a document dict holding one paragraph.

    sentences (list): Sentence dicts to store under the paragraph.
    id: Identifier stored under the document's "id" key.
    RETURNS (dict): {"id": id, "paragraphs": [{"sentences": sentences}]}
    """
    paragraph = {"sentences": sentences}
    return {"id": id, "paragraphs": [paragraph]}
|
|
@ -1,7 +1,6 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals, division, print_function
|
from __future__ import unicode_literals, division, print_function
|
||||||
|
|
||||||
|
|
||||||
import json
|
import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
|
@ -104,35 +104,77 @@ p
|
||||||
+cell flag
|
+cell flag
|
||||||
+cell Show help message and available arguments.
|
+cell Show help message and available arguments.
|
||||||
|
|
||||||
+h(2, "package") Package
|
+h(2, "convert") Convert
|
||||||
+tag experimental
|
+tag experimental
|
||||||
|
|
||||||
p
|
p
|
||||||
| Generate a #[+a("/docs/usage/models#own-models") model Python package]
|
| Convert files into spaCy's #[+a("/docs/api/annotation#json-input") JSON format]
|
||||||
| from an existing model data directory. All data files are copied over,
|
| for use with the #[code train] command and other experiment management
|
||||||
| and the meta data can be entered directly from the command line. While
|
| functions. The right converter is chosen based on the file extension of
|
||||||
| this feature is still experimental, the required file templates are
|
| the input file. Currently only supports #[code .conllu].
|
||||||
| downloaded from #[+src(gh("spacy-dev-resources", "templates/model")) GitHub].
|
|
||||||
| This means you need to be connected to the internet to use this command.
|
|
||||||
|
|
||||||
+code(false, "bash").
|
+code(false, "bash").
|
||||||
python -m spacy package [input_dir] [output_dir] [--force]
|
python -m spacy convert [input_file] [output_dir] [--n_sents] [--morphology]
|
||||||
|
|
||||||
+table(["Argument", "Type", "Description"])
|
+table(["Argument", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
+cell #[code input_dir]
|
+cell #[code input_file]
|
||||||
+cell positional
|
+cell positional
|
||||||
+cell Path to directory containing model data.
|
+cell Input file.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code output_dir]
|
+cell #[code output_dir]
|
||||||
+cell positional
|
+cell positional
|
||||||
+cell Directory to create package folder in.
|
+cell Output directory for converted JSON file.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code --force], #[code -f]
|
+cell #[code --n_sents], #[code -n]
|
||||||
|
+cell option
|
||||||
|
+cell Number of sentences per document.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code --morphology], #[code -m]
|
||||||
|
+cell flag
|
||||||
|
+cell Enable appending morphology to tags.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code --help], #[code -h]
|
||||||
+cell flag
|
+cell flag
|
||||||
+cell Force overwriting of existing folder in output directory.
|
+cell Show help message and available arguments.
|
||||||
|
|
||||||
|
+h(2, "model") Model
|
||||||
|
+tag experimental
|
||||||
|
|
||||||
|
p Initialise a new model and its data directory.
|
||||||
|
|
||||||
|
+code(false, "bash").
|
||||||
|
python -m spacy model [lang] [model_dir] [freqs_data] [clusters_data] [vectors_data]
|
||||||
|
|
||||||
|
+table(["Argument", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code lang]
|
||||||
|
+cell positional
|
||||||
|
+cell Model language.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code model_dir]
|
||||||
|
+cell positional
|
||||||
|
+cell Output directory to store the model in.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code freqs_data]
|
||||||
|
+cell positional
|
||||||
|
+cell Tab-separated frequencies file.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code clusters_data]
|
||||||
|
+cell positional
|
||||||
|
+cell Brown clusters file (optional).
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code vectors_data]
|
||||||
|
+cell positional
|
||||||
|
+cell Word vectors file (optional).
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code --help], #[code -h]
|
+cell #[code --help], #[code -h]
|
||||||
|
@ -199,3 +241,38 @@ p
|
||||||
+cell #[code --help], #[code -h]
|
+cell #[code --help], #[code -h]
|
||||||
+cell flag
|
+cell flag
|
||||||
+cell Show help message and available arguments.
|
+cell Show help message and available arguments.
|
||||||
|
|
||||||
|
+h(2, "package") Package
|
||||||
|
+tag experimental
|
||||||
|
|
||||||
|
p
|
||||||
|
| Generate a #[+a("/docs/usage/models#own-models") model Python package]
|
||||||
|
| from an existing model data directory. All data files are copied over,
|
||||||
|
| and the meta data can be entered directly from the command line. While
|
||||||
|
| this feature is still experimental, the required file templates are
|
||||||
|
| downloaded from #[+src(gh("spacy-dev-resources", "templates/model")) GitHub].
|
||||||
|
| This means you need to be connected to the internet to use this command.
|
||||||
|
|
||||||
|
+code(false, "bash").
|
||||||
|
python -m spacy package [input_dir] [output_dir] [--force]
|
||||||
|
|
||||||
|
+table(["Argument", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code input_dir]
|
||||||
|
+cell positional
|
||||||
|
+cell Path to directory containing model data.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code output_dir]
|
||||||
|
+cell positional
|
||||||
|
+cell Directory to create package folder in.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code --force], #[code -f]
|
||||||
|
+cell flag
|
||||||
|
+cell Force overwriting of existing folder in output directory.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code --help], #[code -h]
|
||||||
|
+cell flag
|
||||||
|
+cell Show help message and available arguments.
|
||||||
|
|
|
@ -137,6 +137,20 @@ p
|
||||||
| #[code python -m spacy download en]. For more info on this, see the
|
| #[code python -m spacy download en]. For more info on this, see the
|
||||||
| #[+a("/docs/usage/cli") CLI documentation].
|
| #[+a("/docs/usage/cli") CLI documentation].
|
||||||
|
|
||||||
|
+h(3, "module-load") 'module' object has no attribute 'load'
|
||||||
|
|
||||||
|
+code(false, "text").
|
||||||
|
AttributeError: 'module' object has no attribute 'load'
|
||||||
|
|
||||||
|
p
|
||||||
|
| While this could technically have many causes, including spaCy being
|
||||||
|
| broken, the most likely one is that your script's file or directory name
|
||||||
|
| is "shadowing" the module – e.g. your file is called #[code spacy.py],
|
||||||
|
| or a directory you're importing from is called #[code spacy].
|
||||||
|
|
||||||
|
+infobox("Solution")
|
||||||
|
| When using spaCy, never call anything else #[code spacy].
|
||||||
|
|
||||||
+h(2, "usage") Using spaCy
|
+h(2, "usage") Using spaCy
|
||||||
|
|
||||||
+h(3, "pos-lemma-number") POS tag or lemma is returned as number
|
+h(3, "pos-lemma-number") POS tag or lemma is returned as number
|
||||||
|
|
Loading…
Reference in New Issue
Block a user