Mirror of https://github.com/explosion/spaCy.git (synced 2025-07-10 08:12:24 +03:00)
Update feature/lemmatizer from develop
Commit 78aba46530
appveyor.yml (deleted, 21 lines)

@@ -1,21 +0,0 @@
-environment:
-  matrix:
-    - PYTHON: "C:\\Python35-x64"
-    - PYTHON: "C:\\Python36-x64"
-    - PYTHON: "C:\\Python37-x64"
-install:
-  # We need wheel installed to build wheels
-  - "%PYTHON%\\python.exe -m pip install wheel"
-  - "%PYTHON%\\python.exe -m pip install cython"
-  - "%PYTHON%\\python.exe -m pip install -r requirements.txt"
-  - "%PYTHON%\\python.exe -m pip install -e ."
-build: off
-test_script:
-  - "%PYTHON%\\python.exe -m pytest spacy/ --no-print-logs"
-after_test:
-  - "%PYTHON%\\python.exe setup.py bdist_wheel"
-artifacts:
-  - path: dist\*
-branches:
-  except:
-    - spacy.io
.travis.yml (15 lines changed)

@@ -5,23 +5,16 @@ dist: trusty
 group: edge
 python:
   - "2.7"
-  - "3.5"
-  - "3.6"
 os:
   - linux
-env:
-  - VIA=compile
-  - VIA=flake8
 install:
-  - "./travis.sh"
-  - pip install flake8
+  - "pip install -r requirements.txt"
+  - "python setup.py build_ext --inplace"
+  - "pip install -e ."
 script:
   - "cat /proc/cpuinfo | grep flags | head -n 1"
   - "pip install pytest pytest-timeout"
-  - if [[ "${VIA}" == "compile" ]]; then python -m pytest --tb=native spacy; fi
-  - if [[ "${VIA}" == "flake8" ]]; then flake8 . --count --exclude=spacy/compat.py,spacy/lang --select=E901,E999,F821,F822,F823 --show-source --statistics; fi
-  - if [[ "${VIA}" == "pypi_nightly" ]]; then python -m pytest --tb=native --models --en `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
-  - if [[ "${VIA}" == "sdist" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
+  - "python -m pytest --tb=native spacy"
 branches:
   except:
     - spacy.io
README.md

@@ -14,8 +14,8 @@ released under the MIT license.
 💫 **Version 2.1 out now!** [Check out the release notes here.](https://github.com/explosion/spaCy/releases)
 
+[](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
 [](https://travis-ci.org/explosion/spaCy)
-[](https://ci.appveyor.com/project/explosion/spaCy)
 [](https://github.com/explosion/spaCy/releases)
 [](https://pypi.python.org/pypi/spacy)
 [](https://anaconda.org/conda-forge/spacy)
azure-pipelines.yml (new file, 92 lines)

@@ -0,0 +1,92 @@
+trigger:
+  batch: true
+  branches:
+    include:
+      - '*'
+    exclude:
+      - 'spacy.io'
+  paths:
+    exclude:
+      - 'website/*'
+      - '*.md'
+
+jobs:
+
+  # Perform basic checks for the most important errors (syntax etc.). Uses the
+  # config defined in .flake8 and overwrites the selected codes.
+  - job: 'Validate'
+    pool:
+      vmImage: 'ubuntu-16.04'
+    steps:
+      - task: UsePythonVersion@0
+        inputs:
+          versionSpec: '3.7'
+      - script: |
+          pip install flake8
+          python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
+        displayName: 'flake8'
+
+  - job: 'Test'
+    dependsOn: 'Validate'
+    strategy:
+      matrix:
+        # Python 2.7 currently doesn't work because it seems to be a narrow
+        # unicode build, which causes problems with the regular expressions.
+
+        # Python27Linux:
+        #   imageName: 'ubuntu-16.04'
+        #   python.version: '2.7'
+        # Python27Mac:
+        #   imageName: 'macos-10.13'
+        #   python.version: '2.7'
+        Python35Linux:
+          imageName: 'ubuntu-16.04'
+          python.version: '3.5'
+        Python35Windows:
+          imageName: 'vs2017-win2016'
+          python.version: '3.5'
+        Python35Mac:
+          imageName: 'macos-10.13'
+          python.version: '3.5'
+        Python36Linux:
+          imageName: 'ubuntu-16.04'
+          python.version: '3.6'
+        Python36Windows:
+          imageName: 'vs2017-win2016'
+          python.version: '3.6'
+        Python36Mac:
+          imageName: 'macos-10.13'
+          python.version: '3.6'
+        Python37Linux:
+          imageName: 'ubuntu-16.04'
+          python.version: '3.7'
+        Python37Windows:
+          imageName: 'vs2017-win2016'
+          python.version: '3.7'
+        Python37Mac:
+          imageName: 'macos-10.13'
+          python.version: '3.7'
+      maxParallel: 4
+    pool:
+      vmImage: $(imageName)
+
+    steps:
+      - task: UsePythonVersion@0
+        inputs:
+          versionSpec: '$(python.version)'
+          architecture: 'x64'
+
+      # Downgrading pip is necessary to prevent a wheel version incompatibility.
+      # Might be fixed in the future or some other way, so investigate again.
+      - script: |
+          python -m pip install --upgrade pip==18.1
+          pip install -r requirements.txt
+        displayName: 'Install dependencies'
+
+      - script: |
+          python setup.py build_ext --inplace
+          pip install -e .
+        displayName: 'Build and install'
+
+      - script: python -m pytest --tb=native spacy
+        displayName: 'Run tests'
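The "Validate" job can be reproduced locally before pushing. A minimal sketch, assuming flake8 is installed in the current environment and reusing the exact error-code selection from the pipeline above:

```python
import subprocess

# Run the same flake8 check the Validate job runs, against a local checkout.
subprocess.run(
    ["python", "-m", "flake8", "spacy", "--count",
     "--select=E901,E999,F821,F822,F823",
     "--show-source", "--statistics"],
    check=True,  # a non-zero exit (i.e. lint errors) raises CalledProcessError
)
```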
spacy/_ml.py (51 lines changed)

@@ -84,16 +84,54 @@ def _zero_init(model):
 @layerize
 def _preprocess_doc(docs, drop=0.0):
     keys = [doc.to_array(LOWER) for doc in docs]
-    ops = Model.ops
     # The dtype here matches what thinc is expecting -- which differs per
     # platform (by int definition). This should be fixed once the problem
     # is fixed on Thinc's side.
-    lengths = ops.asarray([arr.shape[0] for arr in keys], dtype=numpy.int_)
-    keys = ops.xp.concatenate(keys)
-    vals = ops.allocate(keys.shape) + 1.0
+    lengths = numpy.array([arr.shape[0] for arr in keys], dtype=numpy.int_)
+    keys = numpy.concatenate(keys)
+    vals = numpy.zeros(keys.shape, dtype='f')
     return (keys, vals, lengths), None
+
+
+def with_cpu(ops, model):
+    """Wrap a model that should run on CPU, transferring inputs and outputs
+    as necessary."""
+    model.to_cpu()
+
+    def with_cpu_forward(inputs, drop=0.):
+        cpu_outputs, backprop = model.begin_update(_to_cpu(inputs), drop=drop)
+        gpu_outputs = _to_device(ops, cpu_outputs)
+
+        def with_cpu_backprop(d_outputs, sgd=None):
+            cpu_d_outputs = _to_cpu(d_outputs)
+            return backprop(cpu_d_outputs, sgd=sgd)
+
+        return gpu_outputs, with_cpu_backprop
+
+    return wrap(with_cpu_forward, model)
+
+
+def _to_cpu(X):
+    if isinstance(X, numpy.ndarray):
+        return X
+    elif isinstance(X, tuple):
+        return tuple([_to_cpu(x) for x in X])
+    elif isinstance(X, list):
+        return [_to_cpu(x) for x in X]
+    elif hasattr(X, 'get'):
+        return X.get()
+    else:
+        return X
+
+
+def _to_device(ops, X):
+    if isinstance(X, tuple):
+        return tuple([_to_device(ops, x) for x in X])
+    elif isinstance(X, list):
+        return [_to_device(ops, x) for x in X]
+    else:
+        return ops.asarray(X)
+
+
 @layerize
 def _preprocess_doc_bigrams(docs, drop=0.0):
     unigrams = [doc.to_array(LOWER) for doc in docs]

@@ -655,7 +693,10 @@ def build_text_classifier(nr_class, width=64, **cfg):
         >> zero_init(Affine(nr_class, width, drop_factor=0.0))
     )
 
-    linear_model = _preprocess_doc >> LinearModel(nr_class)
+    linear_model = (
+        _preprocess_doc
+        >> with_cpu(Model.ops, LinearModel(nr_class))
+    )
     if cfg.get("exclusive_classes"):
         output_layer = Softmax(nr_class, nr_class * 2)
     else:
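The transfer helpers are the interesting part of this hunk: `_to_cpu` walks nested tuples and lists, leaves numpy arrays alone, and calls `.get()` on anything that has one, which is how a cupy device array exposes device-to-host copies. A self-contained sketch of that dispatch, using only numpy:

```python
import numpy

def to_cpu(X):
    # Same dispatch as _to_cpu above: recurse through containers, pass numpy
    # arrays through unchanged, and pull device arrays (anything exposing
    # .get(), e.g. a cupy ndarray) back to host memory.
    if isinstance(X, numpy.ndarray):
        return X
    elif isinstance(X, tuple):
        return tuple(to_cpu(x) for x in X)
    elif isinstance(X, list):
        return [to_cpu(x) for x in X]
    elif hasattr(X, "get"):
        return X.get()
    return X

batch = (numpy.ones(3), [numpy.zeros((2, 2))], 42)
moved = to_cpu(batch)
assert isinstance(moved, tuple) and isinstance(moved[1], list)  # structure kept
```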
spacy/cli/convert.py

@@ -23,15 +23,16 @@ CONVERTERS = {
 }
 
 # File types
-FILE_TYPES = ("json", "jsonl")
+FILE_TYPES = ("json", "jsonl", "msg")
+FILE_TYPES_STDOUT = ("json", "jsonl")
 
 
 @plac.annotations(
     input_file=("Input file", "positional", None, str),
-    output_dir=("Output directory for converted file", "positional", None, str),
-    file_type=("Type of data to produce: 'jsonl' or 'json'", "option", "t", str),
+    output_dir=("Output directory. '-' for stdout.", "positional", None, str),
+    file_type=("Type of data to produce: {}".format(FILE_TYPES), "option", "t", str),
     n_sents=("Number of sentences per doc", "option", "n", int),
-    converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
+    converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str),
     lang=("Language (if tokenizer required)", "option", "l", str),
     morphology=("Enable appending morphology to tags", "flag", "m", bool),
 )

@@ -58,6 +59,13 @@ def convert(
             "Supported file types: '{}'".format(", ".join(FILE_TYPES)),
             exits=1,
         )
+    if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
+        # TODO: support msgpack via stdout in srsly?
+        msg.fail(
+            "Can't write .{} data to stdout.".format(file_type),
+            "Please specify an output directory.",
+            exits=1,
+        )
     if not input_path.exists():
         msg.fail("Input file not found", input_path, exits=1)
     if output_dir != "-" and not Path(output_dir).exists():

@@ -78,6 +86,8 @@ def convert(
             srsly.write_json(output_file, data)
         elif file_type == "jsonl":
             srsly.write_jsonl(output_file, data)
+        elif file_type == "msg":
+            srsly.write_msgpack(output_file, data)
         msg.good("Generated output file ({} documents)".format(len(data)), output_file)
     else:
         # Print to stdout
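With this change, converting to the binary format from the command line looks like the sketch below. The file names are hypothetical, and note the new guard: `-t msg` can't be combined with `-` (stdout) as the output directory.

```python
import subprocess

# Convert a (hypothetical) CoNLL-U file into msgpack output, written into an
# existing output directory "corpus/".
subprocess.run(
    ["python", "-m", "spacy", "convert",
     "train.conllu", "corpus/", "--file-type", "msg"],
    check=True,
)
```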
spacy/errors.py

@@ -342,7 +342,7 @@ class Errors(object):
             "equal to span length ({span_len}).")
     E122 = ("Cannot find token to be split. Did it get merged?")
     E123 = ("Cannot find head of token to be split. Did it get merged?")
-    E124 = ("Cannot read from file: {path}. Supported formats: .json, .msg")
+    E124 = ("Cannot read from file: {path}. Supported formats: {formats}")
     E125 = ("Unexpected value: {value}")
     E126 = ("Unexpected matcher predicate: '{bad}'. Expected one of: {good}. "
             "This is likely a bug in spaCy, so feel free to open an issue.")
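A quick illustration of how the now-parametrized message renders; the template string is copied from the diff, while the arguments are made up:

```python
E124 = "Cannot read from file: {path}. Supported formats: {formats}"
print(E124.format(path="corpus.txt", formats=("json", "jsonl", "msg")))
# Cannot read from file: corpus.txt. Supported formats: ('json', 'jsonl', 'msg')
```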
spacy/gold.pyx

@@ -153,10 +153,13 @@ class GoldCorpus(object):
             loc = util.ensure_path(loc)
             if loc.parts[-1].endswith("json"):
                 gold_tuples = read_json_file(loc)
+            elif loc.parts[-1].endswith("jsonl"):
+                gold_tuples = srsly.read_jsonl(loc)
             elif loc.parts[-1].endswith("msg"):
                 gold_tuples = srsly.read_msgpack(loc)
             else:
-                raise ValueError(Errors.E124.format(path=path2str(loc)))
+                supported = ("json", "jsonl", "msg")
+                raise ValueError(Errors.E124.format(path=path2str(loc), formats=supported))
             for item in gold_tuples:
                 yield item
                 i += len(item[1])
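The `.msg` branch relies on srsly's msgpack helpers, which this diff also uses on the write side in `convert.py`. A minimal round-trip sketch; the payload is a stand-in, not spaCy's actual training-tuple structure:

```python
import pathlib
import tempfile

import srsly

# Stand-in payload (NOT spaCy's real gold-tuple format), lists only, since
# msgpack round-trips tuples back as lists.
data = [["some raw text", [["placeholder", "annotations"]]]]
path = pathlib.Path(tempfile.gettempdir()) / "train.msg"
srsly.write_msgpack(path, data)
assert srsly.read_msgpack(path) == data
```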
@@ -21,7 +21,9 @@ _suffixes = (
         r"(?<=[0-9])%",  # 4% -> ["4", "%"]
         r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
         r"(?<=[0-9])(?:{u})".format(u=UNITS),
-        r"(?<=[0-9{al}{e}(?:{q})])\.".format(al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES),
+        r"(?<=[0-9{al}{e}(?:{q})])\.".format(
+            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
+        ),
         r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
     ]
 )
spacy/lang/fr/tokenizer_exceptions.py

@@ -379,7 +379,7 @@ _regular_exp = [
 _regular_exp += [
     "^{prefix}[{hyphen}][{al}][{hyphen}{al}{elision}]*$".format(
         prefix=p,
         hyphen=HYPHENS,  # putting the - first in the [] range avoids having to use a backslash
         elision=ELISION,
         al=ALPHA_LOWER,
     )

@@ -423,5 +423,5 @@ _regular_exp.append(URL_PATTERN)
 
 TOKENIZER_EXCEPTIONS = _exc
 TOKEN_MATCH = re.compile(
-    "|".join("(?:{})".format(m) for m in _regular_exp), re.IGNORECASE
+    "|".join("(?:{})".format(m) for m in _regular_exp), re.IGNORECASE | re.UNICODE
 ).match
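Adding `re.UNICODE` matters on Python 2, where character classes like `\w` are ASCII-only for unicode patterns unless the flag is set; on Python 3 it is already the default for str patterns. A small sketch of the effect:

```python
import re

# On Python 3 this matches with or without re.UNICODE; on Python 2 the flag
# is what lets \w cover accented characters in patterns like TOKEN_MATCH.
token_match = re.compile(r"^\w+$", re.IGNORECASE | re.UNICODE).match
assert token_match(u"élève") is not None  # accented French token matches
```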
travis.sh (deleted, 32 lines)

@@ -1,32 +0,0 @@
-#!/bin/bash
-
-if [ "${VIA}" == "pypi" ]; then
-    rm -rf *
-    pip install spacy-nightly
-    python -m spacy download en
-fi
-
-if [[ "${VIA}" == "sdist" && "${TRAVIS_PULL_REQUEST}" == "false" ]]; then
-    rm -rf *
-    pip uninstall spacy
-    wget https://api.explosion.ai/build/spacy/sdist/$TRAVIS_COMMIT
-    mv $TRAVIS_COMMIT sdist.tgz
-    pip install -U sdist.tgz
-fi
-
-
-if [ "${VIA}" == "compile" ]; then
-    pip install -r requirements.txt
-    python setup.py build_ext --inplace
-    pip install -e .
-fi
-
-# mkdir -p corpora/en
-# cd corpora/en
-# wget --no-check-certificate http://wordnetcode.princeton.edu/3.0/WordNet-3.0.tar.gz
-# tar -xzf WordNet-3.0.tar.gz
-# mv WordNet-3.0 wordnet
-# cd ../../
-# mkdir models/
-# python bin/init_model.py en lang_data/ corpora/ models/en
-#fi
website/docs/api/cli.md

@@ -134,28 +134,50 @@ converter can be specified on the command line, or chosen based on the file
 extension of the input file.
 
 ```bash
-$ python -m spacy convert [input_file] [output_dir] [--converter] [--n-sents]
-[--morphology]
+$ python -m spacy convert [input_file] [output_dir] [--file-type] [--converter]
+[--n-sents] [--morphology] [--lang]
 ```
 
 | Argument | Type | Description |
 | --- | --- | --- |
 | `input_file` | positional | Input file. |
-| `output_dir` | positional | Output directory for converted JSON file. |
+| `output_dir` | positional | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. |
+| `--file-type`, `-t` <Tag variant="new">2.1</Tag> | option | Type of file to create (see below). |
-| `converter`, `-c` <Tag variant="new">2</Tag> | option | Name of converter to use (see below). |
+| `--converter`, `-c` <Tag variant="new">2</Tag> | option | Name of converter to use (see below). |
 | `--n-sents`, `-n` | option | Number of sentences per document. |
 | `--morphology`, `-m` | option | Enable appending morphology to tags. |
+| `--lang`, `-l` <Tag variant="new">2.1</Tag> | option | Language code (if tokenizer required). |
 | `--help`, `-h` | flag | Show help message and available arguments. |
 | **CREATES** | JSON | Data in spaCy's [JSON format](/api/annotation#json-input). |
 
-The following file format converters are available:
+### Output file types {new="2.1"}
 
-| ID | Description |
-| ----------------- | --------------------------------------------------------------- |
-| `auto` | Automatically pick converter based on file extension (default). |
-| `conllu`, `conll` | Universal Dependencies `.conllu` or `.conll` format. |
-| `ner` | Tab-based named entity recognition format. |
-| `iob` | IOB or IOB2 named entity recognition format. |
+> #### Which format should I choose?
+>
+> If you're not sure, go with the default `jsonl`. Newline-delimited JSON means
+> that there's one JSON object per line. Unlike a regular JSON file, it can
+> also be read in line by line and you won't have to parse the _entire file_
+> first. This makes it a very convenient format for larger corpora.
+
+All output files generated by this command are compatible with
+[`spacy train`](/api/cli#train).
+
+| ID | Description |
+| ------- | --------------------------------- |
+| `jsonl` | Newline-delimited JSON (default). |
+| `json` | Regular JSON. |
+| `msg` | Binary MessagePack format. |
+
+### Converter options
+
+<!-- TODO: document jsonl option – maybe update it? -->
+
+| ID | Description |
+| ------------------------------ | --------------------------------------------------------------- |
+| `auto` | Automatically pick converter based on file extension (default). |
+| `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format. |
+| `ner` | Tab-based named entity recognition format. |
+| `iob` | IOB or IOB2 named entity recognition format. |
 
 ## Train {#train}
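The streaming advantage the new docs call out is easy to see in code. A minimal sketch with a hypothetical corpus path:

```python
import json

# JSONL can be consumed one record at a time, without parsing the whole file:
with open("corpus.jsonl", encoding="utf8") as f:
    for line in f:
        doc = json.loads(line)  # one JSON object per line
        # ...process doc...
```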