Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-25 09:26:27 +03:00)
Drop Python 2.7 and 3.5 (#4828)
* Remove unicode declarations
* Remove Python 3.5 and 2.7 from CI
* Don't require pathlib
* Replace compat helpers
* Remove OrderedDict
* Use f-strings
* Set Cython compiler language level
* Fix typo
* Re-add OrderedDict for Table
* Update setup.cfg
* Revert CONTRIBUTING.md
* Revert lookups.md
* Revert top-level.md
* Small adjustments and docs [ci skip]
parent 21b6d6e0a8
commit db55577c45
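Most of the changes below are mechanical: dropping `# coding` / `from __future__` boilerplate, replacing `str.format()` with f-strings, and replacing `OrderedDict` with plain dicts. As a rough sketch of the before/after idioms (the helper names here are hypothetical, not code from this commit):

```python
# Illustrative only: the Python 2/3 compatibility idioms this commit removes,
# next to their Python 3.6+ replacements.
from collections import OrderedDict


def describe_model_py2_style(name, n_vectors):
    # Old style: str.format() plus OrderedDict to guarantee key order.
    meta = OrderedDict([("name", name), ("vectors", n_vectors)])
    return "Model '{}' has {} vectors".format(meta["name"], meta["vectors"])


def describe_model_py36_style(name, n_vectors):
    # New style: plain dicts keep insertion order on 3.6+ (guaranteed from 3.7),
    # and f-strings replace str.format().
    meta = {"name": name, "vectors": n_vectors}
    return f"Model '{meta['name']}' has {meta['vectors']} vectors"


assert describe_model_py2_style("en_core_web_sm", 0) == describe_model_py36_style("en_core_web_sm", 0)
```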
.travis.yml (deleted, 23 lines)
@@ -1,23 +0,0 @@
-language: python
-sudo: false
-cache: pip
-dist: trusty
-group: edge
-python:
-  - "2.7"
-os:
-  - linux
-install:
-  - "pip install -r requirements.txt"
-  - "python setup.py build_ext --inplace"
-  - "pip install -e ."
-script:
-  - "cat /proc/cpuinfo | grep flags | head -n 1"
-  - "python -m pytest --tb=native spacy"
-branches:
-  except:
-    - spacy.io
-notifications:
-  slack:
-    secure: F8GvqnweSdzImuLL64TpfG0i5rYl89liyr9tmFVsHl4c0DNiDuGhZivUz0M1broS8svE3OPOllLfQbACG/4KxD890qfF9MoHzvRDlp7U+RtwMV/YAkYn8MGWjPIbRbX0HpGdY7O2Rc9Qy4Kk0T8ZgiqXYIqAz2Eva9/9BlSmsJQ=
-  email: false
@@ -280,23 +280,7 @@ except:  # noqa: E722
 
 ### Python conventions
 
-All Python code must be written in an **intersection of Python 2 and Python 3**.
-This is easy in Cython, but somewhat ugly in Python. Logic that deals with
-Python or platform compatibility should only live in
-[`spacy.compat`](spacy/compat.py). To distinguish them from the builtin
-functions, replacement functions are suffixed with an underscore, for example
-`unicode_`. If you need to access the user's version or platform information,
-for example to show more specific error messages, you can use the `is_config()`
-helper function.
-
-```python
-from .compat import unicode_, is_config
-
-compatible_unicode = unicode_('hello world')
-if is_config(windows=True, python2=True):
-    print("You are using Python 2 on Windows.")
-```
-
+All Python code must be written **compatible with Python 3.6+**.
+
 Code that interacts with the file-system should accept objects that follow the
 `pathlib.Path` API, without assuming that the object inherits from `pathlib.Path`.
 If the function is user-facing and takes a path as an argument, it should check
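The file-system convention kept above (accept any object that follows the `pathlib.Path` API, without assuming it inherits from `pathlib.Path`) can be sketched roughly as follows; `coerce_path` is a hypothetical helper, similar in spirit to spaCy's own `util.ensure_path`:

```python
# Sketch of the convention: coerce user-facing path arguments, but leave
# Path-like objects untouched so custom Path implementations keep working.
from pathlib import Path


def coerce_path(path):
    if isinstance(path, str):
        return Path(path)
    return path


config_path = coerce_path("configs/model.cfg")
print(config_path.suffix)  # ".cfg"
```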
@@ -15,7 +15,6 @@ It's commercial open-source software, released under the MIT license.
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)
 
 [![Azure Pipelines](<https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build+(3.x)>)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
-[![Travis Build Status](<https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square&logo=travis-ci&logoColor=white&label=build+(2.7)>)](https://travis-ci.org/explosion/spaCy)
 [![Current Release Version](https://img.shields.io/github/release/explosion/spacy.svg?style=flat-square&logo=github)](https://github.com/explosion/spaCy/releases)
 [![pypi Version](https://img.shields.io/pypi/v/spacy.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/spacy/)
 [![conda Version](https://img.shields.io/conda/vn/conda-forge/spacy.svg?style=flat-square&logo=conda-forge&logoColor=white)](https://anaconda.org/conda-forge/spacy)
@@ -98,7 +97,7 @@ For detailed installation instructions, see the
 
 - **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual
   Studio)
-- **Python version**: Python 2.7, 3.5+ (only 64 bit)
+- **Python version**: Python 3.6+ (only 64 bit)
 - **Package managers**: [pip] · [conda] (via `conda-forge`)
 
 [pip]: https://pypi.org/project/spacy/
@@ -269,9 +268,7 @@ and git preinstalled.
 Install a version of the
 [Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/)
 or [Visual Studio Express](https://visualstudio.microsoft.com/vs/express/) that
-matches the version that was used to compile your Python interpreter. For
-official distributions these are VS 2008 (Python 2.7), VS 2010 (Python 3.4) and
-VS 2015 (Python 3.5).
+matches the version that was used to compile your Python interpreter.
 
 ## Run tests
 
@@ -35,12 +35,6 @@ jobs:
     dependsOn: 'Validate'
     strategy:
       matrix:
-        Python35Linux:
-          imageName: 'ubuntu-16.04'
-          python.version: '3.5'
-        Python35Windows:
-          imageName: 'vs2017-win2016'
-          python.version: '3.5'
         Python36Linux:
           imageName: 'ubuntu-16.04'
           python.version: '3.6'
@@ -38,14 +38,14 @@ import argparse
 HASH_FILE = "cythonize.json"
 
 
-def process_pyx(fromfile, tofile, language_level="-2"):
+def process_pyx(fromfile, tofile, language_level="-3"):
     print("Processing %s" % fromfile)
     try:
         from Cython.Compiler.Version import version as cython_version
         from distutils.version import LooseVersion
 
-        if LooseVersion(cython_version) < LooseVersion("0.19"):
-            raise Exception("Require Cython >= 0.19")
+        if LooseVersion(cython_version) < LooseVersion("0.25"):
+            raise Exception("Require Cython >= 0.25")
 
     except ImportError:
         pass
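For context, the `"-3"` default above is the flag form handed to the `cython` executable; when building through setuptools the same switch is usually set as a compiler directive. A minimal sketch under that assumption (the extension name is made up, this is not the project's actual setup script):

```python
# Sketch: setting the Cython compiler language level explicitly for a build.
from setuptools import Extension, setup
from Cython.Build import cythonize

extensions = [Extension("example_module", ["example_module.pyx"])]

setup(
    name="example",
    ext_modules=cythonize(
        extensions,
        # Equivalent to passing "-3" on the cython command line: compile the
        # .pyx sources with Python 3 semantics (print function, true division).
        compiler_directives={"language_level": 3},
    ),
)
```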
fabfile.py (vendored, 3 changed lines)
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals, print_function
-
 import contextlib
 from pathlib import Path
 from fabric.api import local, lcd, env, settings, prefix
@@ -11,7 +11,6 @@ catalogue>=0.0.7,<1.1.0
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
 plac>=0.9.6,<1.2.0
-pathlib==1.0.1; python_version < "3.4"
 tqdm>=4.38.0,<5.0.0
 # Optional dependencies
 jsonschema>=2.6.0,<3.1.0
@@ -16,10 +16,7 @@ classifiers =
     Operating System :: MacOS :: MacOS X
     Operating System :: Microsoft :: Windows
     Programming Language :: Cython
-    Programming Language :: Python :: 2
-    Programming Language :: Python :: 2.7
     Programming Language :: Python :: 3
-    Programming Language :: Python :: 3.5
     Programming Language :: Python :: 3.6
     Programming Language :: Python :: 3.7
     Programming Language :: Python :: 3.8
@@ -30,7 +27,7 @@ zip_safe = false
 include_package_data = true
 scripts =
     bin/spacy
-python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*
+python_requires = >=3.6
 setup_requires =
     wheel
     cython>=0.25
@@ -54,7 +51,6 @@ install_requires =
     numpy>=1.15.0
     plac>=0.9.6,<1.2.0
     requests>=2.13.0,<3.0.0
-    pathlib==1.0.1; python_version < "3.4"
 
 [options.extras_require]
 lookups =
setup.py (1 changed line)
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-from __future__ import print_function
 import io
 import os
 import subprocess
@@ -1,5 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
 import warnings
 import sys
 
@@ -1,9 +1,3 @@
-# coding: utf8
-from __future__ import print_function
-
-# NB! This breaks in plac on Python 2!!
-# from __future__ import unicode_literals
-
 if __name__ == "__main__":
     import plac
     import sys
@@ -32,5 +26,5 @@ if __name__ == "__main__":
     if command in commands:
         plac.call(commands[command], sys.argv[1:])
     else:
-        available = "Available: {}".format(", ".join(commands))
-        msg.fail("Unknown command: {}".format(command), available, exits=1)
+        available = f"Available: {', '.join(commands)}"
+        msg.fail(f"Unknown command: {command}", available, exits=1)
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import numpy
 from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu
 from thinc.t2t import ExtractWindow, ParametricAttention
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from collections import OrderedDict
 from wasabi import Printer
 
 from .tokens import Doc, Token, Span
@@ -23,7 +19,7 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True):
     assert pipeline[index][0] == name
     prev_pipes = pipeline[:index]
     pipe_requires = getattr(pipe, "requires", [])
-    requires = OrderedDict([(annot, False) for annot in pipe_requires])
+    requires = {annot: False for annot in pipe_requires}
     if requires:
         for prev_name, prev_pipe in prev_pipes:
             prev_assigns = getattr(prev_pipe, "assigns", [])
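Dropping `OrderedDict` in the hunk above is safe because plain dicts preserve insertion order on the Python versions this commit targets (an implementation detail in CPython 3.6, guaranteed by the language from 3.7). A small illustrative check, with made-up annotation names:

```python
from collections import OrderedDict

pipe_requires = ["doc.ents", "token.dep"]  # illustrative annotation names

ordered = OrderedDict([(annot, False) for annot in pipe_requires])
plain = {annot: False for annot in pipe_requires}

# On Python 3.7+ both preserve the order the annotations were added in,
# so iteration behaves the same for this use case.
assert list(ordered) == list(plain) == ["doc.ents", "token.dep"]
```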
@@ -98,15 +94,15 @@ def validate_attrs(values):
             for ext_attr, ext_value in value.items():
                 # We don't check whether the attribute actually exists
                 if ext_value is not True:  # attr is something like doc._.x.y
-                    good = "{}._.{}".format(obj_key, ext_attr)
-                    bad = "{}.{}".format(good, ".".join(ext_value))
+                    good = f"{obj_key}._.{ext_attr}"
+                    bad = f"{good}.{'.'.join(ext_value)}"
                     raise ValueError(Errors.E183.format(attr=bad, solution=good))
                 continue  # we can't validate those further
         if attr.endswith("_"):  # attr is something like "token.pos_"
             raise ValueError(Errors.E184.format(attr=attr, solution=attr[:-1]))
         if value is not True:  # attr is something like doc.x.y
-            good = "{}.{}".format(obj_key, attr)
-            bad = "{}.{}".format(good, ".".join(value))
+            good = f"{obj_key}.{attr}"
+            bad = f"{good}.{'.'.join(value)}"
             raise ValueError(Errors.E183.format(attr=bad, solution=good))
         obj = objs[obj_key]
         if not hasattr(obj, attr):
@@ -168,11 +164,10 @@ def print_summary(nlp, pretty=True, no_print=False):
         msg.table(overview, header=header, divider=True, multiline=True)
     n_problems = sum(len(p) for p in problems.values())
     if any(p for p in problems.values()):
-        msg.divider("Problems ({})".format(n_problems))
+        msg.divider(f"Problems ({n_problems})")
         for name, problem in problems.items():
             if problem:
-                problem = ", ".join(problem)
-                msg.warn("'{}' requirements not met: {}".format(name, problem))
+                msg.warn(f"'{name}' requirements not met: {', '.join(problem)}")
     else:
         msg.good("No problems found.")
     if no_print:
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 
 IDS = {
     "": NULL_ATTR,
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 
 # NB: This schema describes the new format of the training data, see #2928
 TRAINING_SCHEMA = {
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import plac
 from pathlib import Path
 from wasabi import Printer
@@ -30,16 +27,18 @@ FILE_TYPES_STDOUT = ("json", "jsonl")
 
 
 @plac.annotations(
+    # fmt: off
     input_file=("Input file", "positional", None, str),
     output_dir=("Output directory. '-' for stdout.", "positional", None, str),
-    file_type=("Type of data to produce: {}".format(FILE_TYPES), "option", "t", str),
+    file_type=(f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES),
     n_sents=("Number of sentences per doc (0 to disable)", "option", "n", int),
     seg_sents=("Segment sentences (for -c ner)", "flag", "s"),
     model=("Model for sentence segmentation (for -s)", "option", "b", str),
-    converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str),
+    converter=(f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str),
     lang=("Language (if tokenizer required)", "option", "l", str),
     morphology=("Enable appending morphology to tags", "flag", "m", bool),
-    ner_map_path=("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path),
+    ner_map_path=("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path,),
+    # fmt: on
 )
 def convert(
     input_file,
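The `# fmt: off` / `# fmt: on` comments added around the `@plac.annotations(...)` block are the standard escape hatch for auto-formatters such as Black: code between the two markers is left exactly as written instead of being re-wrapped. A generic example, unrelated to the spaCy code itself:

```python
# fmt: off
IDENTITY = [
    1, 0, 0,
    0, 1, 0,
    0, 0, 1,
]
# fmt: on
```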
@ -62,16 +61,10 @@ def convert(
|
||||||
no_print = output_dir == "-"
|
no_print = output_dir == "-"
|
||||||
msg = Printer(no_print=no_print)
|
msg = Printer(no_print=no_print)
|
||||||
input_path = Path(input_file)
|
input_path = Path(input_file)
|
||||||
if file_type not in FILE_TYPES:
|
|
||||||
msg.fail(
|
|
||||||
"Unknown file type: '{}'".format(file_type),
|
|
||||||
"Supported file types: '{}'".format(", ".join(FILE_TYPES)),
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
|
if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
|
||||||
# TODO: support msgpack via stdout in srsly?
|
# TODO: support msgpack via stdout in srsly?
|
||||||
msg.fail(
|
msg.fail(
|
||||||
"Can't write .{} data to stdout.".format(file_type),
|
f"Can't write .{file_type} data to stdout",
|
||||||
"Please specify an output directory.",
|
"Please specify an output directory.",
|
||||||
exits=1,
|
exits=1,
|
||||||
)
|
)
|
||||||
|
@ -95,7 +88,7 @@ def convert(
|
||||||
"Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert"
|
"Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert"
|
||||||
)
|
)
|
||||||
if converter not in CONVERTERS:
|
if converter not in CONVERTERS:
|
||||||
msg.fail("Can't find converter for {}".format(converter), exits=1)
|
msg.fail(f"Can't find converter for {converter}", exits=1)
|
||||||
ner_map = None
|
ner_map = None
|
||||||
if ner_map_path is not None:
|
if ner_map_path is not None:
|
||||||
ner_map = srsly.read_json(ner_map_path)
|
ner_map = srsly.read_json(ner_map_path)
|
||||||
|
@ -113,7 +106,7 @@ def convert(
|
||||||
)
|
)
|
||||||
if output_dir != "-":
|
if output_dir != "-":
|
||||||
# Export data to a file
|
# Export data to a file
|
||||||
suffix = ".{}".format(file_type)
|
suffix = f".{file_type}"
|
||||||
output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
|
output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
|
||||||
if file_type == "json":
|
if file_type == "json":
|
||||||
srsly.write_json(output_file, data)
|
srsly.write_json(output_file, data)
|
||||||
|
@ -121,9 +114,7 @@ def convert(
|
||||||
srsly.write_jsonl(output_file, data)
|
srsly.write_jsonl(output_file, data)
|
||||||
elif file_type == "msg":
|
elif file_type == "msg":
|
||||||
srsly.write_msgpack(output_file, data)
|
srsly.write_msgpack(output_file, data)
|
||||||
msg.good(
|
msg.good(f"Generated output file ({len(data)} documents): {output_file}")
|
||||||
"Generated output file ({} documents): {}".format(len(data), output_file)
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
# Print to stdout
|
# Print to stdout
|
||||||
if file_type == "json":
|
if file_type == "json":
|
||||||
|
|
|
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from wasabi import Printer
 
 from ...gold import iob_to_biluo
@@ -64,9 +61,9 @@ def conll_ner2json(
     # sentence segmentation required for document segmentation
     if n_sents > 0 and not seg_sents:
         msg.warn(
-            "No sentence boundaries found to use with option `-n {}`. "
-            "Use `-s` to automatically segment sentences or `-n 0` "
-            "to disable.".format(n_sents)
+            f"No sentence boundaries found to use with option `-n {n_sents}`. "
+            f"Use `-s` to automatically segment sentences or `-n 0` "
+            f"to disable."
         )
     else:
         n_sents_info(msg, n_sents)
@@ -129,7 +126,7 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
     if model:
         nlp = load_model(model)
         if "parser" in nlp.pipe_names:
-            msg.info("Segmenting sentences with parser from model '{}'.".format(model))
+            msg.info(f"Segmenting sentences with parser from model '{model}'.")
             sentencizer = nlp.get_pipe("parser")
     if not sentencizer:
         msg.info(
@@ -166,7 +163,7 @@ def segment_docs(input_data, n_sents, doc_delimiter):
 
 
 def n_sents_info(msg, n_sents):
-    msg.info("Grouping every {} sentences into a document.".format(n_sents))
+    msg.info(f"Grouping every {n_sents} sentences into a document.")
     if n_sents == 1:
         msg.warn(
             "To generate better training data, you may want to group "
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import re
 
 from spacy.gold import Example
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from wasabi import Printer
 
 from ...gold import iob_to_biluo
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import srsly
 
 from ...gold import docs_to_json
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals, print_function
-
 from pathlib import Path
 from collections import Counter
 import plac
@@ -23,20 +20,17 @@ BLANK_MODEL_THRESHOLD = 2000
 
 
 @plac.annotations(
+    # fmt: off
     lang=("model language", "positional", None, str),
     train_path=("location of JSON-formatted training data", "positional", None, Path),
     dev_path=("location of JSON-formatted development data", "positional", None, Path),
     tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
     base_model=("name of model to update (optional)", "option", "b", str),
-    pipeline=(
-        "Comma-separated names of pipeline components to train",
-        "option",
-        "p",
-        str,
-    ),
+    pipeline=("Comma-separated names of pipeline components to train", "option", "p", str),
     ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool),
     verbose=("Print additional information and explanations", "flag", "V", bool),
     no_format=("Don't pretty-print the results", "flag", "NF", bool),
+    # fmt: on
 )
 def debug_data(
     lang,
@@ -93,15 +87,11 @@ def debug_data(
             corpus.train_dataset_without_preprocessing(nlp)
         )
     except ValueError as e:
-        loading_train_error_message = "Training data cannot be loaded: {}".format(
-            str(e)
-        )
+        loading_train_error_message = f"Training data cannot be loaded: {e}"
     try:
         dev_dataset = list(corpus.dev_dataset(nlp))
     except ValueError as e:
-        loading_dev_error_message = "Development data cannot be loaded: {}".format(
-            str(e)
-        )
+        loading_dev_error_message = f"Development data cannot be loaded: {e}"
     if loading_train_error_message or loading_dev_error_message:
         if loading_train_error_message:
             msg.fail(loading_train_error_message)
@@ -112,78 +102,66 @@ def debug_data(
 
     # Create all gold data here to avoid iterating over the train_dataset constantly
     gold_train_data = _compile_gold(train_dataset, pipeline)
-    gold_train_unpreprocessed_data = _compile_gold(train_dataset_unpreprocessed, pipeline)
+    gold_train_unpreprocessed_data = _compile_gold(
+        train_dataset_unpreprocessed, pipeline
+    )
     gold_dev_data = _compile_gold(dev_dataset, pipeline)
 
     train_texts = gold_train_data["texts"]
     dev_texts = gold_dev_data["texts"]
 
     msg.divider("Training stats")
-    msg.text("Training pipeline: {}".format(", ".join(pipeline)))
+    msg.text(f"Training pipeline: {', '.join(pipeline)}")
     for pipe in [p for p in pipeline if p not in nlp.factories]:
-        msg.fail("Pipeline component '{}' not available in factories".format(pipe))
+        msg.fail(f"Pipeline component '{pipe}' not available in factories")
     if base_model:
-        msg.text("Starting with base model '{}'".format(base_model))
+        msg.text(f"Starting with base model '{base_model}'")
     else:
-        msg.text("Starting with blank model '{}'".format(lang))
-    msg.text("{} training docs".format(len(train_dataset)))
-    msg.text("{} evaluation docs".format(len(gold_dev_data)))
+        msg.text(f"Starting with blank model '{lang}'")
+    msg.text(f"{len(train_dataset)} training docs")
+    msg.text(f"{len(gold_dev_data)} evaluation docs")
 
     if not len(gold_dev_data):
         msg.fail("No evaluation docs")
     overlap = len(train_texts.intersection(dev_texts))
     if overlap:
-        msg.warn("{} training examples also in evaluation data".format(overlap))
+        msg.warn(f"{overlap} training examples also in evaluation data")
     else:
         msg.good("No overlap between training and evaluation data")
     if not base_model and len(train_dataset) < BLANK_MODEL_THRESHOLD:
-        text = "Low number of examples to train from a blank model ({})".format(
-            len(train_dataset)
+        text = (
+            f"Low number of examples to train from a blank model ({len(train_dataset)})"
         )
        if len(train_dataset) < BLANK_MODEL_MIN_THRESHOLD:
            msg.fail(text)
        else:
            msg.warn(text)
        msg.text(
-            "It's recommended to use at least {} examples (minimum {})".format(
-                BLANK_MODEL_THRESHOLD, BLANK_MODEL_MIN_THRESHOLD
-            ),
+            f"It's recommended to use at least {BLANK_MODEL_THRESHOLD} examples "
+            f"(minimum {BLANK_MODEL_MIN_THRESHOLD})",
            show=verbose,
        )
 
     msg.divider("Vocab & Vectors")
     n_words = gold_train_data["n_words"]
     msg.info(
-        "{} total {} in the data ({} unique)".format(
-            n_words, "word" if n_words == 1 else "words", len(gold_train_data["words"])
-        )
+        f"{n_words} total word(s) in the data ({len(gold_train_data['words'])} unique)"
     )
     if gold_train_data["n_misaligned_words"] > 0:
-        msg.warn(
-            "{} misaligned tokens in the training data".format(
-                gold_train_data["n_misaligned_words"]
-            )
-        )
+        n_misaligned = gold_train_data["n_misaligned_words"]
+        msg.warn(f"{n_misaligned} misaligned tokens in the training data")
     if gold_dev_data["n_misaligned_words"] > 0:
-        msg.warn(
-            "{} misaligned tokens in the dev data".format(
-                gold_dev_data["n_misaligned_words"]
-            )
-        )
+        n_misaligned = gold_dev_data["n_misaligned_words"]
+        msg.warn(f"{n_misaligned} misaligned tokens in the dev data")
     most_common_words = gold_train_data["words"].most_common(10)
     msg.text(
-        "10 most common words: {}".format(
-            _format_labels(most_common_words, counts=True)
-        ),
+        f"10 most common words: {_format_labels(most_common_words, counts=True)}",
         show=verbose,
     )
     if len(nlp.vocab.vectors):
         msg.info(
-            "{} vectors ({} unique keys, {} dimensions)".format(
-                len(nlp.vocab.vectors),
-                nlp.vocab.vectors.n_keys,
-                nlp.vocab.vectors_length,
-            )
+            f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} "
+            f"unique keys, {nlp.vocab.vectors_length} dimensions)"
         )
     else:
         msg.info("No word vectors present in the model")
@@ -203,19 +181,10 @@ def debug_data(
 
         msg.divider("Named Entity Recognition")
         msg.info(
-            "{} new {}, {} existing {}".format(
-                len(new_labels),
-                "label" if len(new_labels) == 1 else "labels",
-                len(existing_labels),
-                "label" if len(existing_labels) == 1 else "labels",
-            )
+            f"{len(new_labels)} new label(s), {len(existing_labels)} existing label(s)"
         )
         missing_values = label_counts["-"]
-        msg.text(
-            "{} missing {} (tokens with '-' label)".format(
-                missing_values, "value" if missing_values == 1 else "values"
-            )
-        )
+        msg.text(f"{missing_values} missing value(s) (tokens with '-' label)")
         for label in new_labels:
             if len(label) == 0:
                 msg.fail("Empty label found in new labels")
@@ -226,33 +195,24 @@ def debug_data(
             if label != "-"
         ]
         labels_with_counts = _format_labels(labels_with_counts, counts=True)
-        msg.text("New: {}".format(labels_with_counts), show=verbose)
+        msg.text(f"New: {labels_with_counts}", show=verbose)
         if existing_labels:
-            msg.text(
-                "Existing: {}".format(_format_labels(existing_labels)), show=verbose
-            )
+            msg.text(f"Existing: {_format_labels(existing_labels)}", show=verbose)
 
         if gold_train_data["ws_ents"]:
-            msg.fail(
-                "{} invalid whitespace entity spans".format(gold_train_data["ws_ents"])
-            )
+            msg.fail(f"{gold_train_data['ws_ents']} invalid whitespace entity spans")
             has_ws_ents_error = True
 
         for label in new_labels:
             if label_counts[label] <= NEW_LABEL_THRESHOLD:
                 msg.warn(
-                    "Low number of examples for new label '{}' ({})".format(
-                        label, label_counts[label]
-                    )
+                    f"Low number of examples for new label '{label}' ({label_counts[label]})"
                 )
                 has_low_data_warning = True
 
                 with msg.loading("Analyzing label distribution..."):
                     neg_docs = _get_examples_without_label(train_dataset, label)
                 if neg_docs == 0:
-                    msg.warn(
-                        "No examples for texts WITHOUT new label '{}'".format(label)
-                    )
+                    msg.warn(f"No examples for texts WITHOUT new label '{label}'")
                     has_no_neg_warning = True
 
         if not has_low_data_warning:
@@ -264,8 +224,8 @@ def debug_data(
 
         if has_low_data_warning:
             msg.text(
-                "To train a new entity type, your data should include at "
-                "least {} instances of the new label".format(NEW_LABEL_THRESHOLD),
+                f"To train a new entity type, your data should include at "
+                f"least {NEW_LABEL_THRESHOLD} instances of the new label",
                 show=verbose,
             )
         if has_no_neg_warning:
@@ -288,27 +248,21 @@ def debug_data(
         new_labels = [l for l in labels if l not in model_labels]
         existing_labels = [l for l in labels if l in model_labels]
         msg.info(
-            "Text Classification: {} new label(s), {} existing label(s)".format(
-                len(new_labels), len(existing_labels)
-            )
+            f"Text Classification: {len(new_labels)} new label(s), "
+            f"{len(existing_labels)} existing label(s)"
         )
         if new_labels:
             labels_with_counts = _format_labels(
                 gold_train_data["cats"].most_common(), counts=True
             )
-            msg.text("New: {}".format(labels_with_counts), show=verbose)
+            msg.text(f"New: {labels_with_counts}", show=verbose)
         if existing_labels:
-            msg.text(
-                "Existing: {}".format(_format_labels(existing_labels)), show=verbose
-            )
+            msg.text(f"Existing: {_format_labels(existing_labels)}", show=verbose)
         if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
             msg.fail(
-                "The train and dev labels are not the same. "
-                "Train labels: {}. "
-                "Dev labels: {}.".format(
-                    _format_labels(gold_train_data["cats"]),
-                    _format_labels(gold_dev_data["cats"]),
-                )
+                f"The train and dev labels are not the same. "
+                f"Train labels: {_format_labels(gold_train_data['cats'])}. "
+                f"Dev labels: {_format_labels(gold_dev_data['cats'])}."
             )
         if gold_train_data["n_cats_multilabel"] > 0:
             msg.info(
@@ -338,27 +292,16 @@ def debug_data(
         msg.divider("Part-of-speech Tagging")
         labels = [label for label in gold_train_data["tags"]]
         tag_map = nlp.vocab.morphology.tag_map
-        msg.info(
-            "{} {} in data ({} {} in tag map)".format(
-                len(labels),
-                "label" if len(labels) == 1 else "labels",
-                len(tag_map),
-                "label" if len(tag_map) == 1 else "labels",
-            )
-        )
+        msg.info(f"{len(labels)} label(s) in data ({len(tag_map)} label(s) in tag map)")
         labels_with_counts = _format_labels(
             gold_train_data["tags"].most_common(), counts=True
         )
         msg.text(labels_with_counts, show=verbose)
         non_tagmap = [l for l in labels if l not in tag_map]
         if not non_tagmap:
-            msg.good("All labels present in tag map for language '{}'".format(nlp.lang))
+            msg.good(f"All labels present in tag map for language '{nlp.lang}'")
         for label in non_tagmap:
-            msg.fail(
-                "Label '{}' not found in tag map for language '{}'".format(
-                    label, nlp.lang
-                )
-            )
+            msg.fail(f"Label '{label}' not found in tag map for language '{nlp.lang}'")
 
     if "parser" in pipeline:
         has_low_data_warning = False
|
||||||
|
|
||||||
# profile sentence length
|
# profile sentence length
|
||||||
msg.info(
|
msg.info(
|
||||||
"Found {} sentence{} with an average length of {:.1f} words.".format(
|
f"Found {gold_train_data['n_sents']} sentence(s) with an average "
|
||||||
gold_train_data["n_sents"],
|
f"length of {gold_train_data['n_words'] / gold_train_data['n_sents']:.1f} words."
|
||||||
"s" if len(train_dataset) > 1 else "",
|
|
||||||
gold_train_data["n_words"] / gold_train_data["n_sents"],
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# check for documents with multiple sentences
|
# check for documents with multiple sentences
|
||||||
sents_per_doc = gold_train_data["n_sents"] / len(gold_train_data["texts"])
|
sents_per_doc = gold_train_data["n_sents"] / len(gold_train_data["texts"])
|
||||||
if sents_per_doc < 1.1:
|
if sents_per_doc < 1.1:
|
||||||
msg.warn(
|
msg.warn(
|
||||||
"The training data contains {:.2f} sentences per "
|
f"The training data contains {sents_per_doc:.2f} sentences per "
|
||||||
"document. When there are very few documents containing more "
|
f"document. When there are very few documents containing more "
|
||||||
"than one sentence, the parser will not learn how to segment "
|
f"than one sentence, the parser will not learn how to segment "
|
||||||
"longer texts into sentences.".format(sents_per_doc)
|
f"longer texts into sentences."
|
||||||
)
|
)
|
||||||
|
|
||||||
# profile labels
|
# profile labels
|
||||||
|
@ -391,32 +331,13 @@ def debug_data(
|
||||||
labels_dev = [label for label in gold_dev_data["deps"]]
|
labels_dev = [label for label in gold_dev_data["deps"]]
|
||||||
|
|
||||||
if gold_train_unpreprocessed_data["n_nonproj"] > 0:
|
if gold_train_unpreprocessed_data["n_nonproj"] > 0:
|
||||||
msg.info(
|
n_nonproj = gold_train_unpreprocessed_data["n_nonproj"]
|
||||||
"Found {} nonprojective train sentence{}".format(
|
msg.info(f"Found {n_nonproj} nonprojective train sentence(s)")
|
||||||
gold_train_unpreprocessed_data["n_nonproj"],
|
|
||||||
"s" if gold_train_unpreprocessed_data["n_nonproj"] > 1 else "",
|
|
||||||
)
|
|
||||||
)
|
|
||||||
if gold_dev_data["n_nonproj"] > 0:
|
if gold_dev_data["n_nonproj"] > 0:
|
||||||
msg.info(
|
n_nonproj = gold_dev_data["n_nonproj"]
|
||||||
"Found {} nonprojective dev sentence{}".format(
|
msg.info(f"Found {n_nonproj} nonprojective dev sentence(s)")
|
||||||
gold_dev_data["n_nonproj"],
|
msg.info(f"{labels_train_unpreprocessed} label(s) in train data")
|
||||||
"s" if gold_dev_data["n_nonproj"] > 1 else "",
|
msg.info(f"{len(labels_train)} label(s) in projectivized train data")
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
msg.info(
|
|
||||||
"{} {} in train data".format(
|
|
||||||
len(labels_train_unpreprocessed),
|
|
||||||
"label" if len(labels_train) == 1 else "labels",
|
|
||||||
)
|
|
||||||
)
|
|
||||||
msg.info(
|
|
||||||
"{} {} in projectivized train data".format(
|
|
||||||
len(labels_train), "label" if len(labels_train) == 1 else "labels"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
labels_with_counts = _format_labels(
|
labels_with_counts = _format_labels(
|
||||||
gold_train_unpreprocessed_data["deps"].most_common(), counts=True
|
gold_train_unpreprocessed_data["deps"].most_common(), counts=True
|
||||||
)
|
)
|
||||||
|
@ -426,9 +347,8 @@ def debug_data(
|
||||||
for label in gold_train_unpreprocessed_data["deps"]:
|
for label in gold_train_unpreprocessed_data["deps"]:
|
||||||
if gold_train_unpreprocessed_data["deps"][label] <= DEP_LABEL_THRESHOLD:
|
if gold_train_unpreprocessed_data["deps"][label] <= DEP_LABEL_THRESHOLD:
|
||||||
msg.warn(
|
msg.warn(
|
||||||
"Low number of examples for label '{}' ({})".format(
|
f"Low number of examples for label '{label}' "
|
||||||
label, gold_train_unpreprocessed_data["deps"][label]
|
f"({gold_train_unpreprocessed_data['deps'][label]})"
|
||||||
)
|
|
||||||
)
|
)
|
||||||
has_low_data_warning = True
|
has_low_data_warning = True
|
||||||
|
|
||||||
|
@ -437,22 +357,19 @@ def debug_data(
|
||||||
for label in gold_train_data["deps"]:
|
for label in gold_train_data["deps"]:
|
||||||
if gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD and "||" in label:
|
if gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD and "||" in label:
|
||||||
rare_projectivized_labels.append(
|
rare_projectivized_labels.append(
|
||||||
"{}: {}".format(label, str(gold_train_data["deps"][label]))
|
f"{label}: {gold_train_data['deps'][label]}"
|
||||||
)
|
)
|
||||||
|
|
||||||
if len(rare_projectivized_labels) > 0:
|
if len(rare_projectivized_labels) > 0:
|
||||||
msg.warn(
|
msg.warn(
|
||||||
"Low number of examples for {} label{} in the "
|
f"Low number of examples for {len(rare_projectivized_labels)} "
|
||||||
"projectivized dependency trees used for training. You may "
|
"label(s) in the projectivized dependency trees used for "
|
||||||
"want to projectivize labels such as punct before "
|
"training. You may want to projectivize labels such as punct "
|
||||||
"training in order to improve parser performance.".format(
|
"before training in order to improve parser performance."
|
||||||
len(rare_projectivized_labels),
|
|
||||||
"s" if len(rare_projectivized_labels) > 1 else "",
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
msg.warn(
|
msg.warn(
|
||||||
"Projectivized labels with low numbers of examples: "
|
f"Projectivized labels with low numbers of examples: ",
|
||||||
"{}".format("\n".join(rare_projectivized_labels)),
|
", ".join(rare_projectivized_labels),
|
||||||
show=verbose,
|
show=verbose,
|
||||||
)
|
)
|
||||||
has_low_data_warning = True
|
has_low_data_warning = True
|
||||||
|
@ -460,50 +377,44 @@ def debug_data(
|
||||||
# labels only in train
|
# labels only in train
|
||||||
if set(labels_train) - set(labels_dev):
|
if set(labels_train) - set(labels_dev):
|
||||||
msg.warn(
|
msg.warn(
|
||||||
"The following labels were found only in the train data: "
|
"The following labels were found only in the train data:",
|
||||||
"{}".format(", ".join(set(labels_train) - set(labels_dev))),
|
", ".join(set(labels_train) - set(labels_dev)),
|
||||||
show=verbose,
|
show=verbose,
|
||||||
)
|
)
|
||||||
|
|
||||||
# labels only in dev
|
# labels only in dev
|
||||||
if set(labels_dev) - set(labels_train):
|
if set(labels_dev) - set(labels_train):
|
||||||
msg.warn(
|
msg.warn(
|
||||||
"The following labels were found only in the dev data: "
|
"The following labels were found only in the dev data:",
|
||||||
+ ", ".join(set(labels_dev) - set(labels_train)),
|
", ".join(set(labels_dev) - set(labels_train)),
|
||||||
show=verbose,
|
show=verbose,
|
||||||
)
|
)
|
||||||
|
|
||||||
if has_low_data_warning:
|
if has_low_data_warning:
|
||||||
msg.text(
|
msg.text(
|
||||||
"To train a parser, your data should include at "
|
f"To train a parser, your data should include at "
|
||||||
"least {} instances of each label.".format(DEP_LABEL_THRESHOLD),
|
f"least {DEP_LABEL_THRESHOLD} instances of each label.",
|
||||||
show=verbose,
|
show=verbose,
|
||||||
)
|
)
|
||||||
|
|
||||||
# multiple root labels
|
# multiple root labels
|
||||||
if len(gold_train_unpreprocessed_data["roots"]) > 1:
|
if len(gold_train_unpreprocessed_data["roots"]) > 1:
|
||||||
msg.warn(
|
msg.warn(
|
||||||
"Multiple root labels ({}) ".format(
|
f"Multiple root labels "
|
||||||
", ".join(gold_train_unpreprocessed_data["roots"])
|
f"({', '.join(gold_train_unpreprocessed_data['roots'])}) "
|
||||||
)
|
f"found in training data. spaCy's parser uses a single root "
|
||||||
+ "found in training data. spaCy's parser uses a single root "
|
f"label ROOT so this distinction will not be available."
|
||||||
"label ROOT so this distinction will not be available."
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# these should not happen, but just in case
|
# these should not happen, but just in case
|
||||||
if gold_train_data["n_nonproj"] > 0:
|
if gold_train_data["n_nonproj"] > 0:
|
||||||
msg.fail(
|
msg.fail(
|
||||||
"Found {} nonprojective projectivized train sentence{}".format(
|
f"Found {gold_train_data['n_nonproj']} nonprojective "
|
||||||
gold_train_data["n_nonproj"],
|
f"projectivized train sentence(s)"
|
||||||
"s" if gold_train_data["n_nonproj"] > 1 else "",
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
if gold_train_data["n_cycles"] > 0:
|
if gold_train_data["n_cycles"] > 0:
|
||||||
msg.fail(
|
msg.fail(
|
||||||
"Found {} projectivized train sentence{} with cycles".format(
|
f"Found {gold_train_data['n_cycles']} projectivized train sentence(s) with cycles"
|
||||||
gold_train_data["n_cycles"],
|
|
||||||
"s" if gold_train_data["n_cycles"] > 1 else "",
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
msg.divider("Summary")
|
msg.divider("Summary")
|
||||||
|
@ -511,36 +422,28 @@ def debug_data(
|
||||||
warn_counts = msg.counts[MESSAGES.WARN]
|
warn_counts = msg.counts[MESSAGES.WARN]
|
||||||
fail_counts = msg.counts[MESSAGES.FAIL]
|
fail_counts = msg.counts[MESSAGES.FAIL]
|
||||||
if good_counts:
|
if good_counts:
|
||||||
msg.good(
|
msg.good(f"{good_counts} {'check' if good_counts == 1 else 'checks'} passed")
|
||||||
"{} {} passed".format(
|
|
||||||
good_counts, "check" if good_counts == 1 else "checks"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
if warn_counts:
|
if warn_counts:
|
||||||
msg.warn(
|
msg.warn(f"{warn_counts} {'warning' if warn_counts == 1 else 'warnings'}")
|
||||||
"{} {}".format(warn_counts, "warning" if warn_counts == 1 else "warnings")
|
|
||||||
)
|
|
||||||
if fail_counts:
|
|
||||||
msg.fail("{} {}".format(fail_counts, "error" if fail_counts == 1 else "errors"))
|
|
||||||
|
|
||||||
if fail_counts:
|
if fail_counts:
|
||||||
|
msg.fail(f"{fail_counts} {'error' if fail_counts == 1 else 'errors'}")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
def _load_file(file_path, msg):
|
def _load_file(file_path, msg):
|
||||||
file_name = file_path.parts[-1]
|
file_name = file_path.parts[-1]
|
||||||
if file_path.suffix == ".json":
|
if file_path.suffix == ".json":
|
||||||
with msg.loading("Loading {}...".format(file_name)):
|
with msg.loading(f"Loading {file_name}..."):
|
||||||
data = srsly.read_json(file_path)
|
data = srsly.read_json(file_path)
|
||||||
msg.good("Loaded {}".format(file_name))
|
msg.good(f"Loaded {file_name}")
|
||||||
return data
|
return data
|
||||||
elif file_path.suffix == ".jsonl":
|
elif file_path.suffix == ".jsonl":
|
||||||
with msg.loading("Loading {}...".format(file_name)):
|
with msg.loading(f"Loading {file_name}..."):
|
||||||
data = srsly.read_jsonl(file_path)
|
data = srsly.read_jsonl(file_path)
|
||||||
msg.good("Loaded {}".format(file_name))
|
msg.good(f"Loaded {file_name}")
|
||||||
return data
|
return data
|
||||||
msg.fail(
|
msg.fail(
|
||||||
"Can't load file extension {}".format(file_path.suffix),
|
f"Can't load file extension {file_path.suffix}",
|
||||||
"Expected .json or .jsonl",
|
"Expected .json or .jsonl",
|
||||||
exits=1,
|
exits=1,
|
||||||
)
|
)
|
||||||
|
@ -604,14 +507,18 @@ def _compile_gold(examples, pipeline):
|
||||||
|
|
||||||
def _format_labels(labels, counts=False):
|
def _format_labels(labels, counts=False):
|
||||||
if counts:
|
if counts:
|
||||||
return ", ".join(["'{}' ({})".format(l, c) for l, c in labels])
|
return ", ".join([f"'{l}' ({c})" for l, c in labels])
|
||||||
return ", ".join(["'{}'".format(l) for l in labels])
|
return ", ".join([f"'{l}'" for l in labels])
|
||||||
|
|
||||||
|
|
||||||
def _get_examples_without_label(data, label):
|
def _get_examples_without_label(data, label):
|
||||||
count = 0
|
count = 0
|
||||||
for ex in data:
|
for ex in data:
|
||||||
labels = [label.split("-")[1] for label in ex.gold.ner if label not in ("O", "-", None)]
|
labels = [
|
||||||
|
label.split("-")[1]
|
||||||
|
for label in ex.gold.ner
|
||||||
|
if label not in ("O", "-", None)
|
||||||
|
]
|
||||||
if label not in labels:
|
if label not in labels:
|
||||||
count += 1
|
count += 1
|
||||||
return count
|
return count
|
||||||
|
|
|
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import plac
 import requests
 import os
@@ -50,7 +47,7 @@ def download(model, direct=False, *pip_args):
             sys.exit(dl)
         msg.good(
             "Download and installation successful",
-            "You can now load the model via spacy.load('{}')".format(model_name),
+            f"You can now load the model via spacy.load('{model_name}')",
         )
         # Only create symlink if the model is installed via a shortcut like 'en'.
         # There's no real advantage over an additional symlink for en_core_web_sm
@@ -69,10 +66,10 @@ def download(model, direct=False, *pip_args):
             # message and loading instructions, even if linking fails.
             msg.warn(
                 "Download successful but linking failed",
-                "Creating a shortcut link for '{}' didn't work (maybe you "
-                "don't have admin permissions?), but you can still load "
-                "the model via its full package name: "
-                "nlp = spacy.load('{}')".format(model, model_name),
+                f"Creating a shortcut link for '{model}' didn't work (maybe you "
+                f"don't have admin permissions?), but you can still load "
+                f"the model via its full package name: "
+                f"nlp = spacy.load('{model_name}')",
             )
     # If a model is downloaded and then loaded within the same process, our
    # is_package check currently fails, because pkg_resources.working_set
@@ -95,11 +92,11 @@ def get_json(url, desc):
     r = requests.get(url)
     if r.status_code != 200:
         msg.fail(
-            "Server error ({})".format(r.status_code),
-            "Couldn't fetch {}. Please find a model for your spaCy "
-            "installation (v{}), and download it manually. For more "
-            "details, see the documentation: "
-            "https://spacy.io/usage/models".format(desc, about.__version__),
+            f"Server error ({r.status_code})",
+            f"Couldn't fetch {desc}. Please find a model for your spaCy "
+            f"installation (v{about.__version__}), and download it manually. "
+            f"For more details, see the documentation: "
+            f"https://spacy.io/usage/models",
             exits=1,
         )
     return r.json()
@@ -111,7 +108,7 @@ def get_compatibility():
     comp_table = get_json(about.__compatibility__, "compatibility table")
     comp = comp_table["spacy"]
     if version not in comp:
-        msg.fail("No compatible models found for v{} of spaCy".format(version), exits=1)
+        msg.fail(f"No compatible models found for v{version} of spaCy", exits=1)
     return comp[version]
 
 
@@ -119,8 +116,8 @@ def get_version(model, comp):
     model = model.rsplit(".dev", 1)[0]
     if model not in comp:
         msg.fail(
-            "No compatible model found for '{}' "
-            "(spaCy v{}).".format(model, about.__version__),
+            f"No compatible model found for '{model}' "
+            f"(spaCy v{about.__version__}).",
             exits=1,
         )
     return comp[model][0]
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals, division, print_function
-
 import plac
 from timeit import default_timer as timer
 from wasabi import msg
@@ -79,7 +76,7 @@ def evaluate(
             deps=render_deps,
             ents=render_ents,
         )
-        msg.good("Generated {} parses as HTML".format(displacy_limit), displacy_path)
+        msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
     if return_scores:
         return scorer.scores
 
@@ -1,13 +1,9 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import plac
 import platform
 from pathlib import Path
 from wasabi import msg
 import srsly
 
-from ..compat import path2str, basestring_, unicode_
 from .. import util
 from .. import about
 
@@ -33,12 +29,12 @@ def info(model=None, markdown=False, silent=False):
             msg.fail("Can't find model meta.json", meta_path, exits=1)
         meta = srsly.read_json(meta_path)
         if model_path.resolve() != model_path:
-            meta["link"] = path2str(model_path)
-            meta["source"] = path2str(model_path.resolve())
+            meta["link"] = str(model_path)
+            meta["source"] = str(model_path.resolve())
         else:
-            meta["source"] = path2str(model_path)
+            meta["source"] = str(model_path)
         if not silent:
-            title = "Info about model '{}'".format(model)
+            title = f"Info about model '{model}'"
             model_meta = {
                 k: v for k, v in meta.items() if k not in ("accuracy", "speed")
             }
@@ -49,7 +45,7 @@ def info(model=None, markdown=False, silent=False):
         return meta
     data = {
         "spaCy version": about.__version__,
-        "Location": path2str(Path(__file__).parent.parent),
+        "Location": str(Path(__file__).parent.parent),
         "Platform": platform.platform(),
         "Python version": platform.python_version(),
         "Models": list_models(),
@@ -84,9 +80,9 @@ def print_markdown(data, title=None):
     """
     markdown = []
     for key, value in data.items():
-        if isinstance(value, basestring_) and Path(value).exists():
+        if isinstance(value, str) and Path(value).exists():
             continue
-        markdown.append("* **{}:** {}".format(key, unicode_(value)))
+        markdown.append(f"* **{key}:** {value}")
     if title:
-        print("\n## {}".format(title))
+        print(f"\n## {title}")
     print("\n{}\n".format("\n".join(markdown)))
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import plac
 import math
 from tqdm import tqdm

@@ -91,8 +88,7 @@ def init_model(
    vec_added = len(nlp.vocab.vectors)
    lex_added = len(nlp.vocab)
    msg.good(
-        "Sucessfully compiled vocab",
-        "{} entries, {} vectors".format(lex_added, vec_added),
+        "Sucessfully compiled vocab", f"{lex_added} entries, {vec_added} vectors",
    )
    if not output_dir.exists():
        output_dir.mkdir()

@@ -177,9 +173,9 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
            nlp.vocab.vectors.add(lex.orth, row=lex.rank)
    else:
        if vectors_loc:
-            with msg.loading("Reading vectors from {}".format(vectors_loc)):
+            with msg.loading(f"Reading vectors from {vectors_loc}"):
                vectors_data, vector_keys = read_vectors(vectors_loc)
-            msg.good("Loaded vectors from {}".format(vectors_loc))
+            msg.good(f"Loaded vectors from {vectors_loc}")
        else:
            vectors_data, vector_keys = (None, None)
        if vector_keys is not None:
@@ -1,11 +1,8 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import plac
 from pathlib import Path
 from wasabi import msg

-from ..compat import symlink_to, path2str
+from ..compat import symlink_to
 from .. import util

@@ -27,23 +24,23 @@ def link(origin, link_name, force=False, model_path=None):
    if not model_path.exists():
        msg.fail(
            "Can't locate model data",
-            "The data should be located in {}".format(path2str(model_path)),
+            f"The data should be located in {model_path}",
            exits=1,
        )
    data_path = util.get_data_path()
    if not data_path or not data_path.exists():
        spacy_loc = Path(__file__).parent.parent
        msg.fail(
-            "Can't find the spaCy data path to create model symlink",
-            "Make sure a directory `/data` exists within your spaCy "
-            "installation and try again. The data directory should be located "
-            "here:".format(path=spacy_loc),
+            f"Can't find the spaCy data path to create model symlink",
+            f"Make sure a directory `/data` exists within your spaCy "
+            f"installation and try again. The data directory should be located "
+            f"here: {spacy_loc}",
            exits=1,
        )
    link_path = util.get_data_path() / link_name
    if link_path.is_symlink() and not force:
        msg.fail(
-            "Link '{}' already exists".format(link_name),
+            f"Link '{link_name}' already exists",
            "To overwrite an existing link, use the --force flag",
            exits=1,
        )

@@ -54,18 +51,18 @@ def link(origin, link_name, force=False, model_path=None):
    elif link_path.exists():  # does it exist otherwise?
        # NB: Check this last because valid symlinks also "exist".
        msg.fail(
-            "Can't overwrite symlink '{}'".format(link_name),
+            f"Can't overwrite symlink '{link_name}'",
            "This can happen if your data directory contains a directory or "
            "file of the same name.",
            exits=1,
        )
-    details = "%s --> %s" % (path2str(model_path), path2str(link_path))
+    details = f"{model_path} --> {link_path}"
    try:
        symlink_to(link_path, model_path)
    except:  # noqa: E722
        # This is quite dirty, but just making sure other errors are caught.
        msg.fail(
-            "Couldn't link model to '{}'".format(link_name),
+            f"Couldn't link model to '{link_name}'",
            "Creating a symlink in spacy/data failed. Make sure you have the "
            "required permissions and try re-running the command as admin, or "
            "use a virtualenv. You can still import the model as a module and "

@@ -74,4 +71,4 @@ def link(origin, link_name, force=False, model_path=None):
        msg.text(details)
        raise
    msg.good("Linking successful", details)
-    msg.text("You can now load the model via spacy.load('{}')".format(link_name))
+    msg.text(f"You can now load the model via spacy.load('{link_name}')")
@@ -1,13 +1,9 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import plac
 import shutil
 from pathlib import Path
 from wasabi import msg, get_raw_input
 import srsly

-from ..compat import path2str
 from .. import util
 from .. import about

@@ -47,7 +43,7 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
    for key in ("lang", "name", "version"):
        if key not in meta or meta[key] == "":
            msg.fail(
-                "No '{}' setting found in meta.json".format(key),
+                f"No '{key}' setting found in meta.json",
                "This setting is required to build your package.",
                exits=1,
            )

@@ -58,22 +54,21 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals

    if package_path.exists():
        if force:
-            shutil.rmtree(path2str(package_path))
+            shutil.rmtree(str(package_path))
        else:
            msg.fail(
                "Package directory already exists",
                "Please delete the directory and try again, or use the "
-                "`--force` flag to overwrite existing "
-                "directories.".format(path=path2str(package_path)),
+                "`--force` flag to overwrite existing directories.",
                exits=1,
            )
    Path.mkdir(package_path, parents=True)
-    shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
+    shutil.copytree(str(input_path), str(package_path / model_name_v))
    create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
    create_file(main_path / "setup.py", TEMPLATE_SETUP)
    create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
    create_file(package_path / "__init__.py", TEMPLATE_INIT)
-    msg.good("Successfully created package '{}'".format(model_name_v), main_path)
+    msg.good(f"Successfully created package '{model_name_v}'", main_path)
    msg.text("To build the package, run `python setup.py sdist` in this directory.")

@@ -118,9 +113,6 @@ def generate_meta(model_path, existing_meta, msg):

 TEMPLATE_SETUP = """
 #!/usr/bin/env python
-# coding: utf8
-from __future__ import unicode_literals
-
 import io
 import json
 from os import path, walk

@@ -190,9 +182,6 @@ include meta.json


 TEMPLATE_INIT = """
-# coding: utf8
-from __future__ import unicode_literals
-
 from pathlib import Path
 from spacy.util import load_model_from_init_py, get_model_meta
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import print_function, unicode_literals
-
 import plac
 import random
 import numpy

@@ -154,9 +151,9 @@ def pretrain(
        msg.text("Reading input text from stdin...")
        texts = srsly.read_jsonl("-")

-    with msg.loading("Loading model '{}'...".format(vectors_model)):
+    with msg.loading(f"Loading model '{vectors_model}'..."):
        nlp = util.load_model(vectors_model)
-    msg.good("Loaded model '{}'".format(vectors_model))
+    msg.good(f"Loaded model '{vectors_model}'")
    pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
    model = create_pretraining_model(
        nlp,

@@ -173,7 +170,7 @@ def pretrain(
    # Load in pretrained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
-        msg.text("Loaded pretrained tok2vec for: {}".format(components))
+        msg.text(f"Loaded pretrained tok2vec for: {components}")
        # Parse the epoch number from the given weight file
        model_name = re.search(r"model\d+\.bin", str(init_tok2vec))
        if model_name:

@@ -221,7 +218,9 @@ def pretrain(
    skip_counter = 0
    for epoch in range(epoch_start, n_iter + epoch_start):
        for batch_id, batch in enumerate(
-            util.minibatch_by_words((Example(doc=text) for text in texts), size=batch_size)
+            util.minibatch_by_words(
+                (Example(doc=text) for text in texts), size=batch_size
+            )
        ):
            docs, count = make_docs(
                nlp,

@@ -246,7 +245,7 @@ def pretrain(
            # Reshuffle the texts if texts were loaded from a file
            random.shuffle(texts)
    if skip_counter > 0:
-        msg.warn("Skipped {count} empty values".format(count=str(skip_counter)))
+        msg.warn(f"Skipped {skip_counter} empty values")
    msg.good("Successfully finished pretrain")
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals, division, print_function
-
 import plac
 import tqdm
 from pathlib import Path

@@ -34,11 +31,11 @@ def profile(model, inputs=None, n_texts=10000):
        with msg.loading("Loading IMDB dataset via Thinc..."):
            imdb_train, _ = thinc.extra.datasets.imdb()
            inputs, _ = zip(*imdb_train)
-        msg.info("Loaded IMDB dataset and using {} examples".format(n_inputs))
+        msg.info(f"Loaded IMDB dataset and using {n_inputs} examples")
        inputs = inputs[:n_inputs]
-    with msg.loading("Loading model '{}'...".format(model)):
+    with msg.loading(f"Loading model '{model}'..."):
        nlp = load_model(model)
-    msg.good("Loaded model '{}'".format(model))
+    msg.good(f"Loaded model '{model}'")
    texts = list(itertools.islice(inputs, n_texts))
    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
    s = pstats.Stats("Profile.prof")

@@ -60,7 +57,7 @@ def _read_inputs(loc, msg):
    input_path = Path(loc)
    if not input_path.exists() or not input_path.is_file():
        msg.fail("Not a valid input data file", loc, exits=1)
-    msg.info("Using data from {}".format(input_path.parts[-1]))
+    msg.info(f"Using data from {input_path.parts[-1]}")
    file_ = input_path.open()
    for line in file_:
        data = srsly.json_loads(line)
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals, division, print_function
-
 import plac
 import os
 import tqdm

@@ -12,12 +9,10 @@ import srsly
 from wasabi import msg
 import contextlib
 import random
-from collections import OrderedDict

 from .._ml import create_default_optimizer
 from ..attrs import PROB, IS_OOV, CLUSTER, LANG
 from ..gold import GoldCorpus
-from ..compat import path2str
 from .. import util
 from .. import about

@@ -148,14 +143,14 @@ def train(
    # the model and make sure the pipeline matches the pipeline setting. If
    # training starts from a blank model, intitalize the language class.
    pipeline = [p.strip() for p in pipeline.split(",")]
-    msg.text("Training pipeline: {}".format(pipeline))
+    msg.text(f"Training pipeline: {pipeline}")
    if base_model:
-        msg.text("Starting with base model '{}'".format(base_model))
+        msg.text(f"Starting with base model '{base_model}'")
        nlp = util.load_model(base_model)
        if nlp.lang != lang:
            msg.fail(
-                "Model language ('{}') doesn't match language specified as "
-                "`lang` argument ('{}') ".format(nlp.lang, lang),
+                f"Model language ('{nlp.lang}') doesn't match language "
+                f"specified as `lang` argument ('{lang}') ",
                exits=1,
            )
        nlp.disable_pipes([p for p in nlp.pipe_names if p not in pipeline])

@@ -187,15 +182,13 @@ def train(
                    }
                    if base_cfg != pipe_cfg:
                        msg.fail(
-                            "The base textcat model configuration does"
-                            "not match the provided training options. "
-                            "Existing cfg: {}, provided cfg: {}".format(
-                                base_cfg, pipe_cfg
-                            ),
+                            f"The base textcat model configuration does"
+                            f"not match the provided training options. "
+                            f"Existing cfg: {base_cfg}, provided cfg: {pipe_cfg}",
                            exits=1,
                        )
    else:
-        msg.text("Starting with blank model '{}'".format(lang))
+        msg.text(f"Starting with blank model '{lang}'")
        lang_cls = util.get_lang_class(lang)
        nlp = lang_cls()
    for pipe in pipeline:

@@ -215,7 +208,7 @@ def train(
        nlp.vocab.morphology.tag_map.update(tag_map)

    if vectors:
-        msg.text("Loading vector from model '{}'".format(vectors))
+        msg.text(f"Loading vector from model '{vectors}'")
        _load_vectors(nlp, vectors)

    # Multitask objectives

@@ -224,15 +217,15 @@ def train(
        if multitasks:
            if pipe_name not in pipeline:
                msg.fail(
-                    "Can't use multitask objective without '{}' in the "
-                    "pipeline".format(pipe_name)
+                    f"Can't use multitask objective without '{pipe_name}' in "
+                    f"the pipeline"
                )
            pipe = nlp.get_pipe(pipe_name)
            for objective in multitasks.split(","):
                pipe.add_multitask_objective(objective)

    # Prepare training corpus
-    msg.text("Counting training words (limit={})".format(n_examples))
+    msg.text(f"Counting training words (limit={n_examples})")
    corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
    n_train_words = corpus.count_train()

@@ -248,22 +241,22 @@ def train(
    # Load in pretrained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
-        msg.text("Loaded pretrained tok2vec for: {}".format(components))
+        msg.text(f"Loaded pretrained tok2vec for: {components}")

    # Verify textcat config
    if "textcat" in pipeline:
        textcat_labels = nlp.get_pipe("textcat").cfg["labels"]
        if textcat_positive_label and textcat_positive_label not in textcat_labels:
            msg.fail(
-                "The textcat_positive_label (tpl) '{}' does not match any "
-                "label in the training data.".format(textcat_positive_label),
+                f"The textcat_positive_label (tpl) '{textcat_positive_label}' "
+                f"does not match any label in the training data.",
                exits=1,
            )
        if textcat_positive_label and len(textcat_labels) != 2:
            msg.fail(
-                "A textcat_positive_label (tpl) '{}' was provided for training "
-                "data that does not appear to be a binary classification "
-                "problem with two labels.".format(textcat_positive_label),
+                "A textcat_positive_label (tpl) '{textcat_positive_label}' was "
+                "provided for training data that does not appear to be a "
+                "binary classification problem with two labels.",
                exits=1,
            )
        train_data = corpus.train_data(

@@ -302,20 +295,20 @@ def train(
                    break
        if base_model and set(textcat_labels) != train_labels:
            msg.fail(
-                "Cannot extend textcat model using data with different "
-                "labels. Base model labels: {}, training data labels: "
-                "{}.".format(textcat_labels, list(train_labels)),
+                f"Cannot extend textcat model using data with different "
+                f"labels. Base model labels: {textcat_labels}, training data "
+                f"labels: {list(train_labels)}",
                exits=1,
            )
        if textcat_multilabel:
            msg.text(
-                "Textcat evaluation score: ROC AUC score macro-averaged across "
-                "the labels '{}'".format(", ".join(textcat_labels))
+                f"Textcat evaluation score: ROC AUC score macro-averaged across "
+                f"the labels '{', '.join(textcat_labels)}'"
            )
        elif textcat_positive_label and len(textcat_labels) == 2:
            msg.text(
-                "Textcat evaluation score: F1-score for the "
-                "label '{}'".format(textcat_positive_label)
+                f"Textcat evaluation score: F1-score for the "
+                f"label '{textcat_positive_label}'"
            )
        elif len(textcat_labels) > 1:
            if len(textcat_labels) == 2:

@@ -325,8 +318,8 @@ def train(
                    "an evaluation on the positive class."
                )
            msg.text(
-                "Textcat evaluation score: F1-score macro-averaged across "
-                "the labels '{}'".format(", ".join(textcat_labels))
+                f"Textcat evaluation score: F1-score macro-averaged across "
+                f"the labels '{', '.join(textcat_labels)}'"
            )
        else:
            msg.fail(

@@ -471,8 +464,8 @@ def train(
                    for cat, cat_score in textcats_per_cat.items():
                        if cat_score.get("roc_auc_score", 0) < 0:
                            msg.warn(
-                                "Textcat ROC AUC score is undefined due to "
-                                "only one value in label '{}'.".format(cat)
+                                f"Textcat ROC AUC score is undefined due to "
+                                f"only one value in label '{cat}'."
                            )
                msg.row(progress, **row_settings)
                # Early stopping

@@ -485,12 +478,10 @@ def train(
                        best_score = current_score
                    if iter_since_best >= n_early_stopping:
                        msg.text(
-                            "Early stopping, best iteration "
-                            "is: {}".format(i - iter_since_best)
+                            f"Early stopping, best iteration is: {i - iter_since_best}"
                        )
                        msg.text(
-                            "Best score = {}; Final iteration "
-                            "score = {}".format(best_score, current_score)
+                            f"Best score = {best_score}; Final iteration score = {current_score}"
                        )
                        break
    finally:

@@ -560,11 +551,11 @@ def _collate_best_model(meta, output_path, components):
    for component in components:
        bests[component] = _find_best(output_path, component)
    best_dest = output_path / "model-best"
-    shutil.copytree(path2str(output_path / "model-final"), path2str(best_dest))
+    shutil.copytree(str(output_path / "model-final"), str(best_dest))
    for component, best_component_src in bests.items():
-        shutil.rmtree(path2str(best_dest / component))
+        shutil.rmtree(str(best_dest / component))
        shutil.copytree(
-            path2str(best_component_src / component), path2str(best_dest / component)
+            str(best_component_src / component), str(best_dest / component)
        )
        accs = srsly.read_json(best_component_src / "accuracy.json")
        for metric in _get_metrics(component):

@@ -627,10 +618,8 @@ def _configure_training_output(pipeline, use_gpu, has_beam_widths):
    if has_beam_widths:
        row_head.insert(1, "Beam W.")
    # remove duplicates
-    row_head_dict = OrderedDict()
-    row_head_dict.update({k: 1 for k in row_head})
-    output_stats_dict = OrderedDict()
-    output_stats_dict.update({k: 1 for k in output_stats})
+    row_head_dict = {k: 1 for k in row_head}
+    output_stats_dict = {k: 1 for k in output_stats}
    return row_head_dict.keys(), output_stats_dict.keys()
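The de-duplication in `_configure_training_output` can switch from `OrderedDict` to a plain dict because dicts preserve insertion order on the Python versions this commit targets (an implementation detail in CPython 3.6, a language guarantee from 3.7). A minimal sketch of the same pattern, using hypothetical column names rather than spaCy's real ones:

# Sketch only: order-preserving de-duplication with a plain dict,
# as the updated _configure_training_output does.
row_head = ["Itn", "Dep Loss", "Dep Loss", "UAS"]  # hypothetical values with a duplicate
deduped = list({k: 1 for k in row_head}.keys())
assert deduped == ["Itn", "Dep Loss", "UAS"]  # first occurrence wins, order kept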
@@ -1,13 +1,9 @@
-# coding: utf8
-from __future__ import unicode_literals, print_function
-
 from pathlib import Path
 import sys
 import requests
 import srsly
 from wasabi import msg

-from ..compat import path2str
 from ..util import get_data_path
 from .. import about

@@ -21,7 +17,7 @@ def validate():
    r = requests.get(about.__compatibility__)
    if r.status_code != 200:
        msg.fail(
-            "Server error ({})".format(r.status_code),
+            f"Server error ({r.status_code})",
            "Couldn't fetch compatibility table.",
            exits=1,
        )

@@ -32,7 +28,7 @@ def validate():
    current_compat = compat.get(version)
    if not current_compat:
        msg.fail(
-            "Can't find spaCy v{} in compatibility table".format(version),
+            f"Can't find spaCy v{version} in compatibility table",
            about.__compatibility__,
            exits=1,
        )

@@ -52,8 +48,8 @@ def validate():
    update_models = [m for m in incompat_models if m in current_compat]
    spacy_dir = Path(__file__).parent.parent

-    msg.divider("Installed models (spaCy v{})".format(about.__version__))
-    msg.info("spaCy installation: {}".format(path2str(spacy_dir)))
+    msg.divider(f"Installed models (spaCy v{about.__version__})")
+    msg.info(f"spaCy installation: {spacy_dir}")

    if model_links or model_pkgs:
        header = ("TYPE", "NAME", "MODEL", "VERSION", "")

@@ -72,15 +68,15 @@ def validate():
        print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
    if na_models:
        msg.text(
-            "The following models are not available for spaCy "
-            "v{}: {}".format(about.__version__, ", ".join(na_models))
+            f"The following models are not available for spaCy "
+            f"v{about.__version__}: {', '.join(na_models)}"
        )
    if incompat_links:
        msg.text(
-            "You may also want to overwrite the incompatible links using the "
-            "`python -m spacy link` command with `--force`, or remove them "
-            "from the data directory. "
-            "Data path: {path}".format(path=path2str(get_data_path()))
+            f"You may also want to overwrite the incompatible links using the "
+            f"`python -m spacy link` command with `--force`, or remove them "
+            f"from the data directory. "
+            f"Data path: {get_data_path()}"
        )
    if incompat_models or incompat_links:
        sys.exit(1)

@@ -128,7 +124,7 @@ def get_model_row(compat, name, data, msg, model_type="package"):
        version = msg.text(data["version"], color="green", no_print=True)
    else:
        version = msg.text(data["version"], color="red", no_print=True)
-        comp = "--> {}".format(compat.get(data["name"], ["n/a"])[0])
+        comp = f"--> {compat.get(data['name'], ['n/a'])[0]}"
    return (model_type, name, data["name"], version, comp)
spacy/compat.py

@@ -1,4 +1,3 @@
-# coding: utf8
 """
 Helpers for Python and platform compatibility. To distinguish them from
 the builtin functions, replacement functions are suffixed with an underscore,

@@ -6,13 +5,8 @@ e.g. `unicode_`.

 DOCS: https://spacy.io/api/top-level#compat
 """
-from __future__ import unicode_literals
-
 import os
 import sys
-import itertools
-import ast
-import types

 from thinc.neural.util import copy_array

@@ -46,45 +40,11 @@ copy_reg = copy_reg
 CudaStream = CudaStream
 cupy = cupy
 copy_array = copy_array
-izip = getattr(itertools, "izip", zip)

 is_windows = sys.platform.startswith("win")
 is_linux = sys.platform.startswith("linux")
 is_osx = sys.platform == "darwin"

-# See: https://github.com/benjaminp/six/blob/master/six.py
-is_python2 = sys.version_info[0] == 2
-is_python3 = sys.version_info[0] == 3
-is_python_pre_3_5 = is_python2 or (is_python3 and sys.version_info[1] < 5)
-
-if is_python2:
-    bytes_ = str
-    unicode_ = unicode  # noqa: F821
-    basestring_ = basestring  # noqa: F821
-    input_ = raw_input  # noqa: F821
-    path2str = lambda path: str(path).decode("utf8")
-    class_types = (type, types.ClassType)
-
-elif is_python3:
-    bytes_ = bytes
-    unicode_ = str
-    basestring_ = str
-    input_ = input
-    path2str = lambda path: str(path)
-    class_types = (type, types.ClassType) if is_python_pre_3_5 else type
-
-
-def b_to_str(b_str):
-    """Convert a bytes object to a string.
-
-    b_str (bytes): The object to convert.
-    RETURNS (unicode): The converted string.
-    """
-    if is_python2:
-        return b_str
-    # Important: if no encoding is set, string becomes "b'...'"
-    return str(b_str, encoding="utf8")
-
-
 def symlink_to(orig, dest):
     """Create a symlink. Used for model shortcut links.

@@ -95,9 +55,7 @@ def symlink_to(orig, dest):
    if is_windows:
        import subprocess

-        subprocess.check_call(
-            ["mklink", "/d", path2str(orig), path2str(dest)], shell=True
-        )
+        subprocess.check_call(["mklink", "/d", str(orig), str(dest)], shell=True)
    else:
        orig.symlink_to(dest)

@@ -108,19 +66,17 @@ def symlink_remove(link):
    link (unicode / Path): The path to the symlink.
    """
    # https://stackoverflow.com/q/26554135/6400719
-    if os.path.isdir(path2str(link)) and is_windows:
+    if os.path.isdir(str(link)) and is_windows:
        # this should only be on Py2.7 and windows
-        os.rmdir(path2str(link))
+        os.rmdir(str(link))
    else:
-        os.unlink(path2str(link))
+        os.unlink(str(link))


-def is_config(python2=None, python3=None, windows=None, linux=None, osx=None):
+def is_config(windows=None, linux=None, osx=None, **kwargs):
    """Check if a specific configuration of Python version and operating system
    matches the user's setup. Mostly used to display targeted error messages.

-    python2 (bool): spaCy is executed with Python 2.x.
-    python3 (bool): spaCy is executed with Python 3.x.
    windows (bool): spaCy is executed on Windows.
    linux (bool): spaCy is executed on Linux.
    osx (bool): spaCy is executed on OS X or macOS.

@@ -129,53 +85,7 @@ def is_config(python2=None, python3=None, windows=None, linux=None, osx=None):
    DOCS: https://spacy.io/api/top-level#compat.is_config
    """
    return (
-        python2 in (None, is_python2)
-        and python3 in (None, is_python3)
-        and windows in (None, is_windows)
+        windows in (None, is_windows)
        and linux in (None, is_linux)
        and osx in (None, is_osx)
    )
-
-
-def import_file(name, loc):
-    """Import module from a file. Used to load models from a directory.
-
-    name (unicode): Name of module to load.
-    loc (unicode / Path): Path to the file.
-    RETURNS: The loaded module.
-    """
-    loc = path2str(loc)
-    if is_python_pre_3_5:
-        import imp
-
-        return imp.load_source(name, loc)
-    else:
-        import importlib.util
-
-        spec = importlib.util.spec_from_file_location(name, str(loc))
-        module = importlib.util.module_from_spec(spec)
-        spec.loader.exec_module(module)
-        return module
-
-
-def unescape_unicode(string):
-    """Python2.7's re module chokes when compiling patterns that have ranges
-    between escaped unicode codepoints if the two codepoints are unrecognised
-    in the unicode database. For instance:
-
-        re.compile('[\\uAA77-\\uAA79]').findall("hello")
-
-    Ends up matching every character (on Python 2). This problem doesn't occur
-    if we're dealing with unicode literals.
-    """
-    if string is None:
-        return string
-    # We only want to unescape the unicode, so we first must protect the other
-    # backslashes.
-    string = string.replace("\\", "\\\\")
-    # Now we remove that protection for the unicode.
-    string = string.replace("\\\\u", "\\u")
-    string = string.replace("\\\\U", "\\U")
-    # Now we unescape by evaling the string with the AST. This can't execute
-    # code -- it only does the representational level.
-    return ast.literal_eval("u'''" + string + "'''")
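With the Python 2 branches gone, `is_config` only distinguishes operating systems; the old `python2`/`python3` flags are dropped and any stray keyword arguments are absorbed by `**kwargs`. A minimal usage sketch based on the new signature above (assumes a spaCy checkout from this era is importable):

# Sketch only: calling the simplified compat helper after this change.
from spacy.compat import is_config

if is_config(windows=True):
    # True only when running on Windows; Python version checks are no longer needed.
    print("Running spaCy on Windows.")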
@@ -1,15 +1,11 @@
-# coding: utf8
 """
 spaCy's built in visualization suite for dependencies and named entities.

 DOCS: https://spacy.io/api/top-level#displacy
 USAGE: https://spacy.io/usage/visualizers
 """
-from __future__ import unicode_literals
-
 from .render import DependencyRenderer, EntityRenderer
 from ..tokens import Doc, Span
-from ..compat import b_to_str
 from ..errors import Errors, Warnings, user_warning
 from ..util import is_in_jupyter

@@ -92,20 +88,20 @@ def serve(
    render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
    httpd = simple_server.make_server(host, port, app)
-    print("\nUsing the '{}' visualizer".format(style))
-    print("Serving on http://{}:{} ...\n".format(host, port))
+    print(f"\nUsing the '{style}' visualizer")
+    print(f"Serving on http://{host}:{port} ...\n")
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
-        print("Shutting down server on port {}.".format(port))
+        print(f"Shutting down server on port {port}.")
    finally:
        httpd.server_close()


 def app(environ, start_response):
    # Headers and status need to be bytes in Python 2, see #1227
-    headers = [(b_to_str(b"Content-type"), b_to_str(b"text/html; charset=utf-8"))]
-    start_response(b_to_str(b"200 OK"), headers)
+    headers = [("Content-type", "text/html; charset=utf-8")]
+    start_response("200 OK", headers)
    res = _html["parsed"].encode(encoding="utf-8")
    return [res]
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import uuid

 from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_ARCS, TPL_ENTS

@@ -55,7 +52,7 @@ class DependencyRenderer(object):
            settings = p.get("settings", {})
            self.direction = settings.get("direction", DEFAULT_DIR)
            self.lang = settings.get("lang", DEFAULT_LANG)
-            render_id = "{}-{}".format(id_prefix, i)
+            render_id = f"{id_prefix}-{i}"
            svg = self.render_svg(render_id, p["words"], p["arcs"])
            rendered.append(svg)
        if page:
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-

 # Setting explicit height and max-width: none on the SVG is required for
 # Jupyter to render it properly in a cell
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import os
 import warnings
 import inspect

@@ -12,7 +9,7 @@ def add_codes(err_cls):
    class ErrorsWithCodes(object):
        def __getattribute__(self, code):
            msg = getattr(err_cls, code)
-            return "[{code}] {msg}".format(code=code, msg=msg)
+            return f"[{code}] {msg}"

    return ErrorsWithCodes()

@@ -98,8 +95,6 @@ class Warnings(object):
            "you can ignore this warning by setting SPACY_WARNING_IGNORE=W022. "
            "If this is surprising, make sure you have the spacy-lookups-data "
            "package installed.")
-    W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. "
-            "'n_process' will be set to 1.")
    W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
            "the Knowledge Base.")
    W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "

@@ -550,6 +545,7 @@ class Errors(object):
    E999 = ("Encountered an unexpected format for the dictionary holding "
            "gold annotations: {gold_dict}")


 @add_codes
 class TempErrors(object):
    T003 = ("Resizing pretrained Tagger models is not currently supported.")

@@ -573,10 +569,10 @@ class MatchPatternError(ValueError):
        errors (dict): Validation errors (sequence of strings) mapped to pattern
            ID, i.e. the index of the added pattern.
        """
-        msg = "Invalid token patterns for matcher rule '{}'\n".format(key)
+        msg = f"Invalid token patterns for matcher rule '{key}'\n"
        for pattern_idx, error_msgs in errors.items():
-            pattern_errors = "\n".join(["- {}".format(e) for e in error_msgs])
-            msg += "\nPattern {}:\n{}\n".format(pattern_idx, pattern_errors)
+            pattern_errors = "\n".join([f"- {e}" for e in error_msgs])
+            msg += f"\nPattern {pattern_idx}:\n{pattern_errors}\n"
        ValueError.__init__(self, msg)
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-

 def explain(term):
     """Get a description for a given POS tag, dependency label or entity type.
@@ -1,7 +1,4 @@
 # cython: profile=True
-# coding: utf8
-from __future__ import unicode_literals, print_function
-
 import re
 import random
 import numpy

@@ -14,7 +11,6 @@ import srsly
 from .syntax import nonproj
 from .tokens import Doc, Span
 from .errors import Errors, AlignmentError, user_warning, Warnings
-from .compat import path2str, basestring_
 from . import util

@@ -157,7 +153,7 @@ class GoldCorpus(object):
        self.write_msgpack(self.tmp_dir / "dev", dev, limit=self.limit)

    def __del__(self):
-        shutil.rmtree(path2str(self.tmp_dir))
+        shutil.rmtree(self.tmp_dir)

    @staticmethod
    def write_msgpack(directory, examples, limit=0):

@@ -167,7 +163,7 @@ class GoldCorpus(object):
        for i, example in enumerate(examples):
            ex_dict = example.to_dict()
            text = example.text
-            srsly.write_msgpack(directory / "{}.msg".format(i), (text, ex_dict))
+            srsly.write_msgpack(directory / f"{i}.msg", (text, ex_dict))
            n += 1
            if limit and n >= limit:
                break

@@ -221,7 +217,7 @@ class GoldCorpus(object):
            examples = [Example.from_dict(ex_dict, doc=text)]
        else:
            supported = ("json", "jsonl", "msg")
-            raise ValueError(Errors.E124.format(path=path2str(loc), formats=supported))
+            raise ValueError(Errors.E124.format(path=loc, formats=supported))
        for example in examples:
            yield example
            i += 1

@@ -862,7 +858,7 @@ cdef class Example:
        converted_examples = []
        for ex in examples:
            # convert string to Doc to Example
-            if isinstance(ex, basestring_):
+            if isinstance(ex, str):
                if keep_raw_text:
                    converted_examples.append(Example(doc=ex))
                else:

@@ -876,7 +872,7 @@ cdef class Example:
                doc, gold = ex
                gold_dict = {}
                # convert string to Doc
-                if isinstance(doc, basestring_) and not keep_raw_text:
+                if isinstance(doc, str) and not keep_raw_text:
                    doc = make_doc(doc)
                # convert dict to GoldParse
                if isinstance(gold, dict):

@@ -988,7 +984,7 @@ cdef class GoldParse:
            # Translate the None values to '-', to make processing easier.
            # See Issue #2603
            entities = [(ent if ent is not None else "-") for ent in entities]
-            if not isinstance(entities[0], basestring_):
+            if not isinstance(entities[0], str):
                # Assume we have entities specified by character offset.
                entities = biluo_tags_from_offsets(doc, entities)

@@ -1107,7 +1103,7 @@ cdef class GoldParse:
        cycle = nonproj.contains_cycle(self.heads)
        if cycle is not None:
            raise ValueError(Errors.E069.format(cycle=cycle,
-                cycle_tokens=" ".join(["'{}'".format(self.words[tok_id]) for tok_id in cycle]),
+                cycle_tokens=" ".join([f"'{self.words[tok_id]}'" for tok_id in cycle]),
                doc_tokens=" ".join(words[:50])))

    def __len__(self):
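The `basestring_` checks above become plain `isinstance(..., str)` checks because Python 3 has a single text type, so the old `basestring_`/`unicode_` compat aliases are no longer needed. A trivial illustration (not spaCy code):

# Sketch only: every text value is a str in Python 3, including non-ASCII.
for value in ["raw text", "déjà vu"]:
    assert isinstance(value, str)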
spacy/kb.pyx

@@ -1,22 +1,17 @@
 # cython: infer_types=True
 # cython: profile=True
-# coding: utf8
-from spacy.errors import Errors, Warnings, user_warning
-
 from pathlib import Path
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap

 from cpython.exc cimport PyErr_SetFromErrno

 from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
 from libc.stdint cimport int32_t, int64_t

-from .typedefs cimport hash_t
-
 from os import path
 from libcpp.vector cimport vector

+from .typedefs cimport hash_t
+from .errors import Errors, Warnings, user_warning


 cdef class Candidate:
     """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved

@@ -584,5 +579,3 @@ cdef class Reader:
    cdef int _read(self, void* value, size_t size) except -1:
        status = fread(value, size, 1, self._fp)
        return status
-
-
@ -1,6 +1,3 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...attrs import LANG
|
from ...attrs import LANG
|
||||||
|
|
|
@ -1,6 +1,3 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
|
|
||||||
# Source: https://github.com/stopwords-iso/stopwords-af
|
# Source: https://github.com/stopwords-iso/stopwords-af
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,3 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .punctuation import TOKENIZER_SUFFIXES
|
from .punctuation import TOKENIZER_SUFFIXES
|
||||||
|
|
|
@ -1,6 +1,3 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,3 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
from ...attrs import LIKE_NUM
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
_num_words = set(
|
_num_words = set(
|
||||||
|
|
|
@ -1,6 +1,3 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
|
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
|
||||||
from ..char_classes import UNITS, ALPHA_UPPER
|
from ..char_classes import UNITS, ALPHA_UPPER
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,3 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
"""
|
"""
|
||||||
من
|
من
|
||||||
|
|
|
@ -1,6 +1,3 @@
|
||||||
# coding: utf8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
from ...symbols import ORTH, LEMMA
|
from ...symbols import ORTH, LEMMA
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS
 from ...language import Language
 from ...attrs import LANG

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-

 """
 Example sentences to test spaCy and its language models.

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-

 # Source: https://github.com/Alir3z4/stop-words

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .tag_map import TAG_MAP

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-

 """
 Example sentences to test spaCy and its language models.

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import LEMMA, PRON_LEMMA

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
 from ..char_classes import ALPHA_LOWER, ALPHA, HYPHENS, CONCAT_QUOTES, UNITS

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-

 STOP_WORDS = set(
     """

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import POS, PUNCT, ADJ, CONJ, SCONJ, NUM, DET, ADV, ADP, X, VERB
 from ...symbols import CCONJ, NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, SYM

@@ -1,6 +1,3 @@
-# coding=utf-8
-from __future__ import unicode_literals
-
 from ...symbols import ORTH, LEMMA

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-

 """
 Example sentences to test spaCy and its language models.

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...attrs import LIKE_NUM

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ..punctuation import TOKENIZER_INFIXES
 from ..char_classes import ALPHA

@@ -1,7 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-
 STOP_WORDS = set(
     """
 a abans ací ah així això al aleshores algun alguna algunes alguns alhora allà allí allò

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ
 from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import ORTH, LEMMA

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 split_chars = lambda char: list(char.strip().split(" "))
 merge_chars = lambda char: char.strip().replace(" ", "|")
 group_chars = lambda char: char.strip().replace(" ", "")

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS
 from ...language import Language
 from ...attrs import LANG

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-

 # Source: https://github.com/Alir3z4/stop-words

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .norm_exceptions import NORM_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-

 """
 Example sentences to test spaCy and its language models.

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...attrs import LIKE_NUM

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import LEMMA, PRON_LEMMA

 # Source: Danish Universal Dependencies and http://fjern-uv.dk/pronom.php

@@ -1,10 +1,7 @@
-# coding: utf8
 """
 Special-case rules for normalizing tokens to improve the model's predictions.
 For example 'mysterium' vs 'mysterie' and similar.
 """
-from __future__ import unicode_literals
-

 # Sources:
 # 1: https://dsn.dk/retskrivning/om-retskrivningsordbogen/mere-om-retskrivningsordbogen-2012/endrede-stave-og-ordformer/

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ..char_classes import LIST_ELLIPSES, LIST_ICONS
 from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
 from ..punctuation import TOKENIZER_SUFFIXES

@@ -1,6 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
 # Source: Handpicked by Jens Dahl Møllerhøj.

 STOP_WORDS = set(

@@ -1,11 +1,7 @@
-# encoding: utf8
 """
 Tokenizer Exceptions.
 Source: https://forkortelse.dk/ and various others.
 """
-
-from __future__ import unicode_literals
-
 from ...symbols import ORTH, LEMMA, NORM, TAG, PUNCT

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .norm_exceptions import NORM_EXCEPTIONS
 from .punctuation import TOKENIZER_INFIXES

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-

 """
 Example sentences to test spaCy and its language models.

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 # Here we only want to include the absolute most common words. Otherwise,
 # this list would get impossibly long for German – especially considering the
 # old vs. new spelling rules, and all possible cases.

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ..char_classes import LIST_ELLIPSES, LIST_ICONS
 from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-

 STOP_WORDS = set(
     """

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import NOUN, PROPN, PRON

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import POS, PUNCT, ADJ, CCONJ, SCONJ, NUM, DET, ADV, ADP, X
 from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, VERB

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA

@@ -1,7 +1,3 @@
-# -*- coding: utf-8 -*-
-
-from __future__ import unicode_literals
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .tag_map_general import TAG_MAP
 from .stop_words import STOP_WORDS

@@ -1,7 +1,3 @@
-# -*- coding: utf-8 -*-
-
-from __future__ import unicode_literals
-
 """
 Example sentences to test spaCy and its language models.
 >>> from spacy.lang.el.examples import sentences

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-

 def get_pos_from_wiktionary():
     import re

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...lemmatizer import Lemmatizer

@@ -1,7 +1,3 @@
-# -*- coding: utf-8 -*-
-
-from __future__ import unicode_literals
-
 from ...attrs import LIKE_NUM

 _num_words = [

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-

 # These exceptions are used to add NORM values based on a token's ORTH value.
 # Norms are only set if no alternative is provided in the tokenizer exceptions.

@@ -1,7 +1,3 @@
-# -*- coding: utf-8 -*-
-
-from __future__ import unicode_literals
-
 from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
 from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
 from ..char_classes import CONCAT_QUOTES, CURRENCY

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-

 # Stop words
 # Link to greek stop words: https://www.translatum.gr/forum/index.php?topic=3550.0?topic=3550.0

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import NOUN, PROPN, PRON

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, SCONJ, NUM, DET, ADV, ADP, X, VERB
 from ...symbols import NOUN, PROPN, PART, INTJ, PRON, AUX

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ
 from ...symbols import PUNCT, NUM, AUX, X, ADJ, VERB, PART, SPACE, CCONJ

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import ORTH, LEMMA, NORM

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .norm_exceptions import NORM_EXCEPTIONS
 from .tag_map import TAG_MAP

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-

 """
 Example sentences to test spaCy and its language models.

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...attrs import LIKE_NUM

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import LEMMA, PRON_LEMMA

 # Several entries here look pretty suspicious. These will get the POS SCONJ

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-

 _exc = {
     # Slang and abbreviations
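Taken together, the language-data hunks above apply the same mechanical edit to every module: the `# coding` declaration (in its various spellings) and the `from __future__ import unicode_literals` line are removed from the top of each file, leaving the real imports or docstring as the first statements. As a rough sketch of how such a sweep could be scripted, a hypothetical helper (not part of this commit; the directory path and function name are assumptions) covering the common case where the header sits at the very top of the file might look like this:

```python
from pathlib import Path

# Header lines removed throughout this diff, in all the spellings that appear above.
PY2_HEADER = {
    "# coding: utf8",
    "# coding=utf-8",
    "# encoding: utf8",
    "# -*- coding: utf-8 -*-",
    "from __future__ import unicode_literals",
}


def strip_py2_header(path: Path) -> bool:
    """Drop the Python 2 compatibility header from one file; return True if it changed."""
    lines = path.read_text(encoding="utf8").splitlines()
    head = 0
    # Skip past leading header lines and any blank lines mixed in with them.
    while head < len(lines) and (not lines[head].strip() or lines[head].strip() in PY2_HEADER):
        head += 1
    if not any(line.strip() in PY2_HEADER for line in lines[:head]):
        return False  # nothing to strip in this file
    path.write_text("\n".join(lines[head:]) + "\n", encoding="utf8")
    return True


if __name__ == "__main__":
    changed = [p for p in Path("spacy/lang").rglob("*.py") if strip_py2_header(p)]
    print(f"Updated {len(changed)} files")
```

A sweep like this would also drop the blank lines kept as context in some hunks above and would not handle files whose `__future__` import follows a module docstring, so it is an approximation of the change rather than a reproduction of it.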
Some files were not shown because too many files have changed in this diff.