Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-24 17:06:29 +03:00)

Drop Python 2.7 and 3.5 (#4828)

* Remove unicode declarations
* Remove Python 3.5 and 2.7 from CI
* Don't require pathlib
* Replace compat helpers
* Remove OrderedDict
* Use f-strings
* Set Cython compiler language level
* Fix typo
* Re-add OrderedDict for Table
* Update setup.cfg
* Revert CONTRIBUTING.md
* Revert lookups.md
* Revert top-level.md
* Small adjustments and docs [ci skip]

parent 21b6d6e0a8
commit db55577c45

.travis.yml | 23 lines removed (file deleted)
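Most of the hunks below follow the same mechanical pattern: `# coding` and `from __future__` boilerplate is deleted, and `str.format()` calls are rewritten as f-strings. A minimal sketch of that pattern (illustrative only, not a line taken from the diff):

```python
model_name = "en_core_web_sm"

# Python 2-compatible style used before this commit
print("You can now load the model via spacy.load('{}')".format(model_name))

# Equivalent Python 3.6+ f-string style used after this commit
print(f"You can now load the model via spacy.load('{model_name}')")
```

Both calls print the same string; the f-string simply inlines the expression and drops the separate `.format()` call.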
@@ -1,23 +0,0 @@
-language: python
-sudo: false
-cache: pip
-dist: trusty
-group: edge
-python:
-  - "2.7"
-os:
-  - linux
-install:
-  - "pip install -r requirements.txt"
-  - "python setup.py build_ext --inplace"
-  - "pip install -e ."
-script:
-  - "cat /proc/cpuinfo | grep flags | head -n 1"
-  - "python -m pytest --tb=native spacy"
-branches:
-  except:
-    - spacy.io
-notifications:
-  slack:
-    secure: F8GvqnweSdzImuLL64TpfG0i5rYl89liyr9tmFVsHl4c0DNiDuGhZivUz0M1broS8svE3OPOllLfQbACG/4KxD890qfF9MoHzvRDlp7U+RtwMV/YAkYn8MGWjPIbRbX0HpGdY7O2Rc9Qy4Kk0T8ZgiqXYIqAz2Eva9/9BlSmsJQ=
-  email: false
@@ -280,23 +280,7 @@ except: # noqa: E722
 
 ### Python conventions
 
-All Python code must be written in an **intersection of Python 2 and Python 3**.
-This is easy in Cython, but somewhat ugly in Python. Logic that deals with
-Python or platform compatibility should only live in
-[`spacy.compat`](spacy/compat.py). To distinguish them from the builtin
-functions, replacement functions are suffixed with an underscore, for example
-`unicode_`. If you need to access the user's version or platform information,
-for example to show more specific error messages, you can use the `is_config()`
-helper function.
-
-```python
-from .compat import unicode_, is_config
-
-compatible_unicode = unicode_('hello world')
-if is_config(windows=True, python2=True):
-    print("You are using Python 2 on Windows.")
-```
-
+All Python code must be written **compatible with Python 3.6+**.
 Code that interacts with the file-system should accept objects that follow the
 `pathlib.Path` API, without assuming that the object inherits from `pathlib.Path`.
 If the function is user-facing and takes a path as an argument, it should check
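The path-handling convention above is easiest to see in a small sketch. The helper below is illustrative only and not part of this commit; it accepts a plain string, a `pathlib.Path`, or any object that exposes the `Path` API:

```python
from pathlib import Path

def read_meta(path):
    """Read a text file from a str, Path, or Path-like object (hypothetical helper)."""
    if isinstance(path, str):
        path = Path(path)
    # From here on, only the pathlib.Path API is used, without assuming
    # that `path` actually inherits from pathlib.Path.
    if not path.exists():
        raise IOError(f"Can't find file: {path}")
    with path.open("r", encoding="utf8") as f:
        return f.read()
```

A caller can then pass either form, e.g. `read_meta("meta.json")` or `read_meta(Path("meta.json"))`.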
@@ -15,7 +15,6 @@ It's commercial open-source software, released under the MIT license.
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)
 
 [![Azure Pipelines](<https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build+(3.x)>)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
-[![Travis Build Status](<https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square&logo=travis-ci&logoColor=white&label=build+(2.7)>)](https://travis-ci.org/explosion/spaCy)
 [![Current Release Version](https://img.shields.io/github/release/explosion/spacy.svg?style=flat-square&logo=github)](https://github.com/explosion/spaCy/releases)
 [![pypi Version](https://img.shields.io/pypi/v/spacy.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/spacy/)
 [![conda Version](https://img.shields.io/conda/vn/conda-forge/spacy.svg?style=flat-square&logo=conda-forge&logoColor=white)](https://anaconda.org/conda-forge/spacy)
@@ -98,7 +97,7 @@ For detailed installation instructions, see the
 
 - **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual
   Studio)
-- **Python version**: Python 2.7, 3.5+ (only 64 bit)
+- **Python version**: Python 3.6+ (only 64 bit)
 - **Package managers**: [pip] · [conda] (via `conda-forge`)
 
 [pip]: https://pypi.org/project/spacy/
@@ -269,9 +268,7 @@ and git preinstalled.
 Install a version of the
 [Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/)
 or [Visual Studio Express](https://visualstudio.microsoft.com/vs/express/) that
-matches the version that was used to compile your Python interpreter. For
-official distributions these are VS 2008 (Python 2.7), VS 2010 (Python 3.4) and
-VS 2015 (Python 3.5).
+matches the version that was used to compile your Python interpreter.
 
 ## Run tests
 
@@ -35,12 +35,6 @@ jobs:
   dependsOn: 'Validate'
   strategy:
     matrix:
-      Python35Linux:
-        imageName: 'ubuntu-16.04'
-        python.version: '3.5'
-      Python35Windows:
-        imageName: 'vs2017-win2016'
-        python.version: '3.5'
       Python36Linux:
         imageName: 'ubuntu-16.04'
         python.version: '3.6'
@@ -38,14 +38,14 @@ import argparse
 HASH_FILE = "cythonize.json"
 
 
-def process_pyx(fromfile, tofile, language_level="-2"):
+def process_pyx(fromfile, tofile, language_level="-3"):
     print("Processing %s" % fromfile)
     try:
         from Cython.Compiler.Version import version as cython_version
         from distutils.version import LooseVersion
 
-        if LooseVersion(cython_version) < LooseVersion("0.19"):
-            raise Exception("Require Cython >= 0.19")
+        if LooseVersion(cython_version) < LooseVersion("0.25"):
+            raise Exception("Require Cython >= 0.25")
 
     except ImportError:
         pass
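For readers unfamiliar with the flag changed above: `language_level` tells Cython whether to compile `.pyx` sources with Python 2 or Python 3 semantics. A common way to pin it when building extensions with setuptools is shown below; this is a generic sketch with made-up module names, not spaCy's actual build configuration:

```python
from setuptools import Extension, setup
from Cython.Build import cythonize

# Hypothetical extension module, for illustration only.
extensions = [Extension("mypkg.fast_ops", ["mypkg/fast_ops.pyx"])]

setup(
    name="mypkg",
    ext_modules=cythonize(
        extensions,
        # Compile with Python 3 semantics, matching the "-3" default above.
        compiler_directives={"language_level": 3},
    ),
)
```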
fabfile.py (vendored) | 3 deletions

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals, print_function
-
 import contextlib
 from pathlib import Path
 from fabric.api import local, lcd, env, settings, prefix
@@ -11,7 +11,6 @@ catalogue>=0.0.7,<1.1.0
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
 plac>=0.9.6,<1.2.0
-pathlib==1.0.1; python_version < "3.4"
 tqdm>=4.38.0,<5.0.0
 # Optional dependencies
 jsonschema>=2.6.0,<3.1.0
@@ -16,10 +16,7 @@ classifiers =
     Operating System :: MacOS :: MacOS X
     Operating System :: Microsoft :: Windows
     Programming Language :: Cython
-    Programming Language :: Python :: 2
-    Programming Language :: Python :: 2.7
     Programming Language :: Python :: 3
-    Programming Language :: Python :: 3.5
     Programming Language :: Python :: 3.6
     Programming Language :: Python :: 3.7
     Programming Language :: Python :: 3.8
@@ -30,7 +27,7 @@ zip_safe = false
 include_package_data = true
 scripts =
     bin/spacy
-python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*
+python_requires = >=3.6
 setup_requires =
     wheel
     cython>=0.25
@@ -54,7 +51,6 @@ install_requires =
     numpy>=1.15.0
     plac>=0.9.6,<1.2.0
     requests>=2.13.0,<3.0.0
-    pathlib==1.0.1; python_version < "3.4"
 
 [options.extras_require]
 lookups =
setup.py | 1 deletion

@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-from __future__ import print_function
 import io
 import os
 import subprocess
@@ -1,5 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
 import warnings
 import sys
 
@@ -1,9 +1,3 @@
-# coding: utf8
-from __future__ import print_function
-
-# NB! This breaks in plac on Python 2!!
-# from __future__ import unicode_literals
-
 if __name__ == "__main__":
     import plac
     import sys
@@ -32,5 +26,5 @@ if __name__ == "__main__":
     if command in commands:
         plac.call(commands[command], sys.argv[1:])
     else:
-        available = "Available: {}".format(", ".join(commands))
-        msg.fail("Unknown command: {}".format(command), available, exits=1)
+        available = f"Available: {', '.join(commands)}"
+        msg.fail(f"Unknown command: {command}", available, exits=1)
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import numpy
 from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu
 from thinc.t2t import ExtractWindow, ParametricAttention
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from collections import OrderedDict
 from wasabi import Printer
 
 from .tokens import Doc, Token, Span
@@ -23,7 +19,7 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True):
     assert pipeline[index][0] == name
     prev_pipes = pipeline[:index]
     pipe_requires = getattr(pipe, "requires", [])
-    requires = OrderedDict([(annot, False) for annot in pipe_requires])
+    requires = {annot: False for annot in pipe_requires}
     if requires:
         for prev_name, prev_pipe in prev_pipes:
             prev_assigns = getattr(prev_pipe, "assigns", [])
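The `OrderedDict` → dict-comprehension change above (and the commit-message note about re-adding `OrderedDict` only for `Table`) relies on plain dicts preserving insertion order on Python 3.6+, a guarantee that became part of the language in 3.7. A small self-contained check of that equivalence, not part of the diff:

```python
from collections import OrderedDict

pipe_requires = ["doc.ents", "token.dep", "token.tag"]

# Old, Python 2-safe spelling
ordered = OrderedDict([(annot, False) for annot in pipe_requires])

# New spelling: on Python 3.6+ a plain dict keeps insertion order
plain = {annot: False for annot in pipe_requires}

assert list(ordered) == list(plain) == pipe_requires
```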
@@ -98,15 +94,15 @@ def validate_attrs(values):
            for ext_attr, ext_value in value.items():
                # We don't check whether the attribute actually exists
                if ext_value is not True:  # attr is something like doc._.x.y
-                    good = "{}._.{}".format(obj_key, ext_attr)
-                    bad = "{}.{}".format(good, ".".join(ext_value))
+                    good = f"{obj_key}._.{ext_attr}"
+                    bad = f"{good}.{'.'.join(ext_value)}"
                    raise ValueError(Errors.E183.format(attr=bad, solution=good))
            continue  # we can't validate those further
        if attr.endswith("_"):  # attr is something like "token.pos_"
            raise ValueError(Errors.E184.format(attr=attr, solution=attr[:-1]))
        if value is not True:  # attr is something like doc.x.y
-            good = "{}.{}".format(obj_key, attr)
-            bad = "{}.{}".format(good, ".".join(value))
+            good = f"{obj_key}.{attr}"
+            bad = f"{good}.{'.'.join(value)}"
            raise ValueError(Errors.E183.format(attr=bad, solution=good))
        obj = objs[obj_key]
        if not hasattr(obj, attr):
@@ -168,11 +164,10 @@ def print_summary(nlp, pretty=True, no_print=False):
     msg.table(overview, header=header, divider=True, multiline=True)
     n_problems = sum(len(p) for p in problems.values())
     if any(p for p in problems.values()):
-        msg.divider("Problems ({})".format(n_problems))
+        msg.divider(f"Problems ({n_problems})")
         for name, problem in problems.items():
             if problem:
-                problem = ", ".join(problem)
-                msg.warn("'{}' requirements not met: {}".format(name, problem))
+                msg.warn(f"'{name}' requirements not met: {', '.join(problem)}")
     else:
         msg.good("No problems found.")
     if no_print:
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
IDS = {
|
||||
"": NULL_ATTR,
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
# NB: This schema describes the new format of the training data, see #2928
|
||||
TRAINING_SCHEMA = {
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
from pathlib import Path
|
||||
from wasabi import Printer
|
||||
|
@ -30,16 +27,18 @@ FILE_TYPES_STDOUT = ("json", "jsonl")
|
|||
|
||||
|
||||
@plac.annotations(
|
||||
# fmt: off
|
||||
input_file=("Input file", "positional", None, str),
|
||||
output_dir=("Output directory. '-' for stdout.", "positional", None, str),
|
||||
file_type=("Type of data to produce: {}".format(FILE_TYPES), "option", "t", str),
|
||||
file_type=(f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES),
|
||||
n_sents=("Number of sentences per doc (0 to disable)", "option", "n", int),
|
||||
seg_sents=("Segment sentences (for -c ner)", "flag", "s"),
|
||||
model=("Model for sentence segmentation (for -s)", "option", "b", str),
|
||||
converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str),
|
||||
converter=(f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str),
|
||||
lang=("Language (if tokenizer required)", "option", "l", str),
|
||||
morphology=("Enable appending morphology to tags", "flag", "m", bool),
|
||||
ner_map_path=("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path),
|
||||
ner_map_path=("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path,),
|
||||
# fmt: on
|
||||
)
|
||||
def convert(
|
||||
input_file,
|
||||
|
@ -62,16 +61,10 @@ def convert(
|
|||
no_print = output_dir == "-"
|
||||
msg = Printer(no_print=no_print)
|
||||
input_path = Path(input_file)
|
||||
if file_type not in FILE_TYPES:
|
||||
msg.fail(
|
||||
"Unknown file type: '{}'".format(file_type),
|
||||
"Supported file types: '{}'".format(", ".join(FILE_TYPES)),
|
||||
exits=1,
|
||||
)
|
||||
if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
|
||||
# TODO: support msgpack via stdout in srsly?
|
||||
msg.fail(
|
||||
"Can't write .{} data to stdout.".format(file_type),
|
||||
f"Can't write .{file_type} data to stdout",
|
||||
"Please specify an output directory.",
|
||||
exits=1,
|
||||
)
|
||||
|
@ -95,7 +88,7 @@ def convert(
|
|||
"Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert"
|
||||
)
|
||||
if converter not in CONVERTERS:
|
||||
msg.fail("Can't find converter for {}".format(converter), exits=1)
|
||||
msg.fail(f"Can't find converter for {converter}", exits=1)
|
||||
ner_map = None
|
||||
if ner_map_path is not None:
|
||||
ner_map = srsly.read_json(ner_map_path)
|
||||
|
@ -113,7 +106,7 @@ def convert(
|
|||
)
|
||||
if output_dir != "-":
|
||||
# Export data to a file
|
||||
suffix = ".{}".format(file_type)
|
||||
suffix = f".{file_type}"
|
||||
output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
|
||||
if file_type == "json":
|
||||
srsly.write_json(output_file, data)
|
||||
|
@ -121,9 +114,7 @@ def convert(
|
|||
srsly.write_jsonl(output_file, data)
|
||||
elif file_type == "msg":
|
||||
srsly.write_msgpack(output_file, data)
|
||||
msg.good(
|
||||
"Generated output file ({} documents): {}".format(len(data), output_file)
|
||||
)
|
||||
msg.good(f"Generated output file ({len(data)} documents): {output_file}")
|
||||
else:
|
||||
# Print to stdout
|
||||
if file_type == "json":
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from wasabi import Printer
|
||||
|
||||
from ...gold import iob_to_biluo
|
||||
|
@ -64,9 +61,9 @@ def conll_ner2json(
|
|||
# sentence segmentation required for document segmentation
|
||||
if n_sents > 0 and not seg_sents:
|
||||
msg.warn(
|
||||
"No sentence boundaries found to use with option `-n {}`. "
|
||||
"Use `-s` to automatically segment sentences or `-n 0` "
|
||||
"to disable.".format(n_sents)
|
||||
f"No sentence boundaries found to use with option `-n {n_sents}`. "
|
||||
f"Use `-s` to automatically segment sentences or `-n 0` "
|
||||
f"to disable."
|
||||
)
|
||||
else:
|
||||
n_sents_info(msg, n_sents)
|
||||
|
@ -129,7 +126,7 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
|
|||
if model:
|
||||
nlp = load_model(model)
|
||||
if "parser" in nlp.pipe_names:
|
||||
msg.info("Segmenting sentences with parser from model '{}'.".format(model))
|
||||
msg.info(f"Segmenting sentences with parser from model '{model}'.")
|
||||
sentencizer = nlp.get_pipe("parser")
|
||||
if not sentencizer:
|
||||
msg.info(
|
||||
|
@ -166,7 +163,7 @@ def segment_docs(input_data, n_sents, doc_delimiter):
|
|||
|
||||
|
||||
def n_sents_info(msg, n_sents):
|
||||
msg.info("Grouping every {} sentences into a document.".format(n_sents))
|
||||
msg.info(f"Grouping every {n_sents} sentences into a document.")
|
||||
if n_sents == 1:
|
||||
msg.warn(
|
||||
"To generate better training data, you may want to group "
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import re
|
||||
|
||||
from spacy.gold import Example
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from wasabi import Printer
|
||||
|
||||
from ...gold import iob_to_biluo
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import srsly
|
||||
|
||||
from ...gold import docs_to_json
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
from pathlib import Path
|
||||
from collections import Counter
|
||||
import plac
|
||||
|
@ -23,20 +20,17 @@ BLANK_MODEL_THRESHOLD = 2000
|
|||
|
||||
|
||||
@plac.annotations(
|
||||
# fmt: off
|
||||
lang=("model language", "positional", None, str),
|
||||
train_path=("location of JSON-formatted training data", "positional", None, Path),
|
||||
dev_path=("location of JSON-formatted development data", "positional", None, Path),
|
||||
tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
|
||||
base_model=("name of model to update (optional)", "option", "b", str),
|
||||
pipeline=(
|
||||
"Comma-separated names of pipeline components to train",
|
||||
"option",
|
||||
"p",
|
||||
str,
|
||||
),
|
||||
pipeline=("Comma-separated names of pipeline components to train", "option", "p", str),
|
||||
ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool),
|
||||
verbose=("Print additional information and explanations", "flag", "V", bool),
|
||||
no_format=("Don't pretty-print the results", "flag", "NF", bool),
|
||||
# fmt: on
|
||||
)
|
||||
def debug_data(
|
||||
lang,
|
||||
|
@ -93,15 +87,11 @@ def debug_data(
|
|||
corpus.train_dataset_without_preprocessing(nlp)
|
||||
)
|
||||
except ValueError as e:
|
||||
loading_train_error_message = "Training data cannot be loaded: {}".format(
|
||||
str(e)
|
||||
)
|
||||
loading_train_error_message = f"Training data cannot be loaded: {e}"
|
||||
try:
|
||||
dev_dataset = list(corpus.dev_dataset(nlp))
|
||||
except ValueError as e:
|
||||
loading_dev_error_message = "Development data cannot be loaded: {}".format(
|
||||
str(e)
|
||||
)
|
||||
loading_dev_error_message = f"Development data cannot be loaded: {e}"
|
||||
if loading_train_error_message or loading_dev_error_message:
|
||||
if loading_train_error_message:
|
||||
msg.fail(loading_train_error_message)
|
||||
|
@ -112,78 +102,66 @@ def debug_data(
|
|||
|
||||
# Create all gold data here to avoid iterating over the train_dataset constantly
|
||||
gold_train_data = _compile_gold(train_dataset, pipeline)
|
||||
gold_train_unpreprocessed_data = _compile_gold(train_dataset_unpreprocessed, pipeline)
|
||||
gold_train_unpreprocessed_data = _compile_gold(
|
||||
train_dataset_unpreprocessed, pipeline
|
||||
)
|
||||
gold_dev_data = _compile_gold(dev_dataset, pipeline)
|
||||
|
||||
train_texts = gold_train_data["texts"]
|
||||
dev_texts = gold_dev_data["texts"]
|
||||
|
||||
msg.divider("Training stats")
|
||||
msg.text("Training pipeline: {}".format(", ".join(pipeline)))
|
||||
msg.text(f"Training pipeline: {', '.join(pipeline)}")
|
||||
for pipe in [p for p in pipeline if p not in nlp.factories]:
|
||||
msg.fail("Pipeline component '{}' not available in factories".format(pipe))
|
||||
msg.fail(f"Pipeline component '{pipe}' not available in factories")
|
||||
if base_model:
|
||||
msg.text("Starting with base model '{}'".format(base_model))
|
||||
msg.text(f"Starting with base model '{base_model}'")
|
||||
else:
|
||||
msg.text("Starting with blank model '{}'".format(lang))
|
||||
msg.text("{} training docs".format(len(train_dataset)))
|
||||
msg.text("{} evaluation docs".format(len(gold_dev_data)))
|
||||
msg.text(f"Starting with blank model '{lang}'")
|
||||
msg.text(f"{len(train_dataset)} training docs")
|
||||
msg.text(f"{len(gold_dev_data)} evaluation docs")
|
||||
|
||||
if not len(gold_dev_data):
|
||||
msg.fail("No evaluation docs")
|
||||
overlap = len(train_texts.intersection(dev_texts))
|
||||
if overlap:
|
||||
msg.warn("{} training examples also in evaluation data".format(overlap))
|
||||
msg.warn(f"{overlap} training examples also in evaluation data")
|
||||
else:
|
||||
msg.good("No overlap between training and evaluation data")
|
||||
if not base_model and len(train_dataset) < BLANK_MODEL_THRESHOLD:
|
||||
text = "Low number of examples to train from a blank model ({})".format(
|
||||
len(train_dataset)
|
||||
text = (
|
||||
f"Low number of examples to train from a blank model ({len(train_dataset)})"
|
||||
)
|
||||
if len(train_dataset) < BLANK_MODEL_MIN_THRESHOLD:
|
||||
msg.fail(text)
|
||||
else:
|
||||
msg.warn(text)
|
||||
msg.text(
|
||||
"It's recommended to use at least {} examples (minimum {})".format(
|
||||
BLANK_MODEL_THRESHOLD, BLANK_MODEL_MIN_THRESHOLD
|
||||
),
|
||||
f"It's recommended to use at least {BLANK_MODEL_THRESHOLD} examples "
|
||||
f"(minimum {BLANK_MODEL_MIN_THRESHOLD})",
|
||||
show=verbose,
|
||||
)
|
||||
|
||||
msg.divider("Vocab & Vectors")
|
||||
n_words = gold_train_data["n_words"]
|
||||
msg.info(
|
||||
"{} total {} in the data ({} unique)".format(
|
||||
n_words, "word" if n_words == 1 else "words", len(gold_train_data["words"])
|
||||
)
|
||||
f"{n_words} total word(s) in the data ({len(gold_train_data['words'])} unique)"
|
||||
)
|
||||
if gold_train_data["n_misaligned_words"] > 0:
|
||||
msg.warn(
|
||||
"{} misaligned tokens in the training data".format(
|
||||
gold_train_data["n_misaligned_words"]
|
||||
)
|
||||
)
|
||||
n_misaligned = gold_train_data["n_misaligned_words"]
|
||||
msg.warn(f"{n_misaligned} misaligned tokens in the training data")
|
||||
if gold_dev_data["n_misaligned_words"] > 0:
|
||||
msg.warn(
|
||||
"{} misaligned tokens in the dev data".format(
|
||||
gold_dev_data["n_misaligned_words"]
|
||||
)
|
||||
)
|
||||
n_misaligned = gold_dev_data["n_misaligned_words"]
|
||||
msg.warn(f"{n_misaligned} misaligned tokens in the dev data")
|
||||
most_common_words = gold_train_data["words"].most_common(10)
|
||||
msg.text(
|
||||
"10 most common words: {}".format(
|
||||
_format_labels(most_common_words, counts=True)
|
||||
),
|
||||
f"10 most common words: {_format_labels(most_common_words, counts=True)}",
|
||||
show=verbose,
|
||||
)
|
||||
if len(nlp.vocab.vectors):
|
||||
msg.info(
|
||||
"{} vectors ({} unique keys, {} dimensions)".format(
|
||||
len(nlp.vocab.vectors),
|
||||
nlp.vocab.vectors.n_keys,
|
||||
nlp.vocab.vectors_length,
|
||||
)
|
||||
f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} "
|
||||
f"unique keys, {nlp.vocab.vectors_length} dimensions)"
|
||||
)
|
||||
else:
|
||||
msg.info("No word vectors present in the model")
|
||||
|
@ -203,19 +181,10 @@ def debug_data(
|
|||
|
||||
msg.divider("Named Entity Recognition")
|
||||
msg.info(
|
||||
"{} new {}, {} existing {}".format(
|
||||
len(new_labels),
|
||||
"label" if len(new_labels) == 1 else "labels",
|
||||
len(existing_labels),
|
||||
"label" if len(existing_labels) == 1 else "labels",
|
||||
)
|
||||
f"{len(new_labels)} new label(s), {len(existing_labels)} existing label(s)"
|
||||
)
|
||||
missing_values = label_counts["-"]
|
||||
msg.text(
|
||||
"{} missing {} (tokens with '-' label)".format(
|
||||
missing_values, "value" if missing_values == 1 else "values"
|
||||
)
|
||||
)
|
||||
msg.text(f"{missing_values} missing value(s) (tokens with '-' label)")
|
||||
for label in new_labels:
|
||||
if len(label) == 0:
|
||||
msg.fail("Empty label found in new labels")
|
||||
|
@ -226,33 +195,24 @@ def debug_data(
|
|||
if label != "-"
|
||||
]
|
||||
labels_with_counts = _format_labels(labels_with_counts, counts=True)
|
||||
msg.text("New: {}".format(labels_with_counts), show=verbose)
|
||||
msg.text(f"New: {labels_with_counts}", show=verbose)
|
||||
if existing_labels:
|
||||
msg.text(
|
||||
"Existing: {}".format(_format_labels(existing_labels)), show=verbose
|
||||
)
|
||||
|
||||
msg.text(f"Existing: {_format_labels(existing_labels)}", show=verbose)
|
||||
if gold_train_data["ws_ents"]:
|
||||
msg.fail(
|
||||
"{} invalid whitespace entity spans".format(gold_train_data["ws_ents"])
|
||||
)
|
||||
msg.fail(f"{gold_train_data['ws_ents']} invalid whitespace entity spans")
|
||||
has_ws_ents_error = True
|
||||
|
||||
for label in new_labels:
|
||||
if label_counts[label] <= NEW_LABEL_THRESHOLD:
|
||||
msg.warn(
|
||||
"Low number of examples for new label '{}' ({})".format(
|
||||
label, label_counts[label]
|
||||
)
|
||||
f"Low number of examples for new label '{label}' ({label_counts[label]})"
|
||||
)
|
||||
has_low_data_warning = True
|
||||
|
||||
with msg.loading("Analyzing label distribution..."):
|
||||
neg_docs = _get_examples_without_label(train_dataset, label)
|
||||
if neg_docs == 0:
|
||||
msg.warn(
|
||||
"No examples for texts WITHOUT new label '{}'".format(label)
|
||||
)
|
||||
msg.warn(f"No examples for texts WITHOUT new label '{label}'")
|
||||
has_no_neg_warning = True
|
||||
|
||||
if not has_low_data_warning:
|
||||
|
@ -264,8 +224,8 @@ def debug_data(
|
|||
|
||||
if has_low_data_warning:
|
||||
msg.text(
|
||||
"To train a new entity type, your data should include at "
|
||||
"least {} instances of the new label".format(NEW_LABEL_THRESHOLD),
|
||||
f"To train a new entity type, your data should include at "
|
||||
f"least {NEW_LABEL_THRESHOLD} instances of the new label",
|
||||
show=verbose,
|
||||
)
|
||||
if has_no_neg_warning:
|
||||
|
@ -288,27 +248,21 @@ def debug_data(
|
|||
new_labels = [l for l in labels if l not in model_labels]
|
||||
existing_labels = [l for l in labels if l in model_labels]
|
||||
msg.info(
|
||||
"Text Classification: {} new label(s), {} existing label(s)".format(
|
||||
len(new_labels), len(existing_labels)
|
||||
)
|
||||
f"Text Classification: {len(new_labels)} new label(s), "
|
||||
f"{len(existing_labels)} existing label(s)"
|
||||
)
|
||||
if new_labels:
|
||||
labels_with_counts = _format_labels(
|
||||
gold_train_data["cats"].most_common(), counts=True
|
||||
)
|
||||
msg.text("New: {}".format(labels_with_counts), show=verbose)
|
||||
msg.text(f"New: {labels_with_counts}", show=verbose)
|
||||
if existing_labels:
|
||||
msg.text(
|
||||
"Existing: {}".format(_format_labels(existing_labels)), show=verbose
|
||||
)
|
||||
msg.text(f"Existing: {_format_labels(existing_labels)}", show=verbose)
|
||||
if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
|
||||
msg.fail(
|
||||
"The train and dev labels are not the same. "
|
||||
"Train labels: {}. "
|
||||
"Dev labels: {}.".format(
|
||||
_format_labels(gold_train_data["cats"]),
|
||||
_format_labels(gold_dev_data["cats"]),
|
||||
)
|
||||
f"The train and dev labels are not the same. "
|
||||
f"Train labels: {_format_labels(gold_train_data['cats'])}. "
|
||||
f"Dev labels: {_format_labels(gold_dev_data['cats'])}."
|
||||
)
|
||||
if gold_train_data["n_cats_multilabel"] > 0:
|
||||
msg.info(
|
||||
|
@ -338,27 +292,16 @@ def debug_data(
|
|||
msg.divider("Part-of-speech Tagging")
|
||||
labels = [label for label in gold_train_data["tags"]]
|
||||
tag_map = nlp.vocab.morphology.tag_map
|
||||
msg.info(
|
||||
"{} {} in data ({} {} in tag map)".format(
|
||||
len(labels),
|
||||
"label" if len(labels) == 1 else "labels",
|
||||
len(tag_map),
|
||||
"label" if len(tag_map) == 1 else "labels",
|
||||
)
|
||||
)
|
||||
msg.info(f"{len(labels)} label(s) in data ({len(tag_map)} label(s) in tag map)")
|
||||
labels_with_counts = _format_labels(
|
||||
gold_train_data["tags"].most_common(), counts=True
|
||||
)
|
||||
msg.text(labels_with_counts, show=verbose)
|
||||
non_tagmap = [l for l in labels if l not in tag_map]
|
||||
if not non_tagmap:
|
||||
msg.good("All labels present in tag map for language '{}'".format(nlp.lang))
|
||||
msg.good(f"All labels present in tag map for language '{nlp.lang}'")
|
||||
for label in non_tagmap:
|
||||
msg.fail(
|
||||
"Label '{}' not found in tag map for language '{}'".format(
|
||||
label, nlp.lang
|
||||
)
|
||||
)
|
||||
msg.fail(f"Label '{label}' not found in tag map for language '{nlp.lang}'")
|
||||
|
||||
if "parser" in pipeline:
|
||||
has_low_data_warning = False
|
||||
|
@ -366,21 +309,18 @@ def debug_data(
|
|||
|
||||
# profile sentence length
|
||||
msg.info(
|
||||
"Found {} sentence{} with an average length of {:.1f} words.".format(
|
||||
gold_train_data["n_sents"],
|
||||
"s" if len(train_dataset) > 1 else "",
|
||||
gold_train_data["n_words"] / gold_train_data["n_sents"],
|
||||
)
|
||||
f"Found {gold_train_data['n_sents']} sentence(s) with an average "
|
||||
f"length of {gold_train_data['n_words'] / gold_train_data['n_sents']:.1f} words."
|
||||
)
|
||||
|
||||
# check for documents with multiple sentences
|
||||
sents_per_doc = gold_train_data["n_sents"] / len(gold_train_data["texts"])
|
||||
if sents_per_doc < 1.1:
|
||||
msg.warn(
|
||||
"The training data contains {:.2f} sentences per "
|
||||
"document. When there are very few documents containing more "
|
||||
"than one sentence, the parser will not learn how to segment "
|
||||
"longer texts into sentences.".format(sents_per_doc)
|
||||
f"The training data contains {sents_per_doc:.2f} sentences per "
|
||||
f"document. When there are very few documents containing more "
|
||||
f"than one sentence, the parser will not learn how to segment "
|
||||
f"longer texts into sentences."
|
||||
)
|
||||
|
||||
# profile labels
|
||||
|
@ -391,32 +331,13 @@ def debug_data(
|
|||
labels_dev = [label for label in gold_dev_data["deps"]]
|
||||
|
||||
if gold_train_unpreprocessed_data["n_nonproj"] > 0:
|
||||
msg.info(
|
||||
"Found {} nonprojective train sentence{}".format(
|
||||
gold_train_unpreprocessed_data["n_nonproj"],
|
||||
"s" if gold_train_unpreprocessed_data["n_nonproj"] > 1 else "",
|
||||
)
|
||||
)
|
||||
n_nonproj = gold_train_unpreprocessed_data["n_nonproj"]
|
||||
msg.info(f"Found {n_nonproj} nonprojective train sentence(s)")
|
||||
if gold_dev_data["n_nonproj"] > 0:
|
||||
msg.info(
|
||||
"Found {} nonprojective dev sentence{}".format(
|
||||
gold_dev_data["n_nonproj"],
|
||||
"s" if gold_dev_data["n_nonproj"] > 1 else "",
|
||||
)
|
||||
)
|
||||
|
||||
msg.info(
|
||||
"{} {} in train data".format(
|
||||
len(labels_train_unpreprocessed),
|
||||
"label" if len(labels_train) == 1 else "labels",
|
||||
)
|
||||
)
|
||||
msg.info(
|
||||
"{} {} in projectivized train data".format(
|
||||
len(labels_train), "label" if len(labels_train) == 1 else "labels"
|
||||
)
|
||||
)
|
||||
|
||||
n_nonproj = gold_dev_data["n_nonproj"]
|
||||
msg.info(f"Found {n_nonproj} nonprojective dev sentence(s)")
|
||||
msg.info(f"{labels_train_unpreprocessed} label(s) in train data")
|
||||
msg.info(f"{len(labels_train)} label(s) in projectivized train data")
|
||||
labels_with_counts = _format_labels(
|
||||
gold_train_unpreprocessed_data["deps"].most_common(), counts=True
|
||||
)
|
||||
|
@ -426,9 +347,8 @@ def debug_data(
|
|||
for label in gold_train_unpreprocessed_data["deps"]:
|
||||
if gold_train_unpreprocessed_data["deps"][label] <= DEP_LABEL_THRESHOLD:
|
||||
msg.warn(
|
||||
"Low number of examples for label '{}' ({})".format(
|
||||
label, gold_train_unpreprocessed_data["deps"][label]
|
||||
)
|
||||
f"Low number of examples for label '{label}' "
|
||||
f"({gold_train_unpreprocessed_data['deps'][label]})"
|
||||
)
|
||||
has_low_data_warning = True
|
||||
|
||||
|
@ -437,22 +357,19 @@ def debug_data(
|
|||
for label in gold_train_data["deps"]:
|
||||
if gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD and "||" in label:
|
||||
rare_projectivized_labels.append(
|
||||
"{}: {}".format(label, str(gold_train_data["deps"][label]))
|
||||
f"{label}: {gold_train_data['deps'][label]}"
|
||||
)
|
||||
|
||||
if len(rare_projectivized_labels) > 0:
|
||||
msg.warn(
|
||||
"Low number of examples for {} label{} in the "
|
||||
"projectivized dependency trees used for training. You may "
|
||||
"want to projectivize labels such as punct before "
|
||||
"training in order to improve parser performance.".format(
|
||||
len(rare_projectivized_labels),
|
||||
"s" if len(rare_projectivized_labels) > 1 else "",
|
||||
)
|
||||
f"Low number of examples for {len(rare_projectivized_labels)} "
|
||||
"label(s) in the projectivized dependency trees used for "
|
||||
"training. You may want to projectivize labels such as punct "
|
||||
"before training in order to improve parser performance."
|
||||
)
|
||||
msg.warn(
|
||||
"Projectivized labels with low numbers of examples: "
|
||||
"{}".format("\n".join(rare_projectivized_labels)),
|
||||
f"Projectivized labels with low numbers of examples: ",
|
||||
", ".join(rare_projectivized_labels),
|
||||
show=verbose,
|
||||
)
|
||||
has_low_data_warning = True
|
||||
|
@ -460,50 +377,44 @@ def debug_data(
|
|||
# labels only in train
|
||||
if set(labels_train) - set(labels_dev):
|
||||
msg.warn(
|
||||
"The following labels were found only in the train data: "
|
||||
"{}".format(", ".join(set(labels_train) - set(labels_dev))),
|
||||
"The following labels were found only in the train data:",
|
||||
", ".join(set(labels_train) - set(labels_dev)),
|
||||
show=verbose,
|
||||
)
|
||||
|
||||
# labels only in dev
|
||||
if set(labels_dev) - set(labels_train):
|
||||
msg.warn(
|
||||
"The following labels were found only in the dev data: "
|
||||
+ ", ".join(set(labels_dev) - set(labels_train)),
|
||||
"The following labels were found only in the dev data:",
|
||||
", ".join(set(labels_dev) - set(labels_train)),
|
||||
show=verbose,
|
||||
)
|
||||
|
||||
if has_low_data_warning:
|
||||
msg.text(
|
||||
"To train a parser, your data should include at "
|
||||
"least {} instances of each label.".format(DEP_LABEL_THRESHOLD),
|
||||
f"To train a parser, your data should include at "
|
||||
f"least {DEP_LABEL_THRESHOLD} instances of each label.",
|
||||
show=verbose,
|
||||
)
|
||||
|
||||
# multiple root labels
|
||||
if len(gold_train_unpreprocessed_data["roots"]) > 1:
|
||||
msg.warn(
|
||||
"Multiple root labels ({}) ".format(
|
||||
", ".join(gold_train_unpreprocessed_data["roots"])
|
||||
)
|
||||
+ "found in training data. spaCy's parser uses a single root "
|
||||
"label ROOT so this distinction will not be available."
|
||||
f"Multiple root labels "
|
||||
f"({', '.join(gold_train_unpreprocessed_data['roots'])}) "
|
||||
f"found in training data. spaCy's parser uses a single root "
|
||||
f"label ROOT so this distinction will not be available."
|
||||
)
|
||||
|
||||
# these should not happen, but just in case
|
||||
if gold_train_data["n_nonproj"] > 0:
|
||||
msg.fail(
|
||||
"Found {} nonprojective projectivized train sentence{}".format(
|
||||
gold_train_data["n_nonproj"],
|
||||
"s" if gold_train_data["n_nonproj"] > 1 else "",
|
||||
)
|
||||
f"Found {gold_train_data['n_nonproj']} nonprojective "
|
||||
f"projectivized train sentence(s)"
|
||||
)
|
||||
if gold_train_data["n_cycles"] > 0:
|
||||
msg.fail(
|
||||
"Found {} projectivized train sentence{} with cycles".format(
|
||||
gold_train_data["n_cycles"],
|
||||
"s" if gold_train_data["n_cycles"] > 1 else "",
|
||||
)
|
||||
f"Found {gold_train_data['n_cycles']} projectivized train sentence(s) with cycles"
|
||||
)
|
||||
|
||||
msg.divider("Summary")
|
||||
|
@ -511,36 +422,28 @@ def debug_data(
|
|||
warn_counts = msg.counts[MESSAGES.WARN]
|
||||
fail_counts = msg.counts[MESSAGES.FAIL]
|
||||
if good_counts:
|
||||
msg.good(
|
||||
"{} {} passed".format(
|
||||
good_counts, "check" if good_counts == 1 else "checks"
|
||||
)
|
||||
)
|
||||
msg.good(f"{good_counts} {'check' if good_counts == 1 else 'checks'} passed")
|
||||
if warn_counts:
|
||||
msg.warn(
|
||||
"{} {}".format(warn_counts, "warning" if warn_counts == 1 else "warnings")
|
||||
)
|
||||
if fail_counts:
|
||||
msg.fail("{} {}".format(fail_counts, "error" if fail_counts == 1 else "errors"))
|
||||
|
||||
msg.warn(f"{warn_counts} {'warning' if warn_counts == 1 else 'warnings'}")
|
||||
if fail_counts:
|
||||
msg.fail(f"{fail_counts} {'error' if fail_counts == 1 else 'errors'}")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def _load_file(file_path, msg):
|
||||
file_name = file_path.parts[-1]
|
||||
if file_path.suffix == ".json":
|
||||
with msg.loading("Loading {}...".format(file_name)):
|
||||
with msg.loading(f"Loading {file_name}..."):
|
||||
data = srsly.read_json(file_path)
|
||||
msg.good("Loaded {}".format(file_name))
|
||||
msg.good(f"Loaded {file_name}")
|
||||
return data
|
||||
elif file_path.suffix == ".jsonl":
|
||||
with msg.loading("Loading {}...".format(file_name)):
|
||||
with msg.loading(f"Loading {file_name}..."):
|
||||
data = srsly.read_jsonl(file_path)
|
||||
msg.good("Loaded {}".format(file_name))
|
||||
msg.good(f"Loaded {file_name}")
|
||||
return data
|
||||
msg.fail(
|
||||
"Can't load file extension {}".format(file_path.suffix),
|
||||
f"Can't load file extension {file_path.suffix}",
|
||||
"Expected .json or .jsonl",
|
||||
exits=1,
|
||||
)
|
||||
|
@ -604,14 +507,18 @@ def _compile_gold(examples, pipeline):
|
|||
|
||||
def _format_labels(labels, counts=False):
|
||||
if counts:
|
||||
return ", ".join(["'{}' ({})".format(l, c) for l, c in labels])
|
||||
return ", ".join(["'{}'".format(l) for l in labels])
|
||||
return ", ".join([f"'{l}' ({c})" for l, c in labels])
|
||||
return ", ".join([f"'{l}'" for l in labels])
|
||||
|
||||
|
||||
def _get_examples_without_label(data, label):
|
||||
count = 0
|
||||
for ex in data:
|
||||
labels = [label.split("-")[1] for label in ex.gold.ner if label not in ("O", "-", None)]
|
||||
labels = [
|
||||
label.split("-")[1]
|
||||
for label in ex.gold.ner
|
||||
if label not in ("O", "-", None)
|
||||
]
|
||||
if label not in labels:
|
||||
count += 1
|
||||
return count
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
import requests
|
||||
import os
|
||||
|
@ -50,7 +47,7 @@ def download(model, direct=False, *pip_args):
|
|||
sys.exit(dl)
|
||||
msg.good(
|
||||
"Download and installation successful",
|
||||
"You can now load the model via spacy.load('{}')".format(model_name),
|
||||
f"You can now load the model via spacy.load('{model_name}')",
|
||||
)
|
||||
# Only create symlink if the model is installed via a shortcut like 'en'.
|
||||
# There's no real advantage over an additional symlink for en_core_web_sm
|
||||
|
@ -69,10 +66,10 @@ def download(model, direct=False, *pip_args):
|
|||
# message and loading instructions, even if linking fails.
|
||||
msg.warn(
|
||||
"Download successful but linking failed",
|
||||
"Creating a shortcut link for '{}' didn't work (maybe you "
|
||||
"don't have admin permissions?), but you can still load "
|
||||
"the model via its full package name: "
|
||||
"nlp = spacy.load('{}')".format(model, model_name),
|
||||
f"Creating a shortcut link for '{model}' didn't work (maybe you "
|
||||
f"don't have admin permissions?), but you can still load "
|
||||
f"the model via its full package name: "
|
||||
f"nlp = spacy.load('{model_name}')",
|
||||
)
|
||||
# If a model is downloaded and then loaded within the same process, our
|
||||
# is_package check currently fails, because pkg_resources.working_set
|
||||
|
@ -95,11 +92,11 @@ def get_json(url, desc):
|
|||
r = requests.get(url)
|
||||
if r.status_code != 200:
|
||||
msg.fail(
|
||||
"Server error ({})".format(r.status_code),
|
||||
"Couldn't fetch {}. Please find a model for your spaCy "
|
||||
"installation (v{}), and download it manually. For more "
|
||||
"details, see the documentation: "
|
||||
"https://spacy.io/usage/models".format(desc, about.__version__),
|
||||
f"Server error ({r.status_code})",
|
||||
f"Couldn't fetch {desc}. Please find a model for your spaCy "
|
||||
f"installation (v{about.__version__}), and download it manually. "
|
||||
f"For more details, see the documentation: "
|
||||
f"https://spacy.io/usage/models",
|
||||
exits=1,
|
||||
)
|
||||
return r.json()
|
||||
|
@ -111,7 +108,7 @@ def get_compatibility():
|
|||
comp_table = get_json(about.__compatibility__, "compatibility table")
|
||||
comp = comp_table["spacy"]
|
||||
if version not in comp:
|
||||
msg.fail("No compatible models found for v{} of spaCy".format(version), exits=1)
|
||||
msg.fail(f"No compatible models found for v{version} of spaCy", exits=1)
|
||||
return comp[version]
|
||||
|
||||
|
||||
|
@ -119,8 +116,8 @@ def get_version(model, comp):
|
|||
model = model.rsplit(".dev", 1)[0]
|
||||
if model not in comp:
|
||||
msg.fail(
|
||||
"No compatible model found for '{}' "
|
||||
"(spaCy v{}).".format(model, about.__version__),
|
||||
f"No compatible model found for '{model}' "
|
||||
f"(spaCy v{about.__version__}).",
|
||||
exits=1,
|
||||
)
|
||||
return comp[model][0]
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals, division, print_function
|
||||
|
||||
import plac
|
||||
from timeit import default_timer as timer
|
||||
from wasabi import msg
|
||||
|
@ -79,7 +76,7 @@ def evaluate(
|
|||
deps=render_deps,
|
||||
ents=render_ents,
|
||||
)
|
||||
msg.good("Generated {} parses as HTML".format(displacy_limit), displacy_path)
|
||||
msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
|
||||
if return_scores:
|
||||
return scorer.scores
|
||||
|
||||
|
|
|
@ -1,13 +1,9 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
import platform
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
import srsly
|
||||
|
||||
from ..compat import path2str, basestring_, unicode_
|
||||
from .. import util
|
||||
from .. import about
|
||||
|
||||
|
@ -33,12 +29,12 @@ def info(model=None, markdown=False, silent=False):
|
|||
msg.fail("Can't find model meta.json", meta_path, exits=1)
|
||||
meta = srsly.read_json(meta_path)
|
||||
if model_path.resolve() != model_path:
|
||||
meta["link"] = path2str(model_path)
|
||||
meta["source"] = path2str(model_path.resolve())
|
||||
meta["link"] = str(model_path)
|
||||
meta["source"] = str(model_path.resolve())
|
||||
else:
|
||||
meta["source"] = path2str(model_path)
|
||||
meta["source"] = str(model_path)
|
||||
if not silent:
|
||||
title = "Info about model '{}'".format(model)
|
||||
title = f"Info about model '{model}'"
|
||||
model_meta = {
|
||||
k: v for k, v in meta.items() if k not in ("accuracy", "speed")
|
||||
}
|
||||
|
@ -49,7 +45,7 @@ def info(model=None, markdown=False, silent=False):
|
|||
return meta
|
||||
data = {
|
||||
"spaCy version": about.__version__,
|
||||
"Location": path2str(Path(__file__).parent.parent),
|
||||
"Location": str(Path(__file__).parent.parent),
|
||||
"Platform": platform.platform(),
|
||||
"Python version": platform.python_version(),
|
||||
"Models": list_models(),
|
||||
|
@ -84,9 +80,9 @@ def print_markdown(data, title=None):
|
|||
"""
|
||||
markdown = []
|
||||
for key, value in data.items():
|
||||
if isinstance(value, basestring_) and Path(value).exists():
|
||||
if isinstance(value, str) and Path(value).exists():
|
||||
continue
|
||||
markdown.append("* **{}:** {}".format(key, unicode_(value)))
|
||||
markdown.append(f"* **{key}:** {value}")
|
||||
if title:
|
||||
print("\n## {}".format(title))
|
||||
print(f"\n## {title}")
|
||||
print("\n{}\n".format("\n".join(markdown)))
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
import math
|
||||
from tqdm import tqdm
|
||||
|
@ -91,8 +88,7 @@ def init_model(
|
|||
vec_added = len(nlp.vocab.vectors)
|
||||
lex_added = len(nlp.vocab)
|
||||
msg.good(
|
||||
"Sucessfully compiled vocab",
|
||||
"{} entries, {} vectors".format(lex_added, vec_added),
|
||||
"Sucessfully compiled vocab", f"{lex_added} entries, {vec_added} vectors",
|
||||
)
|
||||
if not output_dir.exists():
|
||||
output_dir.mkdir()
|
||||
|
@ -177,9 +173,9 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
|
|||
nlp.vocab.vectors.add(lex.orth, row=lex.rank)
|
||||
else:
|
||||
if vectors_loc:
|
||||
with msg.loading("Reading vectors from {}".format(vectors_loc)):
|
||||
with msg.loading(f"Reading vectors from {vectors_loc}"):
|
||||
vectors_data, vector_keys = read_vectors(vectors_loc)
|
||||
msg.good("Loaded vectors from {}".format(vectors_loc))
|
||||
msg.good(f"Loaded vectors from {vectors_loc}")
|
||||
else:
|
||||
vectors_data, vector_keys = (None, None)
|
||||
if vector_keys is not None:
|
||||
|
|
|
@ -1,11 +1,8 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
|
||||
from ..compat import symlink_to, path2str
|
||||
from ..compat import symlink_to
|
||||
from .. import util
|
||||
|
||||
|
||||
|
@ -27,23 +24,23 @@ def link(origin, link_name, force=False, model_path=None):
|
|||
if not model_path.exists():
|
||||
msg.fail(
|
||||
"Can't locate model data",
|
||||
"The data should be located in {}".format(path2str(model_path)),
|
||||
f"The data should be located in {model_path}",
|
||||
exits=1,
|
||||
)
|
||||
data_path = util.get_data_path()
|
||||
if not data_path or not data_path.exists():
|
||||
spacy_loc = Path(__file__).parent.parent
|
||||
msg.fail(
|
||||
"Can't find the spaCy data path to create model symlink",
|
||||
"Make sure a directory `/data` exists within your spaCy "
|
||||
"installation and try again. The data directory should be located "
|
||||
"here:".format(path=spacy_loc),
|
||||
f"Can't find the spaCy data path to create model symlink",
|
||||
f"Make sure a directory `/data` exists within your spaCy "
|
||||
f"installation and try again. The data directory should be located "
|
||||
f"here: {spacy_loc}",
|
||||
exits=1,
|
||||
)
|
||||
link_path = util.get_data_path() / link_name
|
||||
if link_path.is_symlink() and not force:
|
||||
msg.fail(
|
||||
"Link '{}' already exists".format(link_name),
|
||||
f"Link '{link_name}' already exists",
|
||||
"To overwrite an existing link, use the --force flag",
|
||||
exits=1,
|
||||
)
|
||||
|
@ -54,18 +51,18 @@ def link(origin, link_name, force=False, model_path=None):
|
|||
elif link_path.exists(): # does it exist otherwise?
|
||||
# NB: Check this last because valid symlinks also "exist".
|
||||
msg.fail(
|
||||
"Can't overwrite symlink '{}'".format(link_name),
|
||||
f"Can't overwrite symlink '{link_name}'",
|
||||
"This can happen if your data directory contains a directory or "
|
||||
"file of the same name.",
|
||||
exits=1,
|
||||
)
|
||||
details = "%s --> %s" % (path2str(model_path), path2str(link_path))
|
||||
details = f"{model_path} --> {link_path}"
|
||||
try:
|
||||
symlink_to(link_path, model_path)
|
||||
except: # noqa: E722
|
||||
# This is quite dirty, but just making sure other errors are caught.
|
||||
msg.fail(
|
||||
"Couldn't link model to '{}'".format(link_name),
|
||||
f"Couldn't link model to '{link_name}'",
|
||||
"Creating a symlink in spacy/data failed. Make sure you have the "
|
||||
"required permissions and try re-running the command as admin, or "
|
||||
"use a virtualenv. You can still import the model as a module and "
|
||||
|
@ -74,4 +71,4 @@ def link(origin, link_name, force=False, model_path=None):
|
|||
msg.text(details)
|
||||
raise
|
||||
msg.good("Linking successful", details)
|
||||
msg.text("You can now load the model via spacy.load('{}')".format(link_name))
|
||||
msg.text(f"You can now load the model via spacy.load('{link_name}')")
|
||||
|
|
|
@ -1,13 +1,9 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from wasabi import msg, get_raw_input
|
||||
import srsly
|
||||
|
||||
from ..compat import path2str
|
||||
from .. import util
|
||||
from .. import about
|
||||
|
||||
|
@ -47,7 +43,7 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
|
|||
for key in ("lang", "name", "version"):
|
||||
if key not in meta or meta[key] == "":
|
||||
msg.fail(
|
||||
"No '{}' setting found in meta.json".format(key),
|
||||
f"No '{key}' setting found in meta.json",
|
||||
"This setting is required to build your package.",
|
||||
exits=1,
|
||||
)
|
||||
|
@ -58,22 +54,21 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
|
|||
|
||||
if package_path.exists():
|
||||
if force:
|
||||
shutil.rmtree(path2str(package_path))
|
||||
shutil.rmtree(str(package_path))
|
||||
else:
|
||||
msg.fail(
|
||||
"Package directory already exists",
|
||||
"Please delete the directory and try again, or use the "
|
||||
"`--force` flag to overwrite existing "
|
||||
"directories.".format(path=path2str(package_path)),
|
||||
"`--force` flag to overwrite existing directories.",
|
||||
exits=1,
|
||||
)
|
||||
Path.mkdir(package_path, parents=True)
|
||||
shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
|
||||
shutil.copytree(str(input_path), str(package_path / model_name_v))
|
||||
create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
|
||||
create_file(main_path / "setup.py", TEMPLATE_SETUP)
|
||||
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
|
||||
create_file(package_path / "__init__.py", TEMPLATE_INIT)
|
||||
msg.good("Successfully created package '{}'".format(model_name_v), main_path)
|
||||
msg.good(f"Successfully created package '{model_name_v}'", main_path)
|
||||
msg.text("To build the package, run `python setup.py sdist` in this directory.")
|
||||
|
||||
|
||||
|
@ -118,9 +113,6 @@ def generate_meta(model_path, existing_meta, msg):
|
|||
|
||||
TEMPLATE_SETUP = """
|
||||
#!/usr/bin/env python
|
||||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import io
|
||||
import json
|
||||
from os import path, walk
|
||||
|
@ -190,9 +182,6 @@ include meta.json
|
|||
|
||||
|
||||
TEMPLATE_INIT = """
|
||||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from pathlib import Path
|
||||
from spacy.util import load_model_from_init_py, get_model_meta
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import print_function, unicode_literals
|
||||
|
||||
import plac
|
||||
import random
|
||||
import numpy
|
||||
|
@ -154,9 +151,9 @@ def pretrain(
|
|||
msg.text("Reading input text from stdin...")
|
||||
texts = srsly.read_jsonl("-")
|
||||
|
||||
with msg.loading("Loading model '{}'...".format(vectors_model)):
|
||||
with msg.loading(f"Loading model '{vectors_model}'..."):
|
||||
nlp = util.load_model(vectors_model)
|
||||
msg.good("Loaded model '{}'".format(vectors_model))
|
||||
msg.good(f"Loaded model '{vectors_model}'")
|
||||
pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
|
||||
model = create_pretraining_model(
|
||||
nlp,
|
||||
|
@ -173,7 +170,7 @@ def pretrain(
|
|||
# Load in pretrained weights
|
||||
if init_tok2vec is not None:
|
||||
components = _load_pretrained_tok2vec(nlp, init_tok2vec)
|
||||
msg.text("Loaded pretrained tok2vec for: {}".format(components))
|
||||
msg.text(f"Loaded pretrained tok2vec for: {components}")
|
||||
# Parse the epoch number from the given weight file
|
||||
model_name = re.search(r"model\d+\.bin", str(init_tok2vec))
|
||||
if model_name:
|
||||
|
@ -221,7 +218,9 @@ def pretrain(
|
|||
skip_counter = 0
|
||||
for epoch in range(epoch_start, n_iter + epoch_start):
|
||||
for batch_id, batch in enumerate(
|
||||
util.minibatch_by_words((Example(doc=text) for text in texts), size=batch_size)
|
||||
util.minibatch_by_words(
|
||||
(Example(doc=text) for text in texts), size=batch_size
|
||||
)
|
||||
):
|
||||
docs, count = make_docs(
|
||||
nlp,
|
||||
|
@ -246,7 +245,7 @@ def pretrain(
|
|||
# Reshuffle the texts if texts were loaded from a file
|
||||
random.shuffle(texts)
|
||||
if skip_counter > 0:
|
||||
msg.warn("Skipped {count} empty values".format(count=str(skip_counter)))
|
||||
msg.warn(f"Skipped {skip_counter} empty values")
|
||||
msg.good("Successfully finished pretrain")
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals, division, print_function
|
||||
|
||||
import plac
|
||||
import tqdm
|
||||
from pathlib import Path
|
||||
|
@ -34,11 +31,11 @@ def profile(model, inputs=None, n_texts=10000):
|
|||
with msg.loading("Loading IMDB dataset via Thinc..."):
|
||||
imdb_train, _ = thinc.extra.datasets.imdb()
|
||||
inputs, _ = zip(*imdb_train)
|
||||
msg.info("Loaded IMDB dataset and using {} examples".format(n_inputs))
|
||||
msg.info(f"Loaded IMDB dataset and using {n_inputs} examples")
|
||||
inputs = inputs[:n_inputs]
|
||||
with msg.loading("Loading model '{}'...".format(model)):
|
||||
with msg.loading(f"Loading model '{model}'..."):
|
||||
nlp = load_model(model)
|
||||
msg.good("Loaded model '{}'".format(model))
|
||||
msg.good(f"Loaded model '{model}'")
|
||||
texts = list(itertools.islice(inputs, n_texts))
|
||||
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
|
||||
s = pstats.Stats("Profile.prof")
|
||||
|
@ -60,7 +57,7 @@ def _read_inputs(loc, msg):
|
|||
input_path = Path(loc)
|
||||
if not input_path.exists() or not input_path.is_file():
|
||||
msg.fail("Not a valid input data file", loc, exits=1)
|
||||
msg.info("Using data from {}".format(input_path.parts[-1]))
|
||||
msg.info(f"Using data from {input_path.parts[-1]}")
|
||||
file_ = input_path.open()
|
||||
for line in file_:
|
||||
data = srsly.json_loads(line)
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals, division, print_function
|
||||
|
||||
import plac
|
||||
import os
|
||||
import tqdm
|
||||
|
@ -12,12 +9,10 @@ import srsly
|
|||
from wasabi import msg
|
||||
import contextlib
|
||||
import random
|
||||
from collections import OrderedDict
|
||||
|
||||
from .._ml import create_default_optimizer
|
||||
from ..attrs import PROB, IS_OOV, CLUSTER, LANG
|
||||
from ..gold import GoldCorpus
|
||||
from ..compat import path2str
|
||||
from .. import util
|
||||
from .. import about
|
||||
|
||||
|
@ -148,14 +143,14 @@ def train(
|
|||
# the model and make sure the pipeline matches the pipeline setting. If
|
||||
# training starts from a blank model, intitalize the language class.
|
||||
pipeline = [p.strip() for p in pipeline.split(",")]
|
||||
msg.text("Training pipeline: {}".format(pipeline))
|
||||
msg.text(f"Training pipeline: {pipeline}")
|
||||
if base_model:
|
||||
msg.text("Starting with base model '{}'".format(base_model))
|
||||
msg.text(f"Starting with base model '{base_model}'")
|
||||
nlp = util.load_model(base_model)
|
||||
if nlp.lang != lang:
|
||||
msg.fail(
|
||||
"Model language ('{}') doesn't match language specified as "
|
||||
"`lang` argument ('{}') ".format(nlp.lang, lang),
|
||||
f"Model language ('{nlp.lang}') doesn't match language "
|
||||
f"specified as `lang` argument ('{lang}') ",
|
||||
exits=1,
|
||||
)
|
||||
nlp.disable_pipes([p for p in nlp.pipe_names if p not in pipeline])
|
||||
|
@ -187,15 +182,13 @@ def train(
|
|||
}
|
||||
if base_cfg != pipe_cfg:
|
||||
msg.fail(
|
||||
"The base textcat model configuration does"
|
||||
"not match the provided training options. "
|
||||
"Existing cfg: {}, provided cfg: {}".format(
|
||||
base_cfg, pipe_cfg
|
||||
),
|
||||
f"The base textcat model configuration does"
|
||||
f"not match the provided training options. "
|
||||
f"Existing cfg: {base_cfg}, provided cfg: {pipe_cfg}",
|
||||
exits=1,
|
||||
)
|
||||
else:
|
||||
msg.text("Starting with blank model '{}'".format(lang))
|
||||
msg.text(f"Starting with blank model '{lang}'")
|
||||
lang_cls = util.get_lang_class(lang)
|
||||
nlp = lang_cls()
|
||||
for pipe in pipeline:
|
||||
|
@ -215,7 +208,7 @@ def train(
|
|||
nlp.vocab.morphology.tag_map.update(tag_map)
|
||||
|
||||
if vectors:
|
||||
msg.text("Loading vector from model '{}'".format(vectors))
|
||||
msg.text(f"Loading vector from model '{vectors}'")
|
||||
_load_vectors(nlp, vectors)
|
||||
|
||||
# Multitask objectives
|
||||
|
@@ -224,15 +217,15 @@ def train(
        if multitasks:
            if pipe_name not in pipeline:
                msg.fail(
-                    "Can't use multitask objective without '{}' in the "
-                    "pipeline".format(pipe_name)
+                    f"Can't use multitask objective without '{pipe_name}' in "
+                    f"the pipeline"
                )
            pipe = nlp.get_pipe(pipe_name)
            for objective in multitasks.split(","):
                pipe.add_multitask_objective(objective)

    # Prepare training corpus
-    msg.text("Counting training words (limit={})".format(n_examples))
+    msg.text(f"Counting training words (limit={n_examples})")
    corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
    n_train_words = corpus.count_train()
@@ -248,22 +241,22 @@ def train(
    # Load in pretrained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
-        msg.text("Loaded pretrained tok2vec for: {}".format(components))
+        msg.text(f"Loaded pretrained tok2vec for: {components}")

    # Verify textcat config
    if "textcat" in pipeline:
        textcat_labels = nlp.get_pipe("textcat").cfg["labels"]
        if textcat_positive_label and textcat_positive_label not in textcat_labels:
            msg.fail(
-                "The textcat_positive_label (tpl) '{}' does not match any "
-                "label in the training data.".format(textcat_positive_label),
+                f"The textcat_positive_label (tpl) '{textcat_positive_label}' "
+                f"does not match any label in the training data.",
                exits=1,
            )
        if textcat_positive_label and len(textcat_labels) != 2:
            msg.fail(
-                "A textcat_positive_label (tpl) '{}' was provided for training "
-                "data that does not appear to be a binary classification "
-                "problem with two labels.".format(textcat_positive_label),
+                "A textcat_positive_label (tpl) '{textcat_positive_label}' was "
+                "provided for training data that does not appear to be a "
+                "binary classification problem with two labels.",
                exits=1,
            )
        train_data = corpus.train_data(
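One detail worth noting in hunks like the previous one: adjacent string literals are concatenated at compile time, but the `f` prefix only applies to the fragment it is attached to. A minimal illustration (values assumed):

```python
label = "POSITIVE"  # assumed example value
msg = (
    f"The textcat_positive_label (tpl) '{label}' "
    f"does not match any label in the training data."
)
assert "'POSITIVE'" in msg
```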
@@ -302,20 +295,20 @@ def train(
                break
        if base_model and set(textcat_labels) != train_labels:
            msg.fail(
-                "Cannot extend textcat model using data with different "
-                "labels. Base model labels: {}, training data labels: "
-                "{}.".format(textcat_labels, list(train_labels)),
+                f"Cannot extend textcat model using data with different "
+                f"labels. Base model labels: {textcat_labels}, training data "
+                f"labels: {list(train_labels)}",
                exits=1,
            )
        if textcat_multilabel:
            msg.text(
-                "Textcat evaluation score: ROC AUC score macro-averaged across "
-                "the labels '{}'".format(", ".join(textcat_labels))
+                f"Textcat evaluation score: ROC AUC score macro-averaged across "
+                f"the labels '{', '.join(textcat_labels)}'"
            )
        elif textcat_positive_label and len(textcat_labels) == 2:
            msg.text(
-                "Textcat evaluation score: F1-score for the "
-                "label '{}'".format(textcat_positive_label)
+                f"Textcat evaluation score: F1-score for the "
+                f"label '{textcat_positive_label}'"
            )
        elif len(textcat_labels) > 1:
            if len(textcat_labels) == 2:
@@ -325,8 +318,8 @@ def train(
                    "an evaluation on the positive class."
                )
            msg.text(
-                "Textcat evaluation score: F1-score macro-averaged across "
-                "the labels '{}'".format(", ".join(textcat_labels))
+                f"Textcat evaluation score: F1-score macro-averaged across "
+                f"the labels '{', '.join(textcat_labels)}'"
            )
        else:
            msg.fail(
@@ -471,8 +464,8 @@ def train(
                for cat, cat_score in textcats_per_cat.items():
                    if cat_score.get("roc_auc_score", 0) < 0:
                        msg.warn(
-                            "Textcat ROC AUC score is undefined due to "
-                            "only one value in label '{}'.".format(cat)
+                            f"Textcat ROC AUC score is undefined due to "
+                            f"only one value in label '{cat}'."
                        )
            msg.row(progress, **row_settings)
            # Early stopping
@@ -485,12 +478,10 @@ def train(
                    best_score = current_score
                if iter_since_best >= n_early_stopping:
                    msg.text(
-                        "Early stopping, best iteration "
-                        "is: {}".format(i - iter_since_best)
+                        f"Early stopping, best iteration is: {i - iter_since_best}"
                    )
                    msg.text(
-                        "Best score = {}; Final iteration "
-                        "score = {}".format(best_score, current_score)
+                        f"Best score = {best_score}; Final iteration score = {current_score}"
                    )
                    break
    finally:
@@ -560,11 +551,11 @@ def _collate_best_model(meta, output_path, components):
    for component in components:
        bests[component] = _find_best(output_path, component)
    best_dest = output_path / "model-best"
-    shutil.copytree(path2str(output_path / "model-final"), path2str(best_dest))
+    shutil.copytree(str(output_path / "model-final"), str(best_dest))
    for component, best_component_src in bests.items():
-        shutil.rmtree(path2str(best_dest / component))
+        shutil.rmtree(str(best_dest / component))
        shutil.copytree(
-            path2str(best_component_src / component), path2str(best_dest / component)
+            str(best_component_src / component), str(best_dest / component)
        )
        accs = srsly.read_json(best_component_src / "accuracy.json")
        for metric in _get_metrics(component):
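The `path2str()` helper was only needed on Python 2; on Python 3, `str(Path)` is enough, and most of the standard library (including `shutil`) also accepts `Path` objects directly via `os.PathLike`. A small sketch with made-up paths:

```python
import shutil
from pathlib import Path

output_path = Path("/tmp/spacy-output")  # assumed example path
best_dest = output_path / "model-best"
if (output_path / "model-final").exists():
    # str() replaces the old path2str() compat helper.
    shutil.copytree(str(output_path / "model-final"), str(best_dest))
```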
@@ -627,10 +618,8 @@ def _configure_training_output(pipeline, use_gpu, has_beam_widths):
    if has_beam_widths:
        row_head.insert(1, "Beam W.")
    # remove duplicates
-    row_head_dict = OrderedDict()
-    row_head_dict.update({k: 1 for k in row_head})
-    output_stats_dict = OrderedDict()
-    output_stats_dict.update({k: 1 for k in output_stats})
+    row_head_dict = {k: 1 for k in row_head}
+    output_stats_dict = {k: 1 for k in output_stats}
    return row_head_dict.keys(), output_stats_dict.keys()
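Dropping `OrderedDict` here relies on the fact that plain dicts preserve insertion order (guaranteed from Python 3.7, and true in CPython 3.6), so a dict comprehension can deduplicate while keeping order:

```python
row_head = ["Itn", "Dep Loss", "NER Loss", "Dep Loss"]  # assumed example values
row_head_dict = {k: 1 for k in row_head}
print(list(row_head_dict.keys()))  # ['Itn', 'Dep Loss', 'NER Loss']
```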

@@ -1,13 +1,9 @@
-# coding: utf8
-from __future__ import unicode_literals, print_function
-
from pathlib import Path
import sys
import requests
import srsly
from wasabi import msg

-from ..compat import path2str
from ..util import get_data_path
from .. import about
@@ -21,7 +17,7 @@ def validate():
    r = requests.get(about.__compatibility__)
    if r.status_code != 200:
        msg.fail(
-            "Server error ({})".format(r.status_code),
+            f"Server error ({r.status_code})",
            "Couldn't fetch compatibility table.",
            exits=1,
        )
@@ -32,7 +28,7 @@ def validate():
    current_compat = compat.get(version)
    if not current_compat:
        msg.fail(
-            "Can't find spaCy v{} in compatibility table".format(version),
+            f"Can't find spaCy v{version} in compatibility table",
            about.__compatibility__,
            exits=1,
        )
@@ -52,8 +48,8 @@ def validate():
    update_models = [m for m in incompat_models if m in current_compat]
    spacy_dir = Path(__file__).parent.parent

-    msg.divider("Installed models (spaCy v{})".format(about.__version__))
-    msg.info("spaCy installation: {}".format(path2str(spacy_dir)))
+    msg.divider(f"Installed models (spaCy v{about.__version__})")
+    msg.info(f"spaCy installation: {spacy_dir}")

    if model_links or model_pkgs:
        header = ("TYPE", "NAME", "MODEL", "VERSION", "")
@@ -72,15 +68,15 @@ def validate():
        print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
    if na_models:
        msg.text(
-            "The following models are not available for spaCy "
-            "v{}: {}".format(about.__version__, ", ".join(na_models))
+            f"The following models are not available for spaCy "
+            f"v{about.__version__}: {', '.join(na_models)}"
        )
    if incompat_links:
        msg.text(
-            "You may also want to overwrite the incompatible links using the "
-            "`python -m spacy link` command with `--force`, or remove them "
-            "from the data directory. "
-            "Data path: {path}".format(path=path2str(get_data_path()))
+            f"You may also want to overwrite the incompatible links using the "
+            f"`python -m spacy link` command with `--force`, or remove them "
+            f"from the data directory. "
+            f"Data path: {get_data_path()}"
        )
    if incompat_models or incompat_links:
        sys.exit(1)
@@ -128,7 +124,7 @@ def get_model_row(compat, name, data, msg, model_type="package"):
        version = msg.text(data["version"], color="green", no_print=True)
    else:
        version = msg.text(data["version"], color="red", no_print=True)
-    comp = "--> {}".format(compat.get(data["name"], ["n/a"])[0])
+    comp = f"--> {compat.get(data['name'], ['n/a'])[0]}"
    return (model_type, name, data["name"], version, comp)
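The rewritten `comp` line also shows a small f-string constraint on Python 3.6–3.11: expressions inside the braces must use a different quote type than the string delimiter. A standalone sketch with assumed data:

```python
compat = {"en_core_web_sm": ["2.2.5"]}  # assumed example data
data = {"name": "en_core_web_sm"}
comp = f"--> {compat.get(data['name'], ['n/a'])[0]}"
print(comp)  # --> 2.2.5
```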
spacy/compat.py (102 changed lines)
@@ -1,4 +1,3 @@
-# coding: utf8
"""
Helpers for Python and platform compatibility. To distinguish them from
the builtin functions, replacement functions are suffixed with an underscore,

@@ -6,13 +5,8 @@ e.g. `unicode_`.

DOCS: https://spacy.io/api/top-level#compat
"""
-from __future__ import unicode_literals
-
import os
import sys
-import itertools
-import ast
-import types

from thinc.neural.util import copy_array
@@ -46,45 +40,11 @@ copy_reg = copy_reg
CudaStream = CudaStream
cupy = cupy
copy_array = copy_array
-izip = getattr(itertools, "izip", zip)

is_windows = sys.platform.startswith("win")
is_linux = sys.platform.startswith("linux")
is_osx = sys.platform == "darwin"

-# See: https://github.com/benjaminp/six/blob/master/six.py
-is_python2 = sys.version_info[0] == 2
-is_python3 = sys.version_info[0] == 3
-is_python_pre_3_5 = is_python2 or (is_python3 and sys.version_info[1] < 5)
-
-if is_python2:
-    bytes_ = str
-    unicode_ = unicode  # noqa: F821
-    basestring_ = basestring  # noqa: F821
-    input_ = raw_input  # noqa: F821
-    path2str = lambda path: str(path).decode("utf8")
-    class_types = (type, types.ClassType)
-
-elif is_python3:
-    bytes_ = bytes
-    unicode_ = str
-    basestring_ = str
-    input_ = input
-    path2str = lambda path: str(path)
-    class_types = (type, types.ClassType) if is_python_pre_3_5 else type
-
-
-def b_to_str(b_str):
-    """Convert a bytes object to a string.
-
-    b_str (bytes): The object to convert.
-    RETURNS (unicode): The converted string.
-    """
-    if is_python2:
-        return b_str
-    # Important: if no encoding is set, string becomes "b'...'"
-    return str(b_str, encoding="utf8")


def symlink_to(orig, dest):
    """Create a symlink. Used for model shortcut links.
@@ -95,9 +55,7 @@ def symlink_to(orig, dest):
    if is_windows:
        import subprocess

-        subprocess.check_call(
-            ["mklink", "/d", path2str(orig), path2str(dest)], shell=True
-        )
+        subprocess.check_call(["mklink", "/d", str(orig), str(dest)], shell=True)
    else:
        orig.symlink_to(dest)
@@ -108,19 +66,17 @@ def symlink_remove(link):
    link (unicode / Path): The path to the symlink.
    """
    # https://stackoverflow.com/q/26554135/6400719
-    if os.path.isdir(path2str(link)) and is_windows:
+    if os.path.isdir(str(link)) and is_windows:
        # this should only be on Py2.7 and windows
-        os.rmdir(path2str(link))
+        os.rmdir(str(link))
    else:
-        os.unlink(path2str(link))
+        os.unlink(str(link))


-def is_config(python2=None, python3=None, windows=None, linux=None, osx=None):
+def is_config(windows=None, linux=None, osx=None, **kwargs):
    """Check if a specific configuration of Python version and operating system
    matches the user's setup. Mostly used to display targeted error messages.

-    python2 (bool): spaCy is executed with Python 2.x.
-    python3 (bool): spaCy is executed with Python 3.x.
    windows (bool): spaCy is executed on Windows.
    linux (bool): spaCy is executed on Linux.
    osx (bool): spaCy is executed on OS X or macOS.
@@ -129,53 +85,7 @@ def is_config(python2=None, python3=None, windows=None, linux=None, osx=None):
    DOCS: https://spacy.io/api/top-level#compat.is_config
    """
    return (
-        python2 in (None, is_python2)
-        and python3 in (None, is_python3)
-        and windows in (None, is_windows)
+        windows in (None, is_windows)
        and linux in (None, is_linux)
        and osx in (None, is_osx)
    )


-def import_file(name, loc):
-    """Import module from a file. Used to load models from a directory.
-
-    name (unicode): Name of module to load.
-    loc (unicode / Path): Path to the file.
-    RETURNS: The loaded module.
-    """
-    loc = path2str(loc)
-    if is_python_pre_3_5:
-        import imp
-
-        return imp.load_source(name, loc)
-    else:
-        import importlib.util
-
-        spec = importlib.util.spec_from_file_location(name, str(loc))
-        module = importlib.util.module_from_spec(spec)
-        spec.loader.exec_module(module)
-        return module
-
-
-def unescape_unicode(string):
-    """Python2.7's re module chokes when compiling patterns that have ranges
-    between escaped unicode codepoints if the two codepoints are unrecognised
-    in the unicode database. For instance:
-
-        re.compile('[\\uAA77-\\uAA79]').findall("hello")
-
-    Ends up matching every character (on Python 2). This problem doesn't occur
-    if we're dealing with unicode literals.
-    """
-    if string is None:
-        return string
-    # We only want to unescape the unicode, so we first must protect the other
-    # backslashes.
-    string = string.replace("\\", "\\\\")
-    # Now we remove that protection for the unicode.
-    string = string.replace("\\\\u", "\\u")
-    string = string.replace("\\\\U", "\\U")
-    # Now we unescape by evaling the string with the AST. This can't execute
-    # code -- it only does the representational level.
-    return ast.literal_eval("u'''" + string + "'''")
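With the Python 2 branches gone, `compat.is_config()` reduces to a pure platform check. A self-contained sketch of the simplified logic, mirroring the lines kept above:

```python
import sys

is_windows = sys.platform.startswith("win")
is_linux = sys.platform.startswith("linux")
is_osx = sys.platform == "darwin"


def is_config(windows=None, linux=None, osx=None, **kwargs):
    # Each argument is either None (don't care) or must match the current OS.
    return (
        windows in (None, is_windows)
        and linux in (None, is_linux)
        and osx in (None, is_osx)
    )


print(is_config(windows=True))  # True only on Windows
```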
@@ -1,15 +1,11 @@
-# coding: utf8
"""
spaCy's built in visualization suite for dependencies and named entities.

DOCS: https://spacy.io/api/top-level#displacy
USAGE: https://spacy.io/usage/visualizers
"""
-from __future__ import unicode_literals
-
from .render import DependencyRenderer, EntityRenderer
from ..tokens import Doc, Span
-from ..compat import b_to_str
from ..errors import Errors, Warnings, user_warning
from ..util import is_in_jupyter
@@ -92,20 +88,20 @@ def serve(

    render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
    httpd = simple_server.make_server(host, port, app)
-    print("\nUsing the '{}' visualizer".format(style))
-    print("Serving on http://{}:{} ...\n".format(host, port))
+    print(f"\nUsing the '{style}' visualizer")
+    print(f"Serving on http://{host}:{port} ...\n")
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
-        print("Shutting down server on port {}.".format(port))
+        print(f"Shutting down server on port {port}.")
    finally:
        httpd.server_close()


def app(environ, start_response):
    # Headers and status need to be bytes in Python 2, see #1227
-    headers = [(b_to_str(b"Content-type"), b_to_str(b"text/html; charset=utf-8"))]
-    start_response(b_to_str(b"200 OK"), headers)
+    headers = [("Content-type", "text/html; charset=utf-8")]
+    start_response("200 OK", headers)
    res = _html["parsed"].encode(encoding="utf-8")
    return [res]
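On Python 3 a WSGI status line and header values are plain `str`, so the `b_to_str()` wrapper around byte literals is no longer needed, while the response body stays `bytes`. A minimal standalone sketch (host and port are assumptions):

```python
from wsgiref import simple_server


def app(environ, start_response):
    headers = [("Content-type", "text/html; charset=utf-8")]
    start_response("200 OK", headers)
    return [b"<p>hello</p>"]  # the body must still be bytes


# httpd = simple_server.make_server("0.0.0.0", 5000, app)  # assumed host/port
# httpd.serve_forever()
```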
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import uuid

from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_ARCS, TPL_ENTS

@@ -55,7 +52,7 @@ class DependencyRenderer(object):
            settings = p.get("settings", {})
            self.direction = settings.get("direction", DEFAULT_DIR)
            self.lang = settings.get("lang", DEFAULT_LANG)
-            render_id = "{}-{}".format(id_prefix, i)
+            render_id = f"{id_prefix}-{i}"
            svg = self.render_svg(render_id, p["words"], p["arcs"])
            rendered.append(svg)
        if page:
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-

# Setting explicit height and max-width: none on the SVG is required for
# Jupyter to render it properly in a cell
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
import os
import warnings
import inspect

@@ -12,7 +9,7 @@ def add_codes(err_cls):
    class ErrorsWithCodes(object):
        def __getattribute__(self, code):
            msg = getattr(err_cls, code)
-            return "[{code}] {msg}".format(code=code, msg=msg)
+            return f"[{code}] {msg}"

    return ErrorsWithCodes()

@@ -98,8 +95,6 @@ class Warnings(object):
            "you can ignore this warning by setting SPACY_WARNING_IGNORE=W022. "
            "If this is surprising, make sure you have the spacy-lookups-data "
            "package installed.")
-    W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. "
-            "'n_process' will be set to 1.")
    W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
            "the Knowledge Base.")
    W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "

@@ -550,6 +545,7 @@ class Errors(object):
    E999 = ("Encountered an unexpected format for the dictionary holding "
            "gold annotations: {gold_dict}")


@add_codes
class TempErrors(object):
    T003 = ("Resizing pretrained Tagger models is not currently supported.")

@@ -573,10 +569,10 @@ class MatchPatternError(ValueError):
        errors (dict): Validation errors (sequence of strings) mapped to pattern
            ID, i.e. the index of the added pattern.
        """
-        msg = "Invalid token patterns for matcher rule '{}'\n".format(key)
+        msg = f"Invalid token patterns for matcher rule '{key}'\n"
        for pattern_idx, error_msgs in errors.items():
-            pattern_errors = "\n".join(["- {}".format(e) for e in error_msgs])
-            msg += "\nPattern {}:\n{}\n".format(pattern_idx, pattern_errors)
+            pattern_errors = "\n".join([f"- {e}" for e in error_msgs])
+            msg += f"\nPattern {pattern_idx}:\n{pattern_errors}\n"
        ValueError.__init__(self, msg)
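The `MatchPatternError` change is a typical f-string conversion of multi-step message building; a runnable sketch with assumed validation errors:

```python
key = "my_rule"                                                  # assumed rule name
errors = {0: ["pattern is not a list", "missing token attrs"]}   # assumed errors
msg = f"Invalid token patterns for matcher rule '{key}'\n"
for pattern_idx, error_msgs in errors.items():
    pattern_errors = "\n".join([f"- {e}" for e in error_msgs])
    msg += f"\nPattern {pattern_idx}:\n{pattern_errors}\n"
print(msg)
```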
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-

def explain(term):
    """Get a description for a given POS tag, dependency label or entity type.
@@ -1,7 +1,4 @@
# cython: profile=True
-# coding: utf8
-from __future__ import unicode_literals, print_function
-
import re
import random
import numpy

@@ -14,7 +11,6 @@ import srsly
from .syntax import nonproj
from .tokens import Doc, Span
from .errors import Errors, AlignmentError, user_warning, Warnings
-from .compat import path2str, basestring_
from . import util

@@ -157,7 +153,7 @@ class GoldCorpus(object):
        self.write_msgpack(self.tmp_dir / "dev", dev, limit=self.limit)

    def __del__(self):
-        shutil.rmtree(path2str(self.tmp_dir))
+        shutil.rmtree(self.tmp_dir)

    @staticmethod
    def write_msgpack(directory, examples, limit=0):

@@ -167,7 +163,7 @@ class GoldCorpus(object):
        for i, example in enumerate(examples):
            ex_dict = example.to_dict()
            text = example.text
-            srsly.write_msgpack(directory / "{}.msg".format(i), (text, ex_dict))
+            srsly.write_msgpack(directory / f"{i}.msg", (text, ex_dict))
            n += 1
            if limit and n >= limit:
                break

@@ -221,7 +217,7 @@ class GoldCorpus(object):
            examples = [Example.from_dict(ex_dict, doc=text)]
        else:
            supported = ("json", "jsonl", "msg")
-            raise ValueError(Errors.E124.format(path=path2str(loc), formats=supported))
+            raise ValueError(Errors.E124.format(path=loc, formats=supported))
        for example in examples:
            yield example
            i += 1

@@ -862,7 +858,7 @@ cdef class Example:
        converted_examples = []
        for ex in examples:
            # convert string to Doc to Example
-            if isinstance(ex, basestring_):
+            if isinstance(ex, str):
                if keep_raw_text:
                    converted_examples.append(Example(doc=ex))
                else:

@@ -876,7 +872,7 @@ cdef class Example:
                doc, gold = ex
                gold_dict = {}
                # convert string to Doc
-                if isinstance(doc, basestring_) and not keep_raw_text:
+                if isinstance(doc, str) and not keep_raw_text:
                    doc = make_doc(doc)
                # convert dict to GoldParse
                if isinstance(gold, dict):

@@ -988,7 +984,7 @@ cdef class GoldParse:
            # Translate the None values to '-', to make processing easier.
            # See Issue #2603
            entities = [(ent if ent is not None else "-") for ent in entities]
-            if not isinstance(entities[0], basestring_):
+            if not isinstance(entities[0], str):
                # Assume we have entities specified by character offset.
                entities = biluo_tags_from_offsets(doc, entities)

@@ -1107,7 +1103,7 @@ cdef class GoldParse:
        cycle = nonproj.contains_cycle(self.heads)
        if cycle is not None:
            raise ValueError(Errors.E069.format(cycle=cycle,
-                cycle_tokens=" ".join(["'{}'".format(self.words[tok_id]) for tok_id in cycle]),
+                cycle_tokens=" ".join([f"'{self.words[tok_id]}'" for tok_id in cycle]),
                doc_tokens=" ".join(words[:50])))

    def __len__(self):
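With Python 2 gone, the `basestring_` compat alias used in the `isinstance()` checks above collapses to the builtin `str`. A tiny sketch of the same dispatch (the helper and the `make_doc` stand-in are hypothetical):

```python
def to_doc_input(ex, make_doc):
    # Plain strings get converted; anything else is passed through unchanged.
    if isinstance(ex, str):
        return make_doc(ex)
    return ex


print(to_doc_input("This is a sentence.", make_doc=lambda text: text.split()))
```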
spacy/kb.pyx (13 changed lines)
@@ -1,22 +1,17 @@
# cython: infer_types=True
# cython: profile=True
-# coding: utf8
from spacy.errors import Errors, Warnings, user_warning

from pathlib import Path
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap

from cpython.exc cimport PyErr_SetFromErrno

from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
from libc.stdint cimport int32_t, int64_t

from .typedefs cimport hash_t

from os import path
from libcpp.vector cimport vector

from .typedefs cimport hash_t
from .errors import Errors, Warnings, user_warning


cdef class Candidate:
    """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved

@@ -584,5 +579,3 @@ cdef class Reader:
    cdef int _read(self, void* value, size_t size) except -1:
        status = fread(value, size, 1, self._fp)
        return status
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-

# Source: https://github.com/stopwords-iso/stopwords-af

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.

@@ -1,5 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
from ...attrs import LIKE_NUM

_num_words = set(

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
from ..char_classes import UNITS, ALPHA_UPPER

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
STOP_WORDS = set(
"""
من

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import ORTH, LEMMA

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-

# Source: https://github.com/Alir3z4/stop-words
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
from .tag_map import TAG_MAP
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...symbols import LEMMA, PRON_LEMMA
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
|
||||
from ..char_classes import ALPHA_LOWER, ALPHA, HYPHENS, CONCAT_QUOTES, UNITS
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...symbols import POS, PUNCT, ADJ, CONJ, SCONJ, NUM, DET, ADV, ADP, X, VERB
|
||||
from ...symbols import CCONJ, NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, SYM
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding=utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...symbols import ORTH, LEMMA
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...attrs import LIKE_NUM
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..punctuation import TOKENIZER_INFIXES
|
||||
from ..char_classes import ALPHA
|
||||
|
||||
|
|
|
@@ -1,7 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-

STOP_WORDS = set(
"""
a abans ací ah així això al aleshores algun alguna algunes alguns alhora allà allí allò

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ
from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import ORTH, LEMMA

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
split_chars = lambda char: list(char.strip().split(" "))
merge_chars = lambda char: char.strip().replace(" ", "|")
group_chars = lambda char: char.strip().replace(" ", "")

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .stop_words import STOP_WORDS
from ...language import Language
from ...attrs import LANG

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-

# Source: https://github.com/Alir3z4/stop-words

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .norm_exceptions import NORM_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-

"""
Example sentences to test spaCy and its language models.

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...attrs import LIKE_NUM

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import LEMMA, PRON_LEMMA

# Source: Danish Universal Dependencies and http://fjern-uv.dk/pronom.php

@@ -1,10 +1,7 @@
-# coding: utf8
"""
Special-case rules for normalizing tokens to improve the model's predictions.
For example 'mysterium' vs 'mysterie' and similar.
"""
-from __future__ import unicode_literals
-

# Sources:
# 1: https://dsn.dk/retskrivning/om-retskrivningsordbogen/mere-om-retskrivningsordbogen-2012/endrede-stave-og-ordformer/
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
from ..punctuation import TOKENIZER_SUFFIXES

@@ -1,6 +1,3 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
# Source: Handpicked by Jens Dahl Møllerhøj.

STOP_WORDS = set(

@@ -1,11 +1,7 @@
-# encoding: utf8
"""
Tokenizer Exceptions.
Source: https://forkortelse.dk/ and various others.
"""
-
-from __future__ import unicode_literals

from ...symbols import ORTH, LEMMA, NORM, TAG, PUNCT

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .norm_exceptions import NORM_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-

"""
Example sentences to test spaCy and its language models.

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
# Here we only want to include the absolute most common words. Otherwise,
# this list would get impossibly long for German – especially considering the
# old vs. new spelling rules, and all possible cases.

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-

STOP_WORDS = set(
"""

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import NOUN, PROPN, PRON

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import POS, PUNCT, ADJ, CCONJ, SCONJ, NUM, DET, ADV, ADP, X
from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, VERB

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
@@ -1,7 +1,3 @@
-# -*- coding: utf-8 -*-
-
-from __future__ import unicode_literals
-
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tag_map_general import TAG_MAP
from .stop_words import STOP_WORDS

@@ -1,7 +1,3 @@
-# -*- coding: utf-8 -*-
-
-from __future__ import unicode_literals
-
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.el.examples import sentences

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-

def get_pos_from_wiktionary():
    import re

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...lemmatizer import Lemmatizer

@@ -1,7 +1,3 @@
-# -*- coding: utf-8 -*-
-
-from __future__ import unicode_literals
-
from ...attrs import LIKE_NUM

_num_words = [

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-

# These exceptions are used to add NORM values based on a token's ORTH value.
# Norms are only set if no alternative is provided in the tokenizer exceptions.

@@ -1,7 +1,3 @@
-# -*- coding: utf-8 -*-
-
-from __future__ import unicode_literals
-
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
from ..char_classes import CONCAT_QUOTES, CURRENCY

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-

# Stop words
# Link to greek stop words: https://www.translatum.gr/forum/index.php?topic=3550.0?topic=3550.0

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import NOUN, PROPN, PRON

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, SCONJ, NUM, DET, ADV, ADP, X, VERB
from ...symbols import NOUN, PROPN, PART, INTJ, PRON, AUX

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ
from ...symbols import PUNCT, NUM, AUX, X, ADJ, VERB, PART, SPACE, CCONJ

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import ORTH, LEMMA, NORM
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .norm_exceptions import NORM_EXCEPTIONS
from .tag_map import TAG_MAP

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-

"""
Example sentences to test spaCy and its language models.

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...attrs import LIKE_NUM

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
from ...symbols import LEMMA, PRON_LEMMA

# Several entries here look pretty suspicious. These will get the POS SCONJ

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-

_exc = {
    # Slang and abbreviations
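The remaining hunks all delete the same two header lines from the language-data modules. On Python 3 such a module can simply start with its data, since source files are UTF-8 by default and every string literal is unicode; an illustrative minimal module (not taken from the diff):

```python
# No "# coding: utf8" declaration and no __future__ import needed on Python 3.
STOP_WORDS = set(
    """
a an the
""".split()
)
```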
Some files were not shown because too many files have changed in this diff.