mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
Merge branch 'develop' into master-tmp
This commit is contained in:
commit
644074b954
106
.github/contributors/tiangolo.md
vendored
Normal file
106
.github/contributors/tiangolo.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
|||
# spaCy contributor agreement
|
||||
|
||||
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||
The SCA applies to any contribution that you make to any product or project
|
||||
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||
[ExplosionAI GmbH](https://explosion.ai/legal). The term
|
||||
**"you"** shall mean the person or entity identified below.
|
||||
|
||||
If you agree to be bound by these terms, fill in the information requested
|
||||
below and include the filled-in version with your first pull request, under the
|
||||
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||
should be your GitHub username, with the extension `.md`. For example, the user
|
||||
example_user would create the file `.github/contributors/example_user.md`.
|
||||
|
||||
Read this agreement carefully before signing. These terms and conditions
|
||||
constitute a binding legal agreement.
|
||||
|
||||
## Contributor Agreement
|
||||
|
||||
1. The term "contribution" or "contributed materials" means any source code,
|
||||
object code, patch, tool, sample, graphic, specification, manual,
|
||||
documentation, or any other material posted or submitted by you to the project.
|
||||
|
||||
2. With respect to any worldwide copyrights, or copyright applications and
|
||||
registrations, in your contribution:
|
||||
|
||||
* you hereby assign to us joint ownership, and to the extent that such
|
||||
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||
royalty-free, unrestricted license to exercise all rights under those
|
||||
copyrights. This includes, at our option, the right to sublicense these same
|
||||
rights to third parties through multiple levels of sublicensees or other
|
||||
licensing arrangements;
|
||||
|
||||
* you agree that each of us can do all things in relation to your
|
||||
contribution as if each of us were the sole owners, and if one of us makes
|
||||
a derivative work of your contribution, the one who makes the derivative
|
||||
work (or has it made will be the sole owner of that derivative work;
|
||||
|
||||
* you agree that you will not assert any moral rights in your contribution
|
||||
against us, our licensees or transferees;
|
||||
|
||||
* you agree that we may register a copyright in your contribution and
|
||||
exercise all ownership rights associated with it; and
|
||||
|
||||
* you agree that neither of us has any duty to consult with, obtain the
|
||||
consent of, pay or render an accounting to the other for any use or
|
||||
distribution of your contribution.
|
||||
|
||||
3. With respect to any patents you own, or that you can license without payment
|
||||
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||
|
||||
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||
your contribution in whole or in part, alone or in combination with or
|
||||
included in any product, work or materials arising out of the project to
|
||||
which your contribution was submitted, and
|
||||
|
||||
* at our option, to sublicense these same rights to third parties through
|
||||
multiple levels of sublicensees or other licensing arrangements.
|
||||
|
||||
4. Except as set out above, you keep all right, title, and interest in your
|
||||
contribution. The rights that you grant to us under these terms are effective
|
||||
on the date you first submitted a contribution to us, even if your submission
|
||||
took place before the date you sign these terms.
|
||||
|
||||
5. You covenant, represent, warrant and agree that:
|
||||
|
||||
* Each contribution that you submit is and shall be an original work of
|
||||
authorship and you can legally grant the rights set out in this SCA;
|
||||
|
||||
* to the best of your knowledge, each contribution will not violate any
|
||||
third party's copyrights, trademarks, patents, or other intellectual
|
||||
property rights; and
|
||||
|
||||
* each contribution shall be in compliance with U.S. export control laws and
|
||||
other applicable export and import laws. You agree to notify us if you
|
||||
become aware of any circumstance which would make any of the foregoing
|
||||
representations inaccurate in any respect. We may publicly disclose your
|
||||
participation in the project, including the fact that you have signed the SCA.
|
||||
|
||||
6. This SCA is governed by the laws of the State of California and applicable
|
||||
U.S. Federal law. Any choice of law rules will not apply.
|
||||
|
||||
7. Please place an “x” on one of the applicable statement below. Please do NOT
|
||||
mark both statements:
|
||||
|
||||
* [ ] I am signing on behalf of myself as an individual and no other person
|
||||
or entity, including my employer, has or will have rights with respect to my
|
||||
contributions.
|
||||
|
||||
* [ ] I am signing on behalf of my employer or a legal entity and I have the
|
||||
actual authority to contractually bind that entity.
|
||||
|
||||
## Contributor Details
|
||||
|
||||
| Field | Entry |
|
||||
|------------------------------- | -------------------- |
|
||||
| Name | Sebastián Ramírez |
|
||||
| Company name (if applicable) | |
|
||||
| Title or role (if applicable) | |
|
||||
| Date | 2020-07-01 |
|
||||
| GitHub username | tiangolo |
|
||||
| Website (optional) | |
|
4
.gitignore
vendored
4
.gitignore
vendored
|
@ -44,6 +44,7 @@ __pycache__/
|
|||
.env*
|
||||
.~env/
|
||||
.venv
|
||||
env3.6/
|
||||
venv/
|
||||
env3.*/
|
||||
.dev
|
||||
|
@ -119,3 +120,6 @@ Desktop.ini
|
|||
|
||||
# Pycharm project files
|
||||
*.idea
|
||||
|
||||
# IPython
|
||||
.ipynb_checkpoints/
|
||||
|
|
23
.travis.yml
23
.travis.yml
|
@ -1,23 +0,0 @@
|
|||
language: python
|
||||
sudo: false
|
||||
cache: pip
|
||||
dist: trusty
|
||||
group: edge
|
||||
python:
|
||||
- "2.7"
|
||||
os:
|
||||
- linux
|
||||
install:
|
||||
- "pip install -r requirements.txt"
|
||||
- "python setup.py build_ext --inplace"
|
||||
- "pip install -e ."
|
||||
script:
|
||||
- "cat /proc/cpuinfo | grep flags | head -n 1"
|
||||
- "python -m pytest --tb=native spacy"
|
||||
branches:
|
||||
except:
|
||||
- spacy.io
|
||||
notifications:
|
||||
slack:
|
||||
secure: F8GvqnweSdzImuLL64TpfG0i5rYl89liyr9tmFVsHl4c0DNiDuGhZivUz0M1broS8svE3OPOllLfQbACG/4KxD890qfF9MoHzvRDlp7U+RtwMV/YAkYn8MGWjPIbRbX0HpGdY7O2Rc9Qy4Kk0T8ZgiqXYIqAz2Eva9/9BlSmsJQ=
|
||||
email: false
|
|
@ -280,23 +280,7 @@ except: # noqa: E722
|
|||
|
||||
### Python conventions
|
||||
|
||||
All Python code must be written in an **intersection of Python 2 and Python 3**.
|
||||
This is easy in Cython, but somewhat ugly in Python. Logic that deals with
|
||||
Python or platform compatibility should only live in
|
||||
[`spacy.compat`](spacy/compat.py). To distinguish them from the builtin
|
||||
functions, replacement functions are suffixed with an underscore, for example
|
||||
`unicode_`. If you need to access the user's version or platform information,
|
||||
for example to show more specific error messages, you can use the `is_config()`
|
||||
helper function.
|
||||
|
||||
```python
|
||||
from .compat import unicode_, is_config
|
||||
|
||||
compatible_unicode = unicode_('hello world')
|
||||
if is_config(windows=True, python2=True):
|
||||
print("You are using Python 2 on Windows.")
|
||||
```
|
||||
|
||||
All Python code must be written **compatible with Python 3.6+**.
|
||||
Code that interacts with the file-system should accept objects that follow the
|
||||
`pathlib.Path` API, without assuming that the object inherits from `pathlib.Path`.
|
||||
If the function is user-facing and takes a path as an argument, it should check
|
||||
|
|
|
@ -1,8 +1,7 @@
|
|||
recursive-include include *.h
|
||||
recursive-include spacy *.txt *.pyx *.pxd
|
||||
recursive-include spacy *.pyx *.pxd *.txt *.cfg
|
||||
include LICENSE
|
||||
include README.md
|
||||
include bin/spacy
|
||||
include pyproject.toml
|
||||
recursive-exclude spacy/lang *.json
|
||||
recursive-include spacy/lang *.json.gz
|
||||
|
|
4
Makefile
4
Makefile
|
@ -5,7 +5,7 @@ VENV := ./env$(PYVER)
|
|||
version := $(shell "bin/get-version.sh")
|
||||
|
||||
dist/spacy-$(version).pex : wheelhouse/spacy-$(version).stamp
|
||||
$(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema spacy-lookups-data jieba pkuseg==0.0.25 sudachipy sudachidict_core
|
||||
$(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) spacy-lookups-data jieba pkuseg==0.0.25 sudachipy sudachidict_core
|
||||
chmod a+rx $@
|
||||
cp $@ dist/spacy.pex
|
||||
|
||||
|
@ -15,7 +15,7 @@ dist/pytest.pex : wheelhouse/pytest-*.whl
|
|||
|
||||
wheelhouse/spacy-$(version).stamp : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py*
|
||||
$(VENV)/bin/pip wheel . -w ./wheelhouse
|
||||
$(VENV)/bin/pip wheel jsonschema spacy-lookups-data jieba pkuseg==0.0.25 sudachipy sudachidict_core -w ./wheelhouse
|
||||
$(VENV)/bin/pip wheel spacy-lookups-data jieba pkuseg==0.0.25 sudachipy sudachidict_core -w ./wheelhouse
|
||||
touch $@
|
||||
|
||||
wheelhouse/pytest-%.whl : $(VENV)/bin/pex
|
||||
|
|
16
README.md
16
README.md
|
@ -15,7 +15,6 @@ It's commercial open-source software, released under the MIT license.
|
|||
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
|
||||
|
||||
[![Azure Pipelines](<https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build+(3.x)>)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
|
||||
[![Travis Build Status](<https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square&logo=travis-ci&logoColor=white&label=build+(2.7)>)](https://travis-ci.org/explosion/spaCy)
|
||||
[![Current Release Version](https://img.shields.io/github/release/explosion/spacy.svg?style=flat-square&logo=github)](https://github.com/explosion/spaCy/releases)
|
||||
[![pypi Version](https://img.shields.io/pypi/v/spacy.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/spacy/)
|
||||
[![conda Version](https://img.shields.io/conda/vn/conda-forge/spacy.svg?style=flat-square&logo=conda-forge&logoColor=white)](https://anaconda.org/conda-forge/spacy)
|
||||
|
@ -98,12 +97,19 @@ For detailed installation instructions, see the
|
|||
|
||||
- **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual
|
||||
Studio)
|
||||
- **Python version**: Python 2.7, 3.5+ (only 64 bit)
|
||||
- **Python version**: Python 3.6+ (only 64 bit)
|
||||
- **Package managers**: [pip] · [conda] (via `conda-forge`)
|
||||
|
||||
[pip]: https://pypi.org/project/spacy/
|
||||
[conda]: https://anaconda.org/conda-forge/spacy
|
||||
|
||||
> ⚠️ **Important note for Python 3.8:** We can't yet ship pre-compiled binary
|
||||
> wheels for spaCy that work on Python 3.8, as we're still waiting for our CI
|
||||
> providers and other tooling to support it. This means that in order to run
|
||||
> spaCy on Python 3.8, you'll need [a compiler installed](#source) and compile
|
||||
> the library and its Cython dependencies locally. If this is causing problems
|
||||
> for you, the easiest solution is to **use Python 3.7** in the meantime.
|
||||
|
||||
### pip
|
||||
|
||||
Using pip, spaCy releases are available as source packages and binary wheels (as
|
||||
|
@ -188,7 +194,7 @@ pip install https://github.com/explosion/spacy-models/releases/download/en_core_
|
|||
|
||||
### Loading and using models
|
||||
|
||||
To load a model, use `spacy.load()` with the model name, a shortcut link or a
|
||||
To load a model, use `spacy.load()` with the model name or a
|
||||
path to the model data directory.
|
||||
|
||||
```python
|
||||
|
@ -263,9 +269,7 @@ and git preinstalled.
|
|||
Install a version of the
|
||||
[Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/)
|
||||
or [Visual Studio Express](https://visualstudio.microsoft.com/vs/express/) that
|
||||
matches the version that was used to compile your Python interpreter. For
|
||||
official distributions these are VS 2008 (Python 2.7), VS 2010 (Python 3.4) and
|
||||
VS 2015 (Python 3.5).
|
||||
matches the version that was used to compile your Python interpreter.
|
||||
|
||||
## Run tests
|
||||
|
||||
|
|
|
@ -27,7 +27,7 @@ jobs:
|
|||
inputs:
|
||||
versionSpec: '3.7'
|
||||
- script: |
|
||||
pip install flake8
|
||||
pip install flake8==3.5.0
|
||||
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
|
||||
displayName: 'flake8'
|
||||
|
||||
|
@ -35,12 +35,6 @@ jobs:
|
|||
dependsOn: 'Validate'
|
||||
strategy:
|
||||
matrix:
|
||||
Python35Linux:
|
||||
imageName: 'ubuntu-16.04'
|
||||
python.version: '3.5'
|
||||
Python35Windows:
|
||||
imageName: 'vs2017-win2016'
|
||||
python.version: '3.5'
|
||||
Python36Linux:
|
||||
imageName: 'ubuntu-16.04'
|
||||
python.version: '3.6'
|
||||
|
@ -58,7 +52,7 @@ jobs:
|
|||
# imageName: 'vs2017-win2016'
|
||||
# python.version: '3.7'
|
||||
# Python37Mac:
|
||||
# imageName: 'macos-10.13'
|
||||
# imageName: 'macos-10.14'
|
||||
# python.version: '3.7'
|
||||
Python38Linux:
|
||||
imageName: 'ubuntu-16.04'
|
||||
|
|
169
bin/cythonize.py
169
bin/cythonize.py
|
@ -1,169 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
""" cythonize.py
|
||||
|
||||
Cythonize pyx files into C++ files as needed.
|
||||
|
||||
Usage: cythonize.py [root]
|
||||
|
||||
Checks pyx files to see if they have been changed relative to their
|
||||
corresponding C++ files. If they have, then runs cython on these files to
|
||||
recreate the C++ files.
|
||||
|
||||
Additionally, checks pxd files and setup.py if they have been changed. If
|
||||
they have, rebuilds everything.
|
||||
|
||||
Change detection based on file hashes stored in JSON format.
|
||||
|
||||
For now, this script should be run by developers when changing Cython files
|
||||
and the resulting C++ files checked in, so that end-users (and Python-only
|
||||
developers) do not get the Cython dependencies.
|
||||
|
||||
Based upon:
|
||||
|
||||
https://raw.github.com/dagss/private-scipy-refactor/cythonize/cythonize.py
|
||||
https://raw.githubusercontent.com/numpy/numpy/master/tools/cythonize.py
|
||||
|
||||
Note: this script does not check any of the dependent C++ libraries.
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import hashlib
|
||||
import subprocess
|
||||
import argparse
|
||||
|
||||
|
||||
HASH_FILE = "cythonize.json"
|
||||
|
||||
|
||||
def process_pyx(fromfile, tofile, language_level="-2"):
|
||||
print("Processing %s" % fromfile)
|
||||
try:
|
||||
from Cython.Compiler.Version import version as cython_version
|
||||
from distutils.version import LooseVersion
|
||||
|
||||
if LooseVersion(cython_version) < LooseVersion("0.19"):
|
||||
raise Exception("Require Cython >= 0.19")
|
||||
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
flags = ["--fast-fail", language_level]
|
||||
if tofile.endswith(".cpp"):
|
||||
flags += ["--cplus"]
|
||||
|
||||
try:
|
||||
try:
|
||||
r = subprocess.call(
|
||||
["cython"] + flags + ["-o", tofile, fromfile], env=os.environ
|
||||
) # See Issue #791
|
||||
if r != 0:
|
||||
raise Exception("Cython failed")
|
||||
except OSError:
|
||||
# There are ways of installing Cython that don't result in a cython
|
||||
# executable on the path, see gh-2397.
|
||||
r = subprocess.call(
|
||||
[
|
||||
sys.executable,
|
||||
"-c",
|
||||
"import sys; from Cython.Compiler.Main import "
|
||||
"setuptools_main as main; sys.exit(main())",
|
||||
]
|
||||
+ flags
|
||||
+ ["-o", tofile, fromfile]
|
||||
)
|
||||
if r != 0:
|
||||
raise Exception("Cython failed")
|
||||
except OSError:
|
||||
raise OSError("Cython needs to be installed")
|
||||
|
||||
|
||||
def preserve_cwd(path, func, *args):
|
||||
orig_cwd = os.getcwd()
|
||||
try:
|
||||
os.chdir(path)
|
||||
func(*args)
|
||||
finally:
|
||||
os.chdir(orig_cwd)
|
||||
|
||||
|
||||
def load_hashes(filename):
|
||||
try:
|
||||
return json.load(open(filename))
|
||||
except (ValueError, IOError):
|
||||
return {}
|
||||
|
||||
|
||||
def save_hashes(hash_db, filename):
|
||||
with open(filename, "w") as f:
|
||||
f.write(json.dumps(hash_db))
|
||||
|
||||
|
||||
def get_hash(path):
|
||||
return hashlib.md5(open(path, "rb").read()).hexdigest()
|
||||
|
||||
|
||||
def hash_changed(base, path, db):
|
||||
full_path = os.path.normpath(os.path.join(base, path))
|
||||
return not get_hash(full_path) == db.get(full_path)
|
||||
|
||||
|
||||
def hash_add(base, path, db):
|
||||
full_path = os.path.normpath(os.path.join(base, path))
|
||||
db[full_path] = get_hash(full_path)
|
||||
|
||||
|
||||
def process(base, filename, db):
|
||||
root, ext = os.path.splitext(filename)
|
||||
if ext in [".pyx", ".cpp"]:
|
||||
if hash_changed(base, filename, db) or not os.path.isfile(
|
||||
os.path.join(base, root + ".cpp")
|
||||
):
|
||||
preserve_cwd(base, process_pyx, root + ".pyx", root + ".cpp")
|
||||
hash_add(base, root + ".cpp", db)
|
||||
hash_add(base, root + ".pyx", db)
|
||||
|
||||
|
||||
def check_changes(root, db):
|
||||
res = False
|
||||
new_db = {}
|
||||
|
||||
setup_filename = "setup.py"
|
||||
hash_add(".", setup_filename, new_db)
|
||||
if hash_changed(".", setup_filename, db):
|
||||
res = True
|
||||
|
||||
for base, _, files in os.walk(root):
|
||||
for filename in files:
|
||||
if filename.endswith(".pxd"):
|
||||
hash_add(base, filename, new_db)
|
||||
if hash_changed(base, filename, db):
|
||||
res = True
|
||||
|
||||
if res:
|
||||
db.clear()
|
||||
db.update(new_db)
|
||||
return res
|
||||
|
||||
|
||||
def run(root):
|
||||
db = load_hashes(HASH_FILE)
|
||||
|
||||
try:
|
||||
check_changes(root, db)
|
||||
for base, _, files in os.walk(root):
|
||||
for filename in files:
|
||||
process(base, filename, db)
|
||||
finally:
|
||||
save_hashes(db, HASH_FILE)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Cythonize pyx files into C++ files as needed"
|
||||
)
|
||||
parser.add_argument("root", help="root directory")
|
||||
args = parser.parse_args()
|
||||
run(args.root)
|
|
@ -12,11 +12,11 @@ from ud_train import write_conllu
|
|||
from spacy.lang.lex_attrs import word_shape
|
||||
from spacy.util import get_lang_class
|
||||
|
||||
# All languages in spaCy - in UD format (note that Norwegian is 'no' instead of 'nb')
|
||||
ALL_LANGUAGES = ("af, ar, bg, bn, ca, cs, da, de, el, en, es, et, fa, fi, fr,"
|
||||
"ga, he, hi, hr, hu, id, is, it, ja, kn, ko, lt, lv, mr, no,"
|
||||
# All languages in spaCy format (note that Norwegian is 'no' in UD - gets remapped later)
|
||||
ALL_LANGUAGES = ("af, ar, bg, bn, ca, cs, da, de, el, en, es, et, eu, fa, fi, fr,"
|
||||
"ga, gu, he, hi, hr, hu, hy, id, is, it, ja, kn, ko, lb, lij, lt, lv, ml, mr, nb,"
|
||||
"nl, pl, pt, ro, ru, si, sk, sl, sq, sr, sv, ta, te, th, tl,"
|
||||
"tr, tt, uk, ur, vi, zh")
|
||||
"tr, tt, uk, ur, vi, yo, zh")
|
||||
|
||||
# Non-parsing tasks that will be evaluated (works for default models)
|
||||
EVAL_NO_PARSE = ['Tokens', 'Words', 'Lemmas', 'Sentences', 'Feats']
|
||||
|
@ -251,39 +251,43 @@ def main(out_path, ud_dir, check_parse=False, langs=ALL_LANGUAGES, exclude_train
|
|||
|
||||
# initialize all models with the multi-lang model
|
||||
for lang in languages:
|
||||
models[lang] = [multi] if multi else []
|
||||
# add default models if we don't want to evaluate parsing info
|
||||
if not check_parse:
|
||||
# Norwegian is 'nb' in spaCy but 'no' in the UD corpora
|
||||
if lang == 'no':
|
||||
models['no'].append(load_default_model_sentencizer('nb'))
|
||||
else:
|
||||
models[lang].append(load_default_model_sentencizer(lang))
|
||||
UD_lang = lang
|
||||
# Norwegian is 'nb' in spaCy but 'no' in the UD corpora
|
||||
if lang == "nb":
|
||||
UD_lang = "no"
|
||||
try:
|
||||
models[UD_lang] = [multi] if multi else []
|
||||
# add default models if we don't want to evaluate parsing info
|
||||
if not check_parse:
|
||||
models[UD_lang].append(load_default_model_sentencizer(lang))
|
||||
except:
|
||||
print(f"Exception initializing lang {lang} - skipping")
|
||||
|
||||
# language-specific trained models
|
||||
if not exclude_trained_models:
|
||||
if 'de' in models:
|
||||
models['de'].append(load_model('de_core_news_sm'))
|
||||
models['de'].append(load_model('de_core_news_md'))
|
||||
if 'el' in models:
|
||||
models['el'].append(load_model('el_core_news_sm'))
|
||||
models['el'].append(load_model('el_core_news_md'))
|
||||
if 'en' in models:
|
||||
models['en'].append(load_model('en_core_web_sm'))
|
||||
models['en'].append(load_model('en_core_web_md'))
|
||||
models['en'].append(load_model('en_core_web_lg'))
|
||||
if 'es' in models:
|
||||
models['es'].append(load_model('es_core_news_sm'))
|
||||
models['es'].append(load_model('es_core_news_md'))
|
||||
if 'fr' in models:
|
||||
models['fr'].append(load_model('fr_core_news_sm'))
|
||||
models['fr'].append(load_model('fr_core_news_md'))
|
||||
if 'it' in models:
|
||||
models['it'].append(load_model('it_core_news_sm'))
|
||||
if 'nl' in models:
|
||||
models['nl'].append(load_model('nl_core_news_sm'))
|
||||
if 'pt' in models:
|
||||
models['pt'].append(load_model('pt_core_news_sm'))
|
||||
news_languages = ["da", "de", "el", "es", "fr", "it", "ja", "lt", "nb", "nl", "pl", "pt", "ro"]
|
||||
news_languages = ["nb"]
|
||||
web_languages = ["en", "zh"]
|
||||
sizes = ["sm", "md", "lg"]
|
||||
for lang in web_languages:
|
||||
UD_lang = lang
|
||||
for size in sizes:
|
||||
model_name = f'{lang}_core_web_{size}'
|
||||
try:
|
||||
models[UD_lang].append(load_model(model_name))
|
||||
except Exception as e:
|
||||
print(f"Error loading {model_name}: {e}")
|
||||
|
||||
for lang in news_languages:
|
||||
UD_lang = lang
|
||||
if lang == "nb":
|
||||
UD_lang = "no"
|
||||
for size in sizes:
|
||||
model_name = f'{lang}_core_news_{size}'
|
||||
try:
|
||||
models[UD_lang].append(load_model(model_name))
|
||||
except Exception as e:
|
||||
print(f"Error loading {model_name}: {e}")
|
||||
|
||||
with out_path.open(mode='w', encoding='utf-8') as out_file:
|
||||
run_all_evals(models, treebanks, out_file, check_parse, print_freq_tasks)
|
||||
|
|
|
@ -13,23 +13,12 @@ import srsly
|
|||
import spacy
|
||||
import spacy.util
|
||||
from spacy.tokens import Token, Doc
|
||||
from spacy.gold import GoldParse
|
||||
from spacy.util import compounding, minibatch_by_words
|
||||
from spacy.syntax.nonproj import projectivize
|
||||
from spacy.matcher import Matcher
|
||||
|
||||
# from spacy.morphology import Fused_begin, Fused_inside
|
||||
from spacy import displacy
|
||||
from collections import defaultdict, Counter
|
||||
from timeit import default_timer as timer
|
||||
|
||||
Fused_begin = None
|
||||
Fused_inside = None
|
||||
|
||||
import itertools
|
||||
import random
|
||||
import numpy.random
|
||||
|
||||
from . import conll17_ud_eval
|
||||
|
||||
from spacy import lang
|
||||
|
@ -268,7 +257,7 @@ def load_nlp(experiments_dir, corpus):
|
|||
return nlp
|
||||
|
||||
|
||||
def initialize_pipeline(nlp, docs, golds, config, device):
|
||||
def initialize_pipeline(nlp, examples, config, device):
|
||||
nlp.add_pipe(nlp.create_pipe("parser"))
|
||||
return nlp
|
||||
|
||||
|
|
|
@ -14,7 +14,7 @@ import spacy
|
|||
import spacy.util
|
||||
from bin.ud import conll17_ud_eval
|
||||
from spacy.tokens import Token, Doc
|
||||
from spacy.gold import GoldParse
|
||||
from spacy.gold import Example
|
||||
from spacy.util import compounding, minibatch, minibatch_by_words
|
||||
from spacy.syntax.nonproj import projectivize
|
||||
from spacy.matcher import Matcher
|
||||
|
@ -53,7 +53,7 @@ def read_data(
|
|||
max_doc_length=None,
|
||||
limit=None,
|
||||
):
|
||||
"""Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
|
||||
"""Read the CONLLU format into Example objects. If raw_text=True,
|
||||
include Doc objects created using nlp.make_doc and then aligned against
|
||||
the gold-standard sequences. If oracle_segments=True, include Doc objects
|
||||
created from the gold-standard segments. At least one must be True."""
|
||||
|
@ -78,47 +78,41 @@ def read_data(
|
|||
head = int(head) - 1 if head != "0" else id_
|
||||
sent["words"].append(word)
|
||||
sent["tags"].append(tag)
|
||||
sent["morphology"].append(_parse_morph_string(morph))
|
||||
sent["morphology"][-1].add("POS_%s" % pos)
|
||||
sent["morphs"].append(_compile_morph_string(morph, pos))
|
||||
sent["heads"].append(head)
|
||||
sent["deps"].append("ROOT" if dep == "root" else dep)
|
||||
sent["spaces"].append(space_after == "_")
|
||||
sent["entities"] = ["-"] * len(sent["words"])
|
||||
sent["entities"] = ["-"] * len(sent["words"]) # TODO: doc-level format
|
||||
sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"])
|
||||
if oracle_segments:
|
||||
docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
|
||||
golds.append(GoldParse(docs[-1], **sent))
|
||||
assert golds[-1].morphology is not None
|
||||
golds.append(sent)
|
||||
assert golds[-1]["morphs"] is not None
|
||||
|
||||
sent_annots.append(sent)
|
||||
if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:
|
||||
doc, gold = _make_gold(nlp, None, sent_annots)
|
||||
assert gold.morphology is not None
|
||||
assert gold["morphs"] is not None
|
||||
sent_annots = []
|
||||
docs.append(doc)
|
||||
golds.append(gold)
|
||||
if limit and len(docs) >= limit:
|
||||
return docs, golds
|
||||
return golds_to_gold_data(docs, golds)
|
||||
|
||||
if raw_text and sent_annots:
|
||||
doc, gold = _make_gold(nlp, None, sent_annots)
|
||||
docs.append(doc)
|
||||
golds.append(gold)
|
||||
if limit and len(docs) >= limit:
|
||||
return docs, golds
|
||||
return docs, golds
|
||||
return golds_to_gold_data(docs, golds)
|
||||
return golds_to_gold_data(docs, golds)
|
||||
|
||||
def _parse_morph_string(morph_string):
|
||||
|
||||
def _compile_morph_string(morph_string, pos):
|
||||
if morph_string == '_':
|
||||
return set()
|
||||
output = []
|
||||
replacements = {'1': 'one', '2': 'two', '3': 'three'}
|
||||
for feature in morph_string.split('|'):
|
||||
key, value = feature.split('=')
|
||||
value = replacements.get(value, value)
|
||||
value = value.split(',')[0]
|
||||
output.append('%s_%s' % (key, value.lower()))
|
||||
return set(output)
|
||||
return f"POS={pos}"
|
||||
return morph_string + f"|POS={pos}"
|
||||
|
||||
|
||||
def read_conllu(file_):
|
||||
docs = []
|
||||
|
@ -149,28 +143,27 @@ def read_conllu(file_):
|
|||
|
||||
def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
|
||||
# Flatten the conll annotations, and adjust the head indices
|
||||
flat = defaultdict(list)
|
||||
gold = defaultdict(list)
|
||||
sent_starts = []
|
||||
for sent in sent_annots:
|
||||
flat["heads"].extend(len(flat["words"])+head for head in sent["heads"])
|
||||
for field in ["words", "tags", "deps", "morphology", "entities", "spaces"]:
|
||||
flat[field].extend(sent[field])
|
||||
gold["heads"].extend(len(gold["words"])+head for head in sent["heads"])
|
||||
for field in ["words", "tags", "deps", "morphs", "entities", "spaces"]:
|
||||
gold[field].extend(sent[field])
|
||||
sent_starts.append(True)
|
||||
sent_starts.extend([False] * (len(sent["words"]) - 1))
|
||||
# Construct text if necessary
|
||||
assert len(flat["words"]) == len(flat["spaces"])
|
||||
assert len(gold["words"]) == len(gold["spaces"])
|
||||
if text is None:
|
||||
text = "".join(
|
||||
word + " " * space for word, space in zip(flat["words"], flat["spaces"])
|
||||
word + " " * space for word, space in zip(gold["words"], gold["spaces"])
|
||||
)
|
||||
doc = nlp.make_doc(text)
|
||||
flat.pop("spaces")
|
||||
gold = GoldParse(doc, **flat)
|
||||
gold.sent_starts = sent_starts
|
||||
for i in range(len(gold.heads)):
|
||||
gold.pop("spaces")
|
||||
gold["sent_starts"] = sent_starts
|
||||
for i in range(len(gold["heads"])):
|
||||
if random.random() < drop_deps:
|
||||
gold.heads[i] = None
|
||||
gold.labels[i] = None
|
||||
gold["heads"][i] = None
|
||||
gold["labels"][i] = None
|
||||
|
||||
return doc, gold
|
||||
|
||||
|
@ -180,16 +173,13 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
|
|||
#############################
|
||||
|
||||
|
||||
def golds_to_gold_tuples(docs, golds):
|
||||
"""Get out the annoying 'tuples' format used by begin_training, given the
|
||||
GoldParse objects."""
|
||||
tuples = []
|
||||
def golds_to_gold_data(docs, golds):
|
||||
"""Get out the training data format used by begin_training"""
|
||||
data = []
|
||||
for doc, gold in zip(docs, golds):
|
||||
text = doc.text
|
||||
ids, words, tags, heads, labels, iob = zip(*gold.orig_annot)
|
||||
sents = [((ids, words, tags, heads, labels, iob), [])]
|
||||
tuples.append((text, sents))
|
||||
return tuples
|
||||
example = Example.from_dict(doc, dict(gold))
|
||||
data.append(example)
|
||||
return data
|
||||
|
||||
|
||||
##############
|
||||
|
@ -313,7 +303,9 @@ def get_token_conllu(token, i):
|
|||
feat_str = []
|
||||
replacements = {"one": "1", "two": "2", "three": "3"}
|
||||
for feat in features:
|
||||
if not feat.startswith("begin") and not feat.startswith("end"):
|
||||
if "=" in feat:
|
||||
feat_str.append(feat)
|
||||
elif not feat.startswith("begin") and not feat.startswith("end"):
|
||||
key, value = feat.split("_", 1)
|
||||
value = replacements.get(value, value)
|
||||
feat_str.append("%s=%s" % (key, value.title()))
|
||||
|
@ -327,7 +319,6 @@ def get_token_conllu(token, i):
|
|||
return "\n".join(lines)
|
||||
|
||||
|
||||
|
||||
##################
|
||||
# Initialization #
|
||||
##################
|
||||
|
@ -348,7 +339,7 @@ def load_nlp(corpus, config, vectors=None):
|
|||
return nlp
|
||||
|
||||
|
||||
def initialize_pipeline(nlp, docs, golds, config, device):
|
||||
def initialize_pipeline(nlp, examples, config, device):
|
||||
nlp.add_pipe(nlp.create_pipe("tagger", config={"set_morphology": False}))
|
||||
nlp.add_pipe(nlp.create_pipe("morphologizer"))
|
||||
nlp.add_pipe(nlp.create_pipe("parser"))
|
||||
|
@ -356,14 +347,14 @@ def initialize_pipeline(nlp, docs, golds, config, device):
|
|||
nlp.parser.add_multitask_objective("tag")
|
||||
if config.multitask_sent:
|
||||
nlp.parser.add_multitask_objective("sent_start")
|
||||
for gold in golds:
|
||||
for tag in gold.tags:
|
||||
for eg in examples:
|
||||
for tag in eg.get_aligned("TAG", as_string=True):
|
||||
if tag is not None:
|
||||
nlp.tagger.add_label(tag)
|
||||
if torch is not None and device != -1:
|
||||
torch.set_default_tensor_type("torch.cuda.FloatTensor")
|
||||
optimizer = nlp.begin_training(
|
||||
lambda: golds_to_gold_tuples(docs, golds),
|
||||
lambda: examples,
|
||||
device=device,
|
||||
subword_features=config.subword_features,
|
||||
conv_depth=config.conv_depth,
|
||||
|
@ -382,8 +373,8 @@ def _load_pretrained_tok2vec(nlp, loc):
|
|||
weights_data = file_.read()
|
||||
loaded = []
|
||||
for name, component in nlp.pipeline:
|
||||
if hasattr(component, "model") and hasattr(component.model, "tok2vec"):
|
||||
component.tok2vec.from_bytes(weights_data)
|
||||
if hasattr(component, "model") and component.model.has_ref("tok2vec"):
|
||||
component.get_ref("tok2vec").from_bytes(weights_data)
|
||||
loaded.append(name)
|
||||
return loaded
|
||||
|
||||
|
@ -505,7 +496,7 @@ def main(
|
|||
print("Train and evaluate", corpus, "using lang", paths.lang)
|
||||
nlp = load_nlp(paths.lang, config, vectors=vectors_dir)
|
||||
|
||||
docs, golds = read_data(
|
||||
examples = read_data(
|
||||
nlp,
|
||||
paths.train.conllu.open(encoding="utf8"),
|
||||
paths.train.text.open(encoding="utf8"),
|
||||
|
@ -513,12 +504,12 @@ def main(
|
|||
limit=limit,
|
||||
)
|
||||
|
||||
optimizer = initialize_pipeline(nlp, docs, golds, config, gpu_device)
|
||||
optimizer = initialize_pipeline(nlp, examples, config, gpu_device)
|
||||
|
||||
batch_sizes = compounding(config.min_batch_size, config.max_batch_size, 1.001)
|
||||
beam_prob = compounding(0.2, 0.8, 1.001)
|
||||
for i in range(config.nr_epoch):
|
||||
docs, golds = read_data(
|
||||
examples = read_data(
|
||||
nlp,
|
||||
paths.train.conllu.open(encoding="utf8"),
|
||||
paths.train.text.open(encoding="utf8"),
|
||||
|
@ -527,22 +518,19 @@ def main(
|
|||
oracle_segments=use_oracle_segments,
|
||||
raw_text=not use_oracle_segments,
|
||||
)
|
||||
Xs = list(zip(docs, golds))
|
||||
random.shuffle(Xs)
|
||||
random.shuffle(examples)
|
||||
if config.batch_by_words:
|
||||
batches = minibatch_by_words(Xs, size=batch_sizes)
|
||||
batches = minibatch_by_words(examples, size=batch_sizes)
|
||||
else:
|
||||
batches = minibatch(Xs, size=batch_sizes)
|
||||
batches = minibatch(examples, size=batch_sizes)
|
||||
losses = {}
|
||||
n_train_words = sum(len(doc) for doc in docs)
|
||||
n_train_words = sum(len(eg.predicted) for eg in examples)
|
||||
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
|
||||
for batch in batches:
|
||||
batch_docs, batch_gold = zip(*batch)
|
||||
pbar.update(sum(len(doc) for doc in batch_docs))
|
||||
pbar.update(sum(len(ex.predicted) for ex in batch))
|
||||
nlp.parser.cfg["beam_update_prob"] = next(beam_prob)
|
||||
nlp.update(
|
||||
batch_docs,
|
||||
batch_gold,
|
||||
batch,
|
||||
sgd=optimizer,
|
||||
drop=config.dropout,
|
||||
losses=losses,
|
||||
|
|
|
@ -14,7 +14,7 @@ pip install keras==2.0.9
|
|||
|
||||
Compatible with: spaCy v2.0.0+
|
||||
"""
|
||||
|
||||
import ml_datasets
|
||||
import plac
|
||||
import random
|
||||
import pathlib
|
||||
|
@ -24,7 +24,6 @@ from keras.models import Sequential, model_from_json
|
|||
from keras.layers import LSTM, Dense, Embedding, Bidirectional
|
||||
from keras.layers import TimeDistributed
|
||||
from keras.optimizers import Adam
|
||||
import thinc.extra.datasets
|
||||
from spacy.compat import pickle
|
||||
import spacy
|
||||
|
||||
|
@ -224,7 +223,7 @@ def main(
|
|||
if model_dir is not None:
|
||||
model_dir = pathlib.Path(model_dir)
|
||||
if train_dir is None or dev_dir is None:
|
||||
imdb_data = thinc.extra.datasets.imdb()
|
||||
imdb_data = ml_datasets.imdb()
|
||||
if is_runtime:
|
||||
if dev_dir is None:
|
||||
dev_texts, dev_labels = zip(*imdb_data[1])
|
||||
|
|
119
examples/experiments/onto-joint/defaults.cfg
Normal file
119
examples/experiments/onto-joint/defaults.cfg
Normal file
|
@ -0,0 +1,119 @@
|
|||
# Training hyper-parameters and additional features.
|
||||
[training]
|
||||
# Whether to train on sequences with 'gold standard' sentence boundaries
|
||||
# and tokens. If you set this to true, take care to ensure your run-time
|
||||
# data is passed in sentence-by-sentence via some prior preprocessing.
|
||||
gold_preproc = false
|
||||
# Limitations on training document length or number of examples.
|
||||
max_length = 5000
|
||||
limit = 0
|
||||
# Data augmentation
|
||||
orth_variant_level = 0.0
|
||||
dropout = 0.1
|
||||
# Controls early-stopping. 0 or -1 mean unlimited.
|
||||
patience = 1600
|
||||
max_epochs = 0
|
||||
max_steps = 20000
|
||||
eval_frequency = 200
|
||||
# Other settings
|
||||
seed = 0
|
||||
accumulate_gradient = 1
|
||||
use_pytorch_for_gpu_memory = false
|
||||
# Control how scores are printed and checkpoints are evaluated.
|
||||
scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
|
||||
score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
|
||||
# These settings are invalid for the transformer models.
|
||||
init_tok2vec = null
|
||||
discard_oversize = false
|
||||
omit_extra_lookups = false
|
||||
batch_by = "words"
|
||||
use_gpu = -1
|
||||
raw_text = null
|
||||
tag_map = null
|
||||
|
||||
[training.batch_size]
|
||||
@schedules = "compounding.v1"
|
||||
start = 1000
|
||||
stop = 1000
|
||||
compound = 1.001
|
||||
|
||||
[training.optimizer]
|
||||
@optimizers = "Adam.v1"
|
||||
beta1 = 0.9
|
||||
beta2 = 0.999
|
||||
L2_is_weight_decay = true
|
||||
L2 = 0.01
|
||||
grad_clip = 1.0
|
||||
use_averages = false
|
||||
eps = 1e-8
|
||||
#learn_rate = 0.001
|
||||
|
||||
[training.optimizer.learn_rate]
|
||||
@schedules = "warmup_linear.v1"
|
||||
warmup_steps = 250
|
||||
total_steps = 20000
|
||||
initial_rate = 0.001
|
||||
|
||||
[nlp]
|
||||
lang = "en"
|
||||
base_model = null
|
||||
vectors = null
|
||||
|
||||
[nlp.pipeline]
|
||||
|
||||
[nlp.pipeline.tok2vec]
|
||||
factory = "tok2vec"
|
||||
|
||||
|
||||
[nlp.pipeline.ner]
|
||||
factory = "ner"
|
||||
learn_tokens = false
|
||||
min_action_freq = 1
|
||||
|
||||
[nlp.pipeline.tagger]
|
||||
factory = "tagger"
|
||||
|
||||
[nlp.pipeline.parser]
|
||||
factory = "parser"
|
||||
learn_tokens = false
|
||||
min_action_freq = 30
|
||||
|
||||
[nlp.pipeline.tagger.model]
|
||||
@architectures = "spacy.Tagger.v1"
|
||||
|
||||
[nlp.pipeline.tagger.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.parser.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 8
|
||||
hidden_width = 128
|
||||
maxout_pieces = 2
|
||||
use_upper = true
|
||||
|
||||
[nlp.pipeline.parser.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.ner.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 3
|
||||
hidden_width = 128
|
||||
maxout_pieces = 2
|
||||
use_upper = true
|
||||
|
||||
[nlp.pipeline.ner.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.tok2vec.model]
|
||||
@architectures = "spacy.HashEmbedCNN.v1"
|
||||
pretrained_vectors = ${nlp:vectors}
|
||||
width = 128
|
||||
depth = 4
|
||||
window_size = 1
|
||||
embed_size = 7000
|
||||
maxout_pieces = 3
|
||||
subword_features = true
|
||||
dropout = ${training:dropout}
|
152
examples/experiments/onto-joint/pretrain.cfg
Normal file
152
examples/experiments/onto-joint/pretrain.cfg
Normal file
|
@ -0,0 +1,152 @@
|
|||
# Training hyper-parameters and additional features.
|
||||
[training]
|
||||
# Whether to train on sequences with 'gold standard' sentence boundaries
|
||||
# and tokens. If you set this to true, take care to ensure your run-time
|
||||
# data is passed in sentence-by-sentence via some prior preprocessing.
|
||||
gold_preproc = false
|
||||
# Limitations on training document length or number of examples.
|
||||
max_length = 0
|
||||
limit = 0
|
||||
# Data augmentation
|
||||
orth_variant_level = 0.0
|
||||
dropout = 0.1
|
||||
# Controls early-stopping. 0 or -1 mean unlimited.
|
||||
patience = 1600
|
||||
max_epochs = 0
|
||||
max_steps = 20000
|
||||
eval_frequency = 400
|
||||
# Other settings
|
||||
seed = 0
|
||||
accumulate_gradient = 1
|
||||
use_pytorch_for_gpu_memory = false
|
||||
# Control how scores are printed and checkpoints are evaluated.
|
||||
scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
|
||||
score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
|
||||
# These settings are invalid for the transformer models.
|
||||
init_tok2vec = null
|
||||
discard_oversize = false
|
||||
omit_extra_lookups = false
|
||||
batch_by = "words"
|
||||
use_gpu = -1
|
||||
raw_text = null
|
||||
tag_map = null
|
||||
|
||||
[training.batch_size]
|
||||
@schedules = "compounding.v1"
|
||||
start = 1000
|
||||
stop = 1000
|
||||
compound = 1.001
|
||||
|
||||
[training.optimizer]
|
||||
@optimizers = "Adam.v1"
|
||||
beta1 = 0.9
|
||||
beta2 = 0.999
|
||||
L2_is_weight_decay = true
|
||||
L2 = 0.01
|
||||
grad_clip = 1.0
|
||||
use_averages = true
|
||||
eps = 1e-8
|
||||
learn_rate = 0.001
|
||||
|
||||
[pretraining]
|
||||
max_epochs = 1000
|
||||
min_length = 5
|
||||
max_length = 500
|
||||
dropout = 0.2
|
||||
n_save_every = null
|
||||
batch_size = 3000
|
||||
seed = ${training:seed}
|
||||
use_pytorch_for_gpu_memory = ${training:use_pytorch_for_gpu_memory}
|
||||
tok2vec_model = "nlp.pipeline.tok2vec.model"
|
||||
|
||||
[pretraining.objective]
|
||||
type = "characters"
|
||||
n_characters = 4
|
||||
|
||||
[pretraining.optimizer]
|
||||
@optimizers = "Adam.v1"
|
||||
beta1 = 0.9
|
||||
beta2 = 0.999
|
||||
L2_is_weight_decay = true
|
||||
L2 = 0.01
|
||||
grad_clip = 1.0
|
||||
use_averages = true
|
||||
eps = 1e-8
|
||||
learn_rate = 0.001
|
||||
|
||||
[nlp]
|
||||
lang = "en"
|
||||
vectors = null
|
||||
base_model = null
|
||||
|
||||
[nlp.pipeline]
|
||||
|
||||
[nlp.pipeline.tok2vec]
|
||||
factory = "tok2vec"
|
||||
|
||||
[nlp.pipeline.senter]
|
||||
factory = "senter"
|
||||
|
||||
[nlp.pipeline.ner]
|
||||
factory = "ner"
|
||||
learn_tokens = false
|
||||
min_action_freq = 1
|
||||
beam_width = 1
|
||||
beam_update_prob = 1.0
|
||||
|
||||
[nlp.pipeline.tagger]
|
||||
factory = "tagger"
|
||||
|
||||
[nlp.pipeline.parser]
|
||||
factory = "parser"
|
||||
learn_tokens = false
|
||||
min_action_freq = 1
|
||||
beam_width = 1
|
||||
beam_update_prob = 1.0
|
||||
|
||||
[nlp.pipeline.senter.model]
|
||||
@architectures = "spacy.Tagger.v1"
|
||||
|
||||
[nlp.pipeline.senter.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.tagger.model]
|
||||
@architectures = "spacy.Tagger.v1"
|
||||
|
||||
[nlp.pipeline.tagger.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.parser.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 8
|
||||
hidden_width = 128
|
||||
maxout_pieces = 3
|
||||
use_upper = false
|
||||
|
||||
[nlp.pipeline.parser.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.ner.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 3
|
||||
hidden_width = 128
|
||||
maxout_pieces = 3
|
||||
use_upper = false
|
||||
|
||||
[nlp.pipeline.ner.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.tok2vec.model]
|
||||
@architectures = "spacy.HashEmbedCNN.v1"
|
||||
pretrained_vectors = ${nlp:vectors}
|
||||
width = 256
|
||||
depth = 6
|
||||
window_size = 1
|
||||
embed_size = 10000
|
||||
maxout_pieces = 3
|
||||
subword_features = true
|
||||
dropout = null
|
73
examples/experiments/onto-ner.cfg
Normal file
73
examples/experiments/onto-ner.cfg
Normal file
|
@ -0,0 +1,73 @@
|
|||
# Training hyper-parameters and additional features.
|
||||
[training]
|
||||
# Whether to train on sequences with 'gold standard' sentence boundaries
|
||||
# and tokens. If you set this to true, take care to ensure your run-time
|
||||
# data is passed in sentence-by-sentence via some prior preprocessing.
|
||||
gold_preproc = false
|
||||
# Limitations on training document length or number of examples.
|
||||
max_length = 3000
|
||||
limit = 0
|
||||
# Data augmentation
|
||||
orth_variant_level = 0.0
|
||||
dropout = 0.1
|
||||
# Controls early-stopping. 0 or -1 mean unlimited.
|
||||
patience = 100000
|
||||
max_epochs = 0
|
||||
max_steps = 0
|
||||
eval_frequency = 1000
|
||||
# Other settings
|
||||
seed = 0
|
||||
accumulate_gradient = 1
|
||||
use_pytorch_for_gpu_memory = false
|
||||
# Control how scores are printed and checkpoints are evaluated.
|
||||
scores = ["speed", "ents_p", "ents_r", "ents_f"]
|
||||
score_weights = {"ents_f": 1.0}
|
||||
# These settings are invalid for the transformer models.
|
||||
init_tok2vec = null
|
||||
discard_oversize = false
|
||||
omit_extra_lookups = false
|
||||
batch_by = "words"
|
||||
|
||||
[training.batch_size]
|
||||
@schedules = "compounding.v1"
|
||||
start = 100
|
||||
stop = 1000
|
||||
compound = 1.001
|
||||
|
||||
[training.optimizer]
|
||||
@optimizers = "Adam.v1"
|
||||
beta1 = 0.9
|
||||
beta2 = 0.999
|
||||
L2_is_weight_decay = true
|
||||
L2 = 0.01
|
||||
grad_clip = 1.0
|
||||
use_averages = true
|
||||
eps = 1e-8
|
||||
learn_rate = 0.001
|
||||
|
||||
[nlp]
|
||||
lang = "en"
|
||||
vectors = null
|
||||
|
||||
[nlp.pipeline.ner]
|
||||
factory = "ner"
|
||||
learn_tokens = false
|
||||
min_action_freq = 1
|
||||
|
||||
[nlp.pipeline.ner.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 3
|
||||
hidden_width = 64
|
||||
maxout_pieces = 2
|
||||
use_upper = true
|
||||
|
||||
[nlp.pipeline.ner.model.tok2vec]
|
||||
@architectures = "spacy.HashEmbedCNN.v1"
|
||||
pretrained_vectors = ${nlp:vectors}
|
||||
width = 96
|
||||
depth = 4
|
||||
window_size = 1
|
||||
embed_size = 2000
|
||||
maxout_pieces = 3
|
||||
subword_features = true
|
||||
dropout = ${training:dropout}
|
73
examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
Normal file
73
examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
Normal file
|
@ -0,0 +1,73 @@
|
|||
[training]
|
||||
patience = 10000
|
||||
eval_frequency = 200
|
||||
dropout = 0.2
|
||||
init_tok2vec = null
|
||||
vectors = null
|
||||
max_epochs = 100
|
||||
orth_variant_level = 0.0
|
||||
gold_preproc = true
|
||||
max_length = 0
|
||||
use_gpu = 0
|
||||
scores = ["tags_acc", "uas", "las"]
|
||||
score_weights = {"las": 0.8, "tags_acc": 0.2}
|
||||
limit = 0
|
||||
seed = 0
|
||||
accumulate_gradient = 2
|
||||
discard_oversize = false
|
||||
|
||||
[training.batch_size]
|
||||
@schedules = "compounding.v1"
|
||||
start = 100
|
||||
stop = 1000
|
||||
compound = 1.001
|
||||
|
||||
[training.optimizer]
|
||||
@optimizers = "Adam.v1"
|
||||
learn_rate = 0.001
|
||||
beta1 = 0.9
|
||||
beta2 = 0.999
|
||||
|
||||
[nlp]
|
||||
lang = "en"
|
||||
vectors = ${training:vectors}
|
||||
|
||||
[nlp.pipeline.tok2vec]
|
||||
factory = "tok2vec"
|
||||
|
||||
[nlp.pipeline.tagger]
|
||||
factory = "tagger"
|
||||
|
||||
[nlp.pipeline.parser]
|
||||
factory = "parser"
|
||||
learn_tokens = false
|
||||
min_action_freq = 1
|
||||
beam_width = 1
|
||||
beam_update_prob = 1.0
|
||||
|
||||
[nlp.pipeline.tagger.model]
|
||||
@architectures = "spacy.Tagger.v1"
|
||||
|
||||
[nlp.pipeline.tagger.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.parser.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 8
|
||||
hidden_width = 64
|
||||
maxout_pieces = 3
|
||||
|
||||
[nlp.pipeline.parser.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.tok2vec.model]
|
||||
@architectures = "spacy.HashEmbedBiLSTM.v1"
|
||||
pretrained_vectors = ${nlp:vectors}
|
||||
width = 96
|
||||
depth = 4
|
||||
embed_size = 2000
|
||||
subword_features = true
|
||||
maxout_pieces = 3
|
||||
dropout = null
|
74
examples/experiments/ptb-joint-pos-dep/defaults.cfg
Normal file
74
examples/experiments/ptb-joint-pos-dep/defaults.cfg
Normal file
|
@ -0,0 +1,74 @@
|
|||
[training]
|
||||
patience = 10000
|
||||
eval_frequency = 200
|
||||
dropout = 0.2
|
||||
init_tok2vec = null
|
||||
vectors = null
|
||||
max_epochs = 100
|
||||
orth_variant_level = 0.0
|
||||
gold_preproc = true
|
||||
max_length = 0
|
||||
use_gpu = -1
|
||||
scores = ["tags_acc", "uas", "las"]
|
||||
score_weights = {"las": 0.8, "tags_acc": 0.2}
|
||||
limit = 0
|
||||
seed = 0
|
||||
accumulate_gradient = 2
|
||||
discard_oversize = false
|
||||
|
||||
[training.batch_size]
|
||||
@schedules = "compounding.v1"
|
||||
start = 100
|
||||
stop = 1000
|
||||
compound = 1.001
|
||||
|
||||
[training.optimizer]
|
||||
@optimizers = "Adam.v1"
|
||||
learn_rate = 0.001
|
||||
beta1 = 0.9
|
||||
beta2 = 0.999
|
||||
|
||||
[nlp]
|
||||
lang = "en"
|
||||
vectors = ${training:vectors}
|
||||
|
||||
[nlp.pipeline.tok2vec]
|
||||
factory = "tok2vec"
|
||||
|
||||
[nlp.pipeline.tagger]
|
||||
factory = "tagger"
|
||||
|
||||
[nlp.pipeline.parser]
|
||||
factory = "parser"
|
||||
learn_tokens = false
|
||||
min_action_freq = 1
|
||||
beam_width = 1
|
||||
beam_update_prob = 1.0
|
||||
|
||||
[nlp.pipeline.tagger.model]
|
||||
@architectures = "spacy.Tagger.v1"
|
||||
|
||||
[nlp.pipeline.tagger.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.parser.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 8
|
||||
hidden_width = 64
|
||||
maxout_pieces = 3
|
||||
|
||||
[nlp.pipeline.parser.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.tok2vec.model]
|
||||
@architectures = "spacy.HashEmbedCNN.v1"
|
||||
pretrained_vectors = ${nlp:vectors}
|
||||
width = 96
|
||||
depth = 4
|
||||
window_size = 1
|
||||
embed_size = 2000
|
||||
maxout_pieces = 3
|
||||
subword_features = true
|
||||
dropout = null
|
69
examples/experiments/tok2vec-ner/charembed_tok2vec.cfg
Normal file
69
examples/experiments/tok2vec-ner/charembed_tok2vec.cfg
Normal file
|
@ -0,0 +1,69 @@
|
|||
[training]
|
||||
use_gpu = -1
|
||||
limit = 0
|
||||
dropout = 0.2
|
||||
patience = 10000
|
||||
eval_frequency = 200
|
||||
scores = ["ents_f"]
|
||||
score_weights = {"ents_f": 1}
|
||||
orth_variant_level = 0.0
|
||||
gold_preproc = true
|
||||
max_length = 0
|
||||
batch_size = 25
|
||||
seed = 0
|
||||
accumulate_gradient = 2
|
||||
discard_oversize = false
|
||||
|
||||
[training.optimizer]
|
||||
@optimizers = "Adam.v1"
|
||||
learn_rate = 0.001
|
||||
beta1 = 0.9
|
||||
beta2 = 0.999
|
||||
|
||||
[nlp]
|
||||
lang = "en"
|
||||
vectors = null
|
||||
|
||||
[nlp.pipeline.tok2vec]
|
||||
factory = "tok2vec"
|
||||
|
||||
[nlp.pipeline.tok2vec.model]
|
||||
@architectures = "spacy.Tok2Vec.v1"
|
||||
|
||||
[nlp.pipeline.tok2vec.model.extract]
|
||||
@architectures = "spacy.CharacterEmbed.v1"
|
||||
width = 96
|
||||
nM = 64
|
||||
nC = 8
|
||||
rows = 2000
|
||||
columns = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
|
||||
dropout = null
|
||||
|
||||
[nlp.pipeline.tok2vec.model.extract.features]
|
||||
@architectures = "spacy.Doc2Feats.v1"
|
||||
columns = ${nlp.pipeline.tok2vec.model.extract:columns}
|
||||
|
||||
[nlp.pipeline.tok2vec.model.embed]
|
||||
@architectures = "spacy.LayerNormalizedMaxout.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model.extract:width}
|
||||
maxout_pieces = 4
|
||||
|
||||
[nlp.pipeline.tok2vec.model.encode]
|
||||
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model.extract:width}
|
||||
window_size = 1
|
||||
maxout_pieces = 2
|
||||
depth = 2
|
||||
|
||||
[nlp.pipeline.ner]
|
||||
factory = "ner"
|
||||
|
||||
[nlp.pipeline.ner.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 6
|
||||
hidden_width = 64
|
||||
maxout_pieces = 2
|
||||
|
||||
[nlp.pipeline.ner.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model.extract:width}
|
48
examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg
Normal file
48
examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg
Normal file
|
@ -0,0 +1,48 @@
|
|||
[training]
|
||||
use_gpu = -1
|
||||
limit = 0
|
||||
dropout = 0.2
|
||||
patience = 10000
|
||||
eval_frequency = 200
|
||||
scores = ["ents_p", "ents_r", "ents_f"]
|
||||
score_weights = {"ents_f": 1}
|
||||
orth_variant_level = 0.0
|
||||
gold_preproc = true
|
||||
max_length = 0
|
||||
seed = 0
|
||||
accumulate_gradient = 2
|
||||
discard_oversize = false
|
||||
|
||||
[training.batch_size]
|
||||
@schedules = "compounding.v1"
|
||||
start = 3000
|
||||
stop = 3000
|
||||
compound = 1.001
|
||||
|
||||
|
||||
[training.optimizer]
|
||||
@optimizers = "Adam.v1"
|
||||
learn_rate = 0.001
|
||||
beta1 = 0.9
|
||||
beta2 = 0.999
|
||||
|
||||
[nlp]
|
||||
lang = "en"
|
||||
vectors = null
|
||||
|
||||
[nlp.pipeline.ner]
|
||||
factory = "simple_ner"
|
||||
|
||||
[nlp.pipeline.ner.model]
|
||||
@architectures = "spacy.BiluoTagger.v1"
|
||||
|
||||
[nlp.pipeline.ner.model.tok2vec]
|
||||
@architectures = "spacy.HashEmbedCNN.v1"
|
||||
width = 128
|
||||
depth = 4
|
||||
embed_size = 7000
|
||||
maxout_pieces = 3
|
||||
window_size = 1
|
||||
subword_features = true
|
||||
pretrained_vectors = null
|
||||
dropout = null
|
|
@ -13,9 +13,10 @@ Prerequisites: pip install joblib
|
|||
from __future__ import print_function, unicode_literals
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import ml_datasets
|
||||
from joblib import Parallel, delayed
|
||||
from functools import partial
|
||||
import thinc.extra.datasets
|
||||
import plac
|
||||
import spacy
|
||||
from spacy.util import minibatch
|
||||
|
@ -35,7 +36,7 @@ def main(output_dir, model="en_core_web_sm", n_jobs=4, batch_size=1000, limit=10
|
|||
output_dir.mkdir()
|
||||
# load and pre-process the IMBD dataset
|
||||
print("Loading IMDB data...")
|
||||
data, _ = thinc.extra.datasets.imdb()
|
||||
data, _ = ml_datasets.imdb()
|
||||
texts, _ = zip(*data[-limit:])
|
||||
print("Processing texts...")
|
||||
partitions = minibatch(texts, size=batch_size)
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
# coding: utf-8
|
||||
"""
|
||||
Example of a Streamlit app for an interactive spaCy model visualizer. You can
|
||||
either download the script, or point streamlit run to the raw URL of this
|
||||
either download the script, or point `streamlit run` to the raw URL of this
|
||||
file. For more details, see https://streamlit.io.
|
||||
|
||||
Installation:
|
||||
|
@ -15,6 +15,8 @@ streamlit run streamlit_spacy.py
|
|||
"""
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import base64
|
||||
|
||||
import streamlit as st
|
||||
import spacy
|
||||
from spacy import displacy
|
||||
|
@ -54,6 +56,14 @@ model_load_state.empty()
|
|||
text = st.text_area("Text to analyze", DEFAULT_TEXT)
|
||||
doc = process_text(spacy_model, text)
|
||||
|
||||
|
||||
def render_svg(svg):
|
||||
"""Renders the given svg string."""
|
||||
b64 = base64.b64encode(svg.encode('utf-8')).decode("utf-8")
|
||||
html = r'<img src="data:image/svg+xml;base64,%s"/>' % b64
|
||||
st.write(html, unsafe_allow_html=True)
|
||||
|
||||
|
||||
if "parser" in nlp.pipe_names:
|
||||
st.header("Dependency Parse & Part-of-speech tags")
|
||||
st.sidebar.header("Dependency Parse")
|
||||
|
@ -68,12 +78,14 @@ if "parser" in nlp.pipe_names:
|
|||
}
|
||||
docs = [span.as_doc() for span in doc.sents] if split_sents else [doc]
|
||||
for sent in docs:
|
||||
html = displacy.render(sent, options=options)
|
||||
html = displacy.render(sent, options=options, style="dep")
|
||||
# Double newlines seem to mess with the rendering
|
||||
html = html.replace("\n\n", "\n")
|
||||
if split_sents and len(docs) > 1:
|
||||
st.markdown(f"> {sent.text}")
|
||||
st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
|
||||
render_svg(html)
|
||||
# this didn't show the dep arc labels properly, cf #5089
|
||||
# st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
|
||||
|
||||
if "ner" in nlp.pipe_names:
|
||||
st.header("Named Entities")
|
||||
|
|
|
@ -12,7 +12,7 @@ import tqdm
|
|||
import spacy
|
||||
import spacy.util
|
||||
from spacy.tokens import Token, Doc
|
||||
from spacy.gold import GoldParse
|
||||
from spacy.gold import Example
|
||||
from spacy.syntax.nonproj import projectivize
|
||||
from collections import defaultdict
|
||||
from spacy.matcher import Matcher
|
||||
|
@ -33,31 +33,6 @@ random.seed(0)
|
|||
numpy.random.seed(0)
|
||||
|
||||
|
||||
def minibatch_by_words(items, size=5000):
|
||||
random.shuffle(items)
|
||||
if isinstance(size, int):
|
||||
size_ = itertools.repeat(size)
|
||||
else:
|
||||
size_ = size
|
||||
items = iter(items)
|
||||
while True:
|
||||
batch_size = next(size_)
|
||||
batch = []
|
||||
while batch_size >= 0:
|
||||
try:
|
||||
doc, gold = next(items)
|
||||
except StopIteration:
|
||||
if batch:
|
||||
yield batch
|
||||
return
|
||||
batch_size -= len(doc)
|
||||
batch.append((doc, gold))
|
||||
if batch:
|
||||
yield batch
|
||||
else:
|
||||
break
|
||||
|
||||
|
||||
################
|
||||
# Data reading #
|
||||
################
|
||||
|
@ -78,7 +53,7 @@ def read_data(
|
|||
max_doc_length=None,
|
||||
limit=None,
|
||||
):
|
||||
"""Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
|
||||
"""Read the CONLLU format into Example objects. If raw_text=True,
|
||||
include Doc objects created using nlp.make_doc and then aligned against
|
||||
the gold-standard sequences. If oracle_segments=True, include Doc objects
|
||||
created from the gold-standard segments. At least one must be True."""
|
||||
|
@ -110,7 +85,7 @@ def read_data(
|
|||
sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"])
|
||||
if oracle_segments:
|
||||
docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
|
||||
golds.append(GoldParse(docs[-1], **sent))
|
||||
golds.append(sent)
|
||||
|
||||
sent_annots.append(sent)
|
||||
if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:
|
||||
|
@ -119,15 +94,15 @@ def read_data(
|
|||
docs.append(doc)
|
||||
golds.append(gold)
|
||||
if limit and len(docs) >= limit:
|
||||
return docs, golds
|
||||
return golds_to_gold_data(docs, golds)
|
||||
|
||||
if raw_text and sent_annots:
|
||||
doc, gold = _make_gold(nlp, None, sent_annots)
|
||||
docs.append(doc)
|
||||
golds.append(gold)
|
||||
if limit and len(docs) >= limit:
|
||||
return docs, golds
|
||||
return docs, golds
|
||||
return golds_to_gold_data(docs, golds)
|
||||
return golds_to_gold_data(docs, golds)
|
||||
|
||||
|
||||
def read_conllu(file_):
|
||||
|
@ -159,20 +134,19 @@ def read_conllu(file_):
|
|||
|
||||
def _make_gold(nlp, text, sent_annots):
|
||||
# Flatten the conll annotations, and adjust the head indices
|
||||
flat = defaultdict(list)
|
||||
gold = defaultdict(list)
|
||||
for sent in sent_annots:
|
||||
flat["heads"].extend(len(flat["words"]) + head for head in sent["heads"])
|
||||
gold["heads"].extend(len(gold["words"]) + head for head in sent["heads"])
|
||||
for field in ["words", "tags", "deps", "entities", "spaces"]:
|
||||
flat[field].extend(sent[field])
|
||||
gold[field].extend(sent[field])
|
||||
# Construct text if necessary
|
||||
assert len(flat["words"]) == len(flat["spaces"])
|
||||
assert len(gold["words"]) == len(gold["spaces"])
|
||||
if text is None:
|
||||
text = "".join(
|
||||
word + " " * space for word, space in zip(flat["words"], flat["spaces"])
|
||||
word + " " * space for word, space in zip(gold["words"], gold["spaces"])
|
||||
)
|
||||
doc = nlp.make_doc(text)
|
||||
flat.pop("spaces")
|
||||
gold = GoldParse(doc, **flat)
|
||||
gold.pop("spaces")
|
||||
return doc, gold
|
||||
|
||||
|
||||
|
@ -181,16 +155,13 @@ def _make_gold(nlp, text, sent_annots):
|
|||
#############################
|
||||
|
||||
|
||||
def golds_to_gold_tuples(docs, golds):
|
||||
"""Get out the annoying 'tuples' format used by begin_training, given the
|
||||
GoldParse objects."""
|
||||
tuples = []
|
||||
def golds_to_gold_data(docs, golds):
|
||||
"""Get out the training data format used by begin_training."""
|
||||
data = []
|
||||
for doc, gold in zip(docs, golds):
|
||||
text = doc.text
|
||||
ids, words, tags, heads, labels, iob = zip(*gold.orig_annot)
|
||||
sents = [((ids, words, tags, heads, labels, iob), [])]
|
||||
tuples.append((text, sents))
|
||||
return tuples
|
||||
example = Example.from_dict(doc, gold)
|
||||
data.append(example)
|
||||
return data
|
||||
|
||||
|
||||
##############
|
||||
|
@ -303,7 +274,7 @@ def load_nlp(corpus, config):
|
|||
return nlp
|
||||
|
||||
|
||||
def initialize_pipeline(nlp, docs, golds, config):
|
||||
def initialize_pipeline(nlp, examples, config):
|
||||
nlp.add_pipe(nlp.create_pipe("parser"))
|
||||
if config.multitask_tag:
|
||||
nlp.parser.add_multitask_objective("tag")
|
||||
|
@ -311,18 +282,19 @@ def initialize_pipeline(nlp, docs, golds, config):
|
|||
nlp.parser.add_multitask_objective("sent_start")
|
||||
nlp.parser.moves.add_action(2, "subtok")
|
||||
nlp.add_pipe(nlp.create_pipe("tagger"))
|
||||
for gold in golds:
|
||||
for tag in gold.tags:
|
||||
for eg in examples:
|
||||
for tag in eg.get_aligned("TAG", as_string=True):
|
||||
if tag is not None:
|
||||
nlp.tagger.add_label(tag)
|
||||
# Replace labels that didn't make the frequency cutoff
|
||||
actions = set(nlp.parser.labels)
|
||||
label_set = set([act.split("-")[1] for act in actions if "-" in act])
|
||||
for gold in golds:
|
||||
for eg in examples:
|
||||
gold = eg.gold
|
||||
for i, label in enumerate(gold.labels):
|
||||
if label is not None and label not in label_set:
|
||||
gold.labels[i] = label.split("||")[0]
|
||||
return nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds))
|
||||
return nlp.begin_training(lambda: examples)
|
||||
|
||||
|
||||
########################
|
||||
|
@ -391,13 +363,17 @@ def main(ud_dir, parses_dir, config, corpus, limit=0):
|
|||
Token.set_extension("begins_fused", default=False)
|
||||
Token.set_extension("inside_fused", default=False)
|
||||
|
||||
Token.set_extension("get_conllu_lines", method=get_token_conllu)
|
||||
Token.set_extension("begins_fused", default=False)
|
||||
Token.set_extension("inside_fused", default=False)
|
||||
|
||||
paths = TreebankPaths(ud_dir, corpus)
|
||||
if not (parses_dir / corpus).exists():
|
||||
(parses_dir / corpus).mkdir()
|
||||
print("Train and evaluate", corpus, "using lang", paths.lang)
|
||||
nlp = load_nlp(paths.lang, config)
|
||||
|
||||
docs, golds = read_data(
|
||||
examples = read_data(
|
||||
nlp,
|
||||
paths.train.conllu.open(encoding="utf8"),
|
||||
paths.train.text.open(encoding="utf8"),
|
||||
|
@ -405,23 +381,17 @@ def main(ud_dir, parses_dir, config, corpus, limit=0):
|
|||
limit=limit,
|
||||
)
|
||||
|
||||
optimizer = initialize_pipeline(nlp, docs, golds, config)
|
||||
optimizer = initialize_pipeline(nlp, examples, config)
|
||||
|
||||
for i in range(config.nr_epoch):
|
||||
docs = [nlp.make_doc(doc.text) for doc in docs]
|
||||
batches = minibatch_by_words(list(zip(docs, golds)), size=config.batch_size)
|
||||
batches = spacy.minibatch_by_words(examples, size=config.batch_size)
|
||||
losses = {}
|
||||
n_train_words = sum(len(doc) for doc in docs)
|
||||
n_train_words = sum(len(eg.reference.doc) for eg in examples)
|
||||
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
|
||||
for batch in batches:
|
||||
batch_docs, batch_gold = zip(*batch)
|
||||
pbar.update(sum(len(doc) for doc in batch_docs))
|
||||
pbar.update(sum(len(eg.reference.doc) for eg in batch))
|
||||
nlp.update(
|
||||
batch_docs,
|
||||
batch_gold,
|
||||
sgd=optimizer,
|
||||
drop=config.dropout,
|
||||
losses=losses,
|
||||
examples=batch, sgd=optimizer, drop=config.dropout, losses=losses,
|
||||
)
|
||||
|
||||
out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i)
|
||||
|
|
|
@ -30,7 +30,7 @@ ENTITIES = {"Q2146908": ("American golfer", 342), "Q7381115": ("publisher", 17)}
|
|||
model=("Model name, should have pretrained word embeddings", "positional", None, str),
|
||||
output_dir=("Optional output directory", "option", "o", Path),
|
||||
)
|
||||
def main(model=None, output_dir=None):
|
||||
def main(model, output_dir=None):
|
||||
"""Load the model and create the KB with pre-defined entity encodings.
|
||||
If an output_dir is provided, the KB will be stored there in a file 'kb'.
|
||||
The updated vocab will also be written to a directory in the output_dir."""
|
||||
|
|
|
@ -24,21 +24,22 @@ import random
|
|||
import plac
|
||||
import spacy
|
||||
import os.path
|
||||
|
||||
from spacy.gold.example import Example
|
||||
from spacy.tokens import Doc
|
||||
from spacy.gold import read_json_file, GoldParse
|
||||
from spacy.gold import read_json_file
|
||||
|
||||
random.seed(0)
|
||||
|
||||
PWD = os.path.dirname(__file__)
|
||||
|
||||
TRAIN_DATA = list(read_json_file(
|
||||
os.path.join(PWD, "ner_example_data", "ner-sent-per-line.json")))
|
||||
TRAIN_DATA = list(read_json_file(os.path.join(PWD, "training-data.json")))
|
||||
|
||||
|
||||
def get_position_label(i, words, tags, heads, labels, ents):
|
||||
def get_position_label(i, token_annotation):
|
||||
"""Return labels indicating the position of the word in the document.
|
||||
"""
|
||||
if len(words) < 20:
|
||||
if len(token_annotation.words) < 20:
|
||||
return "short-doc"
|
||||
elif i == 0:
|
||||
return "first-word"
|
||||
|
@ -46,7 +47,7 @@ def get_position_label(i, words, tags, heads, labels, ents):
|
|||
return "early-word"
|
||||
elif i < 20:
|
||||
return "mid-word"
|
||||
elif i == len(words) - 1:
|
||||
elif i == len(token_annotation.words) - 1:
|
||||
return "last-word"
|
||||
else:
|
||||
return "late-word"
|
||||
|
@ -60,27 +61,25 @@ def main(n_iter=10):
|
|||
print(nlp.pipeline)
|
||||
|
||||
print("Create data", len(TRAIN_DATA))
|
||||
optimizer = nlp.begin_training(get_gold_tuples=lambda: TRAIN_DATA)
|
||||
optimizer = nlp.begin_training()
|
||||
for itn in range(n_iter):
|
||||
random.shuffle(TRAIN_DATA)
|
||||
losses = {}
|
||||
for text, annot_brackets in TRAIN_DATA:
|
||||
for annotations, _ in annot_brackets:
|
||||
doc = Doc(nlp.vocab, words=annotations[1])
|
||||
gold = GoldParse.from_annot_tuples(doc, annotations)
|
||||
nlp.update(
|
||||
[doc], # batch of texts
|
||||
[gold], # batch of annotations
|
||||
drop=0.2, # dropout - make it harder to memorise data
|
||||
sgd=optimizer, # callable to update weights
|
||||
losses=losses,
|
||||
)
|
||||
for example_dict in TRAIN_DATA:
|
||||
doc = Doc(nlp.vocab, words=example_dict["words"])
|
||||
example = Example.from_dict(doc, example_dict)
|
||||
nlp.update(
|
||||
examples=[example], # 1 example
|
||||
drop=0.2, # dropout - make it harder to memorise data
|
||||
sgd=optimizer, # callable to update weights
|
||||
losses=losses,
|
||||
)
|
||||
print(losses.get("nn_labeller", 0.0), losses["ner"])
|
||||
|
||||
# test the trained model
|
||||
for text, _ in TRAIN_DATA:
|
||||
if text is not None:
|
||||
doc = nlp(text)
|
||||
for example_dict in TRAIN_DATA:
|
||||
if "text" in example_dict:
|
||||
doc = nlp(example_dict["text"])
|
||||
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
|
||||
print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
|
||||
|
||||
|
|
|
@ -1,217 +0,0 @@
|
|||
"""This script is experimental.
|
||||
|
||||
Try pre-training the CNN component of the text categorizer using a cheap
|
||||
language modelling-like objective. Specifically, we load pretrained vectors
|
||||
(from something like word2vec, GloVe, FastText etc), and use the CNN to
|
||||
predict the tokens' pretrained vectors. This isn't as easy as it sounds:
|
||||
we're not merely doing compression here, because heavy dropout is applied,
|
||||
including over the input words. This means the model must often (50% of the time)
|
||||
use the context in order to predict the word.
|
||||
|
||||
To evaluate the technique, we're pre-training with the 50k texts from the IMDB
|
||||
corpus, and then training with only 100 labels. Note that it's a bit dirty to
|
||||
pre-train with the development data, but also not *so* terrible: we're not using
|
||||
the development labels, after all --- only the unlabelled text.
|
||||
"""
|
||||
import plac
|
||||
import tqdm
|
||||
import random
|
||||
import spacy
|
||||
import thinc.extra.datasets
|
||||
from spacy.util import minibatch, use_gpu, compounding
|
||||
from spacy._ml import Tok2Vec
|
||||
from spacy.pipeline import TextCategorizer
|
||||
import numpy
|
||||
|
||||
|
||||
def load_texts(limit=0):
|
||||
train, dev = thinc.extra.datasets.imdb()
|
||||
train_texts, train_labels = zip(*train)
|
||||
dev_texts, dev_labels = zip(*train)
|
||||
train_texts = list(train_texts)
|
||||
dev_texts = list(dev_texts)
|
||||
random.shuffle(train_texts)
|
||||
random.shuffle(dev_texts)
|
||||
if limit >= 1:
|
||||
return train_texts[:limit]
|
||||
else:
|
||||
return list(train_texts) + list(dev_texts)
|
||||
|
||||
|
||||
def load_textcat_data(limit=0):
|
||||
"""Load data from the IMDB dataset."""
|
||||
# Partition off part of the train data for evaluation
|
||||
train_data, eval_data = thinc.extra.datasets.imdb()
|
||||
random.shuffle(train_data)
|
||||
train_data = train_data[-limit:]
|
||||
texts, labels = zip(*train_data)
|
||||
eval_texts, eval_labels = zip(*eval_data)
|
||||
cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
|
||||
eval_cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in eval_labels]
|
||||
return (texts, cats), (eval_texts, eval_cats)
|
||||
|
||||
|
||||
def prefer_gpu():
|
||||
used = spacy.util.use_gpu(0)
|
||||
if used is None:
|
||||
return False
|
||||
else:
|
||||
import cupy.random
|
||||
|
||||
cupy.random.seed(0)
|
||||
return True
|
||||
|
||||
|
||||
def build_textcat_model(tok2vec, nr_class, width):
|
||||
from thinc.v2v import Model, Softmax, Maxout
|
||||
from thinc.api import flatten_add_lengths, chain
|
||||
from thinc.t2v import Pooling, sum_pool, mean_pool, max_pool
|
||||
from thinc.misc import Residual, LayerNorm
|
||||
from spacy._ml import logistic, zero_init
|
||||
|
||||
with Model.define_operators({">>": chain}):
|
||||
model = (
|
||||
tok2vec
|
||||
>> flatten_add_lengths
|
||||
>> Pooling(mean_pool)
|
||||
>> Softmax(nr_class, width)
|
||||
)
|
||||
model.tok2vec = tok2vec
|
||||
return model
|
||||
|
||||
|
||||
def block_gradients(model):
|
||||
from thinc.api import wrap
|
||||
|
||||
def forward(X, drop=0.0):
|
||||
Y, _ = model.begin_update(X, drop=drop)
|
||||
return Y, None
|
||||
|
||||
return wrap(forward, model)
|
||||
|
||||
|
||||
def create_pipeline(width, embed_size, vectors_model):
|
||||
print("Load vectors")
|
||||
nlp = spacy.load(vectors_model)
|
||||
print("Start training")
|
||||
textcat = TextCategorizer(
|
||||
nlp.vocab,
|
||||
labels=["POSITIVE", "NEGATIVE"],
|
||||
model=build_textcat_model(
|
||||
Tok2Vec(width=width, embed_size=embed_size), 2, width
|
||||
),
|
||||
)
|
||||
|
||||
nlp.add_pipe(textcat)
|
||||
return nlp
|
||||
|
||||
|
||||
def train_tensorizer(nlp, texts, dropout, n_iter):
|
||||
tensorizer = nlp.create_pipe("tensorizer")
|
||||
nlp.add_pipe(tensorizer)
|
||||
optimizer = nlp.begin_training()
|
||||
for i in range(n_iter):
|
||||
losses = {}
|
||||
for i, batch in enumerate(minibatch(tqdm.tqdm(texts))):
|
||||
docs = [nlp.make_doc(text) for text in batch]
|
||||
tensorizer.update(docs, None, losses=losses, sgd=optimizer, drop=dropout)
|
||||
print(losses)
|
||||
return optimizer
|
||||
|
||||
|
||||
def train_textcat(nlp, n_texts, n_iter=10):
|
||||
textcat = nlp.get_pipe("textcat")
|
||||
tok2vec_weights = textcat.model.tok2vec.to_bytes()
|
||||
(train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts)
|
||||
print(
|
||||
"Using {} examples ({} training, {} evaluation)".format(
|
||||
n_texts, len(train_texts), len(dev_texts)
|
||||
)
|
||||
)
|
||||
train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))
|
||||
|
||||
# get names of other pipes to disable them during training
|
||||
pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
|
||||
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
|
||||
with nlp.disable_pipes(*other_pipes): # only train textcat
|
||||
optimizer = nlp.begin_training()
|
||||
textcat.model.tok2vec.from_bytes(tok2vec_weights)
|
||||
print("Training the model...")
|
||||
print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
|
||||
for i in range(n_iter):
|
||||
losses = {"textcat": 0.0}
|
||||
# batch up the examples using spaCy's minibatch
|
||||
batches = minibatch(tqdm.tqdm(train_data), size=2)
|
||||
for batch in batches:
|
||||
texts, annotations = zip(*batch)
|
||||
nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
|
||||
with textcat.model.use_params(optimizer.averages):
|
||||
# evaluate on the dev data split off in load_data()
|
||||
scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats)
|
||||
print(
|
||||
"{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format( # print a simple table
|
||||
losses["textcat"],
|
||||
scores["textcat_p"],
|
||||
scores["textcat_r"],
|
||||
scores["textcat_f"],
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def evaluate_textcat(tokenizer, textcat, texts, cats):
|
||||
docs = (tokenizer(text) for text in texts)
|
||||
tp = 1e-8
|
||||
fp = 1e-8
|
||||
tn = 1e-8
|
||||
fn = 1e-8
|
||||
for i, doc in enumerate(textcat.pipe(docs)):
|
||||
gold = cats[i]
|
||||
for label, score in doc.cats.items():
|
||||
if label not in gold:
|
||||
continue
|
||||
if score >= 0.5 and gold[label] >= 0.5:
|
||||
tp += 1.0
|
||||
elif score >= 0.5 and gold[label] < 0.5:
|
||||
fp += 1.0
|
||||
elif score < 0.5 and gold[label] < 0.5:
|
||||
tn += 1
|
||||
elif score < 0.5 and gold[label] >= 0.5:
|
||||
fn += 1
|
||||
precision = tp / (tp + fp)
|
||||
recall = tp / (tp + fn)
|
||||
f_score = 2 * (precision * recall) / (precision + recall)
|
||||
return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
width=("Width of CNN layers", "positional", None, int),
|
||||
embed_size=("Embedding rows", "positional", None, int),
|
||||
pretrain_iters=("Number of iterations to pretrain", "option", "pn", int),
|
||||
train_iters=("Number of iterations to train", "option", "tn", int),
|
||||
train_examples=("Number of labelled examples", "option", "eg", int),
|
||||
vectors_model=("Name or path to vectors model to learn from"),
|
||||
)
|
||||
def main(
|
||||
width,
|
||||
embed_size,
|
||||
vectors_model,
|
||||
pretrain_iters=30,
|
||||
train_iters=30,
|
||||
train_examples=1000,
|
||||
):
|
||||
random.seed(0)
|
||||
numpy.random.seed(0)
|
||||
use_gpu = prefer_gpu()
|
||||
print("Using GPU?", use_gpu)
|
||||
|
||||
nlp = create_pipeline(width, embed_size, vectors_model)
|
||||
print("Load data")
|
||||
texts = load_texts(limit=0)
|
||||
print("Train tensorizer")
|
||||
optimizer = train_tensorizer(nlp, texts, dropout=0.2, n_iter=pretrain_iters)
|
||||
print("Train textcat")
|
||||
train_textcat(nlp, train_examples, n_iter=train_iters)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
plac.call(main)
|
|
@ -4,9 +4,10 @@ import random
|
|||
import warnings
|
||||
import srsly
|
||||
import spacy
|
||||
from spacy.gold import GoldParse
|
||||
from spacy.gold import Example
|
||||
from spacy.util import minibatch, compounding
|
||||
|
||||
# TODO: further fix & test this script for v.3 ? (read_gold_data is never called)
|
||||
|
||||
LABEL = "ANIMAL"
|
||||
TRAIN_DATA = [
|
||||
|
@ -32,19 +33,17 @@ def read_raw_data(nlp, jsonl_loc):
|
|||
for json_obj in srsly.read_jsonl(jsonl_loc):
|
||||
if json_obj["text"].strip():
|
||||
doc = nlp.make_doc(json_obj["text"])
|
||||
yield doc
|
||||
yield Example.from_dict(doc, {})
|
||||
|
||||
|
||||
def read_gold_data(nlp, gold_loc):
|
||||
docs = []
|
||||
golds = []
|
||||
examples = []
|
||||
for json_obj in srsly.read_jsonl(gold_loc):
|
||||
doc = nlp.make_doc(json_obj["text"])
|
||||
ents = [(ent["start"], ent["end"], ent["label"]) for ent in json_obj["spans"]]
|
||||
gold = GoldParse(doc, entities=ents)
|
||||
docs.append(doc)
|
||||
golds.append(gold)
|
||||
return list(zip(docs, golds))
|
||||
example = Example.from_dict(doc, {"entities": ents})
|
||||
examples.append(example)
|
||||
return examples
|
||||
|
||||
|
||||
def main(model_name, unlabelled_loc):
|
||||
|
@ -53,34 +52,34 @@ def main(model_name, unlabelled_loc):
|
|||
batch_size = 4
|
||||
nlp = spacy.load(model_name)
|
||||
nlp.get_pipe("ner").add_label(LABEL)
|
||||
raw_docs = list(read_raw_data(nlp, unlabelled_loc))
|
||||
raw_examples = list(read_raw_data(nlp, unlabelled_loc))
|
||||
optimizer = nlp.resume_training()
|
||||
# Avoid use of Adam when resuming training. I don't understand this well
|
||||
# yet, but I'm getting weird results from Adam. Try commenting out the
|
||||
# nlp.update(), and using Adam -- you'll find the models drift apart.
|
||||
# I guess Adam is losing precision, introducing gradient noise?
|
||||
optimizer.alpha = 0.1
|
||||
optimizer.learn_rate = 0.1
|
||||
optimizer.b1 = 0.0
|
||||
optimizer.b2 = 0.0
|
||||
|
||||
# get names of other pipes to disable them during training
|
||||
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
|
||||
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
|
||||
sizes = compounding(1.0, 4.0, 1.001)
|
||||
with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
|
||||
|
||||
train_examples = []
|
||||
for text, annotations in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||
|
||||
with nlp.select_pipes(enable="ner") and warnings.catch_warnings():
|
||||
# show warnings for misaligned entity spans once
|
||||
warnings.filterwarnings("once", category=UserWarning, module='spacy')
|
||||
warnings.filterwarnings("once", category=UserWarning, module="spacy")
|
||||
|
||||
for itn in range(n_iter):
|
||||
random.shuffle(TRAIN_DATA)
|
||||
random.shuffle(raw_docs)
|
||||
random.shuffle(train_examples)
|
||||
random.shuffle(raw_examples)
|
||||
losses = {}
|
||||
r_losses = {}
|
||||
# batch up the examples using spaCy's minibatch
|
||||
raw_batches = minibatch(raw_docs, size=4)
|
||||
for batch in minibatch(TRAIN_DATA, size=sizes):
|
||||
docs, golds = zip(*batch)
|
||||
nlp.update(docs, golds, sgd=optimizer, drop=dropout, losses=losses)
|
||||
raw_batches = minibatch(raw_examples, size=4)
|
||||
for batch in minibatch(train_examples, size=sizes):
|
||||
nlp.update(batch, sgd=optimizer, drop=dropout, losses=losses)
|
||||
raw_batch = list(next(raw_batches))
|
||||
nlp.rehearse(raw_batch, sgd=optimizer, losses=r_losses)
|
||||
print("Losses", losses)
|
||||
|
|
|
@ -5,16 +5,17 @@ from spacy.gold import docs_to_json
|
|||
import srsly
|
||||
import sys
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
model=("Model name. Defaults to 'en'.", "option", "m", str),
|
||||
input_file=("Input file (jsonl)", "positional", None, Path),
|
||||
output_dir=("Output directory", "positional", None, Path),
|
||||
n_texts=("Number of texts to convert", "option", "t", int),
|
||||
)
|
||||
def convert(model='en', input_file=None, output_dir=None, n_texts=0):
|
||||
def convert(model="en", input_file=None, output_dir=None, n_texts=0):
|
||||
# Load model with tokenizer + sentencizer only
|
||||
nlp = spacy.load(model)
|
||||
nlp.disable_pipes(*nlp.pipe_names)
|
||||
nlp.select_pipes(disable=nlp.pipe_names)
|
||||
sentencizer = nlp.create_pipe("sentencizer")
|
||||
nlp.add_pipe(sentencizer, first=True)
|
||||
|
||||
|
@ -49,5 +50,6 @@ def convert(model='en', input_file=None, output_dir=None, n_texts=0):
|
|||
|
||||
srsly.write_json(output_dir / input_file.with_suffix(".json"), [docs_to_json(docs)])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
plac.call(convert)
|
||||
|
|
|
@ -16,11 +16,10 @@ from __future__ import unicode_literals, print_function
|
|||
import plac
|
||||
import random
|
||||
from pathlib import Path
|
||||
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
import spacy
|
||||
from spacy.kb import KnowledgeBase
|
||||
|
||||
from spacy.gold import Example
|
||||
from spacy.pipeline import EntityRuler
|
||||
from spacy.util import minibatch, compounding
|
||||
|
||||
|
@ -70,32 +69,34 @@ def main(kb_path, vocab_path, output_dir=None, n_iter=50):
|
|||
print("Created blank 'en' model with vocab from '%s'" % vocab_path)
|
||||
|
||||
# Add a sentencizer component. Alternatively, add a dependency parser for higher accuracy.
|
||||
nlp.add_pipe(nlp.create_pipe('sentencizer'))
|
||||
nlp.add_pipe(nlp.create_pipe("sentencizer"))
|
||||
|
||||
# Add a custom component to recognize "Russ Cochran" as an entity for the example training data.
|
||||
# Note that in a realistic application, an actual NER algorithm should be used instead.
|
||||
ruler = EntityRuler(nlp)
|
||||
patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}]
|
||||
patterns = [
|
||||
{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}
|
||||
]
|
||||
ruler.add_patterns(patterns)
|
||||
nlp.add_pipe(ruler)
|
||||
|
||||
# Create the Entity Linker component and add it to the pipeline.
|
||||
if "entity_linker" not in nlp.pipe_names:
|
||||
# use only the predicted EL score and not the prior probability (for demo purposes)
|
||||
cfg = {"incl_prior": False}
|
||||
entity_linker = nlp.create_pipe("entity_linker", cfg)
|
||||
kb = KnowledgeBase(vocab=nlp.vocab)
|
||||
kb.load_bulk(kb_path)
|
||||
print("Loaded Knowledge Base from '%s'" % kb_path)
|
||||
entity_linker.set_kb(kb)
|
||||
|
||||
# use only the predicted EL score and not the prior probability (for demo purposes)
|
||||
cfg = {"kb": kb, "incl_prior": False}
|
||||
entity_linker = nlp.create_pipe("entity_linker", cfg)
|
||||
nlp.add_pipe(entity_linker, last=True)
|
||||
|
||||
# Convert the texts to docs to make sure we have doc.ents set for the training examples.
|
||||
# Also ensure that the annotated examples correspond to known identifiers in the knowlege base.
|
||||
# Also ensure that the annotated examples correspond to known identifiers in the knowledge base.
|
||||
kb_ids = nlp.get_pipe("entity_linker").kb.get_entity_strings()
|
||||
TRAIN_DOCS = []
|
||||
train_examples = []
|
||||
for text, annotation in TRAIN_DATA:
|
||||
with nlp.disable_pipes("entity_linker"):
|
||||
with nlp.select_pipes(disable="entity_linker"):
|
||||
doc = nlp(text)
|
||||
annotation_clean = annotation
|
||||
for offset, kb_id_dict in annotation["links"].items():
|
||||
|
@ -108,24 +109,20 @@ def main(kb_path, vocab_path, output_dir=None, n_iter=50):
|
|||
"Removed", kb_id, "from training because it is not in the KB."
|
||||
)
|
||||
annotation_clean["links"][offset] = new_dict
|
||||
TRAIN_DOCS.append((doc, annotation_clean))
|
||||
train_examples.append(Example.from_dict(doc, annotation_clean))
|
||||
|
||||
# get names of other pipes to disable them during training
|
||||
pipe_exceptions = ["entity_linker", "trf_wordpiecer", "trf_tok2vec"]
|
||||
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
|
||||
with nlp.disable_pipes(*other_pipes): # only train entity linker
|
||||
with nlp.select_pipes(enable="entity_linker"): # only train entity linker
|
||||
# reset and initialize the weights randomly
|
||||
optimizer = nlp.begin_training()
|
||||
|
||||
for itn in range(n_iter):
|
||||
random.shuffle(TRAIN_DOCS)
|
||||
random.shuffle(train_examples)
|
||||
losses = {}
|
||||
# batch up the examples using spaCy's minibatch
|
||||
batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))
|
||||
batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
|
||||
for batch in batches:
|
||||
texts, annotations = zip(*batch)
|
||||
nlp.update(
|
||||
texts, # batch of texts
|
||||
annotations, # batch of annotations
|
||||
batch,
|
||||
drop=0.2, # dropout - make it harder to memorise data
|
||||
losses=losses,
|
||||
sgd=optimizer,
|
||||
|
|
|
@ -23,6 +23,7 @@ import plac
|
|||
import random
|
||||
from pathlib import Path
|
||||
import spacy
|
||||
from spacy.gold import Example
|
||||
from spacy.util import minibatch, compounding
|
||||
|
||||
|
||||
|
@ -120,22 +121,21 @@ def main(model=None, output_dir=None, n_iter=15):
|
|||
parser = nlp.create_pipe("parser")
|
||||
nlp.add_pipe(parser, first=True)
|
||||
|
||||
train_examples = []
|
||||
for text, annotations in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||
for dep in annotations.get("deps", []):
|
||||
parser.add_label(dep)
|
||||
|
||||
pipe_exceptions = ["parser", "trf_wordpiecer", "trf_tok2vec"]
|
||||
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
|
||||
with nlp.disable_pipes(*other_pipes): # only train parser
|
||||
with nlp.select_pipes(enable="parser"): # only train parser
|
||||
optimizer = nlp.begin_training()
|
||||
for itn in range(n_iter):
|
||||
random.shuffle(TRAIN_DATA)
|
||||
random.shuffle(train_examples)
|
||||
losses = {}
|
||||
# batch up the examples using spaCy's minibatch
|
||||
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
|
||||
batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
|
||||
for batch in batches:
|
||||
texts, annotations = zip(*batch)
|
||||
nlp.update(texts, annotations, sgd=optimizer, losses=losses)
|
||||
nlp.update(batch, sgd=optimizer, losses=losses)
|
||||
print("Losses", losses)
|
||||
|
||||
# test the trained model
|
||||
|
|
136
examples/training/train_morphologizer.py
Normal file
136
examples/training/train_morphologizer.py
Normal file
|
@ -0,0 +1,136 @@
|
|||
#!/usr/bin/env python
|
||||
# coding: utf8
|
||||
"""
|
||||
A simple example for training a morphologizer. For more details, see
|
||||
the documentation:
|
||||
* Training: https://spacy.io/usage/training
|
||||
|
||||
Compatible with: spaCy v3.0.0+
|
||||
Last tested with: v3.0.0
|
||||
"""
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
import plac
|
||||
import random
|
||||
from pathlib import Path
|
||||
import spacy
|
||||
from spacy.gold import Example
|
||||
from spacy.util import minibatch, compounding
|
||||
from spacy.morphology import Morphology
|
||||
|
||||
|
||||
# Usually you'll read this in, of course. Data formats vary. Ensure your
|
||||
# strings are unicode and that the number of tags assigned matches spaCy's
|
||||
# tokenization. If not, you can always add a 'words' key to the annotations
|
||||
# that specifies the gold-standard tokenization, e.g.:
|
||||
# ("Eatblueham", {'words': ['Eat', 'blue', 'ham'], 'tags': ['V', 'J', 'N']})
|
||||
TRAIN_DATA = [
|
||||
(
|
||||
"I like green eggs",
|
||||
{
|
||||
"morphs": [
|
||||
"PronType=Prs|Person=1",
|
||||
"VerbForm=Fin",
|
||||
"Degree=Pos",
|
||||
"Number=Plur",
|
||||
],
|
||||
"pos": ["PRON", "VERB", "ADJ", "NOUN"],
|
||||
},
|
||||
),
|
||||
(
|
||||
"Eat blue ham",
|
||||
{
|
||||
"morphs": ["VerbForm=Inf", "Degree=Pos", "Number=Sing"],
|
||||
"pos": ["VERB", "ADJ", "NOUN"],
|
||||
},
|
||||
),
|
||||
(
|
||||
"She was blue",
|
||||
{
|
||||
"morphs": ["PronType=Prs|Person=3", "VerbForm=Fin", "Degree=Pos"],
|
||||
"pos": ["PRON", "VERB", "ADJ"],
|
||||
},
|
||||
),
|
||||
(
|
||||
"He was blue today",
|
||||
{
|
||||
"morphs": ["PronType=Prs|Person=3", "VerbForm=Fin", "Degree=Pos", ""],
|
||||
"pos": ["PRON", "VERB", "ADJ", "ADV"],
|
||||
},
|
||||
),
|
||||
]
|
||||
|
||||
# The POS tags are optional, set `with_pos_tags = False` to omit them for
|
||||
# this example:
|
||||
with_pos_tags = True
|
||||
|
||||
if not with_pos_tags:
|
||||
for i in range(len(TRAIN_DATA)):
|
||||
del TRAIN_DATA[i][1]["pos"]
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
lang=("ISO Code of language to use", "option", "l", str),
|
||||
output_dir=("Optional output directory", "option", "o", Path),
|
||||
n_iter=("Number of training iterations", "option", "n", int),
|
||||
)
|
||||
def main(lang="en", output_dir=None, n_iter=25):
|
||||
"""Create a new model, set up the pipeline and train the tagger. In order to
|
||||
train the tagger with a custom tag map, we're creating a new Language
|
||||
instance with a custom vocab.
|
||||
"""
|
||||
nlp = spacy.blank(lang)
|
||||
# add the tagger to the pipeline
|
||||
# nlp.create_pipe works for built-ins that are registered with spaCy
|
||||
morphologizer = nlp.create_pipe("morphologizer")
|
||||
nlp.add_pipe(morphologizer)
|
||||
|
||||
# add labels and create the Example instances
|
||||
train_examples = []
|
||||
for text, annotations in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||
morph_labels = annotations.get("morphs")
|
||||
pos_labels = annotations.get("pos", [""] * len(annotations.get("morphs")))
|
||||
assert len(morph_labels) == len(pos_labels)
|
||||
for morph, pos in zip(morph_labels, pos_labels):
|
||||
morph_dict = Morphology.feats_to_dict(morph)
|
||||
if pos:
|
||||
morph_dict["POS"] = pos
|
||||
morph = Morphology.dict_to_feats(morph_dict)
|
||||
morphologizer.add_label(morph)
|
||||
|
||||
optimizer = nlp.begin_training()
|
||||
for i in range(n_iter):
|
||||
random.shuffle(train_examples)
|
||||
losses = {}
|
||||
# batch up the examples using spaCy's minibatch
|
||||
batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
|
||||
for batch in batches:
|
||||
nlp.update(batch, sgd=optimizer, losses=losses)
|
||||
print("Losses", losses)
|
||||
|
||||
# test the trained model
|
||||
test_text = "I like blue eggs"
|
||||
doc = nlp(test_text)
|
||||
print("Morphs", [(t.text, t.morph) for t in doc])
|
||||
|
||||
# save model to output directory
|
||||
if output_dir is not None:
|
||||
output_dir = Path(output_dir)
|
||||
if not output_dir.exists():
|
||||
output_dir.mkdir()
|
||||
nlp.to_disk(output_dir)
|
||||
print("Saved model to", output_dir)
|
||||
|
||||
# test the save model
|
||||
print("Loading from", output_dir)
|
||||
nlp2 = spacy.load(output_dir)
|
||||
doc = nlp2(test_text)
|
||||
print("Morphs", [(t.text, t.morph) for t in doc])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
plac.call(main)
|
||||
|
||||
# Expected output:
|
||||
# Morphs [('I', POS=PRON|Person=1|PronType=Prs), ('like', POS=VERB|VerbForm=Fin), ('blue', Degree=Pos|POS=ADJ), ('eggs', Number=Plur|POS=NOUN)]
|
|
@ -17,6 +17,7 @@ import random
|
|||
import warnings
|
||||
from pathlib import Path
|
||||
import spacy
|
||||
from spacy.gold import Example
|
||||
from spacy.util import minibatch, compounding
|
||||
|
||||
|
||||
|
@ -43,41 +44,41 @@ def main(model=None, output_dir=None, n_iter=100):
|
|||
|
||||
# create the built-in pipeline components and add them to the pipeline
|
||||
# nlp.create_pipe works for built-ins that are registered with spaCy
|
||||
if "ner" not in nlp.pipe_names:
|
||||
ner = nlp.create_pipe("ner")
|
||||
if "simple_ner" not in nlp.pipe_names:
|
||||
ner = nlp.create_pipe("simple_ner")
|
||||
nlp.add_pipe(ner, last=True)
|
||||
# otherwise, get it so we can add labels
|
||||
else:
|
||||
ner = nlp.get_pipe("ner")
|
||||
ner = nlp.get_pipe("simple_ner")
|
||||
|
||||
# add labels
|
||||
for _, annotations in TRAIN_DATA:
|
||||
# add labels and create Example objects
|
||||
train_examples = []
|
||||
for text, annotations in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||
for ent in annotations.get("entities"):
|
||||
print("Add label", ent[2])
|
||||
ner.add_label(ent[2])
|
||||
|
||||
# get names of other pipes to disable them during training
|
||||
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
|
||||
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
|
||||
# only train NER
|
||||
with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
|
||||
with nlp.select_pipes(enable="simple_ner") and warnings.catch_warnings():
|
||||
# show warnings for misaligned entity spans once
|
||||
warnings.filterwarnings("once", category=UserWarning, module='spacy')
|
||||
warnings.filterwarnings("once", category=UserWarning, module="spacy")
|
||||
|
||||
# reset and initialize the weights randomly – but only if we're
|
||||
# training a new model
|
||||
if model is None:
|
||||
nlp.begin_training()
|
||||
print(
|
||||
"Transitions", list(enumerate(nlp.get_pipe("simple_ner").get_tag_names()))
|
||||
)
|
||||
for itn in range(n_iter):
|
||||
random.shuffle(TRAIN_DATA)
|
||||
random.shuffle(train_examples)
|
||||
losses = {}
|
||||
# batch up the examples using spaCy's minibatch
|
||||
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
|
||||
batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
|
||||
for batch in batches:
|
||||
texts, annotations = zip(*batch)
|
||||
nlp.update(
|
||||
texts, # batch of texts
|
||||
annotations, # batch of annotations
|
||||
drop=0.5, # dropout - make it harder to memorise data
|
||||
batch,
|
||||
drop=0.0, # dropout - make it harder to memorise data
|
||||
losses=losses,
|
||||
)
|
||||
print("Losses", losses)
|
||||
|
|
|
@ -80,6 +80,10 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
|
|||
print("Created blank 'en' model")
|
||||
# Add entity recognizer to model if it's not in the pipeline
|
||||
# nlp.create_pipe works for built-ins that are registered with spaCy
|
||||
train_examples = []
|
||||
for text, annotation in TRAIN_DATA:
|
||||
train_examples.append(TRAIN_DATA.from_dict(nlp(text), annotation))
|
||||
|
||||
if "ner" not in nlp.pipe_names:
|
||||
ner = nlp.create_pipe("ner")
|
||||
nlp.add_pipe(ner)
|
||||
|
@ -95,23 +99,18 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
|
|||
else:
|
||||
optimizer = nlp.resume_training()
|
||||
move_names = list(ner.move_names)
|
||||
# get names of other pipes to disable them during training
|
||||
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
|
||||
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
|
||||
# only train NER
|
||||
with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
|
||||
with nlp.select_pipes(enable="ner") and warnings.catch_warnings():
|
||||
# show warnings for misaligned entity spans once
|
||||
warnings.filterwarnings("once", category=UserWarning, module='spacy')
|
||||
warnings.filterwarnings("once", category=UserWarning, module="spacy")
|
||||
|
||||
sizes = compounding(1.0, 4.0, 1.001)
|
||||
# batch up the examples using spaCy's minibatch
|
||||
for itn in range(n_iter):
|
||||
random.shuffle(TRAIN_DATA)
|
||||
batches = minibatch(TRAIN_DATA, size=sizes)
|
||||
random.shuffle(train_examples)
|
||||
batches = minibatch(train_examples, size=sizes)
|
||||
losses = {}
|
||||
for batch in batches:
|
||||
texts, annotations = zip(*batch)
|
||||
nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
|
||||
nlp.update(batch, sgd=optimizer, drop=0.35, losses=losses)
|
||||
print("Losses", losses)
|
||||
|
||||
# test the trained model
|
||||
|
|
|
@ -14,6 +14,7 @@ import plac
|
|||
import random
|
||||
from pathlib import Path
|
||||
import spacy
|
||||
from spacy.gold import Example
|
||||
from spacy.util import minibatch, compounding
|
||||
|
||||
|
||||
|
@ -59,24 +60,22 @@ def main(model=None, output_dir=None, n_iter=15):
|
|||
else:
|
||||
parser = nlp.get_pipe("parser")
|
||||
|
||||
# add labels to the parser
|
||||
for _, annotations in TRAIN_DATA:
|
||||
# add labels to the parser and create the Example objects
|
||||
train_examples = []
|
||||
for text, annotations in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||
for dep in annotations.get("deps", []):
|
||||
parser.add_label(dep)
|
||||
|
||||
# get names of other pipes to disable them during training
|
||||
pipe_exceptions = ["parser", "trf_wordpiecer", "trf_tok2vec"]
|
||||
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
|
||||
with nlp.disable_pipes(*other_pipes): # only train parser
|
||||
with nlp.select_pipes(enable="parser"): # only train parser
|
||||
optimizer = nlp.begin_training()
|
||||
for itn in range(n_iter):
|
||||
random.shuffle(TRAIN_DATA)
|
||||
random.shuffle(train_examples)
|
||||
losses = {}
|
||||
# batch up the examples using spaCy's minibatch
|
||||
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
|
||||
batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
|
||||
for batch in batches:
|
||||
texts, annotations = zip(*batch)
|
||||
nlp.update(texts, annotations, sgd=optimizer, losses=losses)
|
||||
nlp.update(batch, sgd=optimizer, losses=losses)
|
||||
print("Losses", losses)
|
||||
|
||||
# test the trained model
|
||||
|
|
|
@ -17,6 +17,7 @@ import plac
|
|||
import random
|
||||
from pathlib import Path
|
||||
import spacy
|
||||
from spacy.gold import Example
|
||||
from spacy.util import minibatch, compounding
|
||||
|
||||
|
||||
|
@ -58,15 +59,18 @@ def main(lang="en", output_dir=None, n_iter=25):
|
|||
tagger.add_label(tag, values)
|
||||
nlp.add_pipe(tagger)
|
||||
|
||||
train_examples = []
|
||||
for text, annotations in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||
|
||||
optimizer = nlp.begin_training()
|
||||
for i in range(n_iter):
|
||||
random.shuffle(TRAIN_DATA)
|
||||
random.shuffle(train_examples)
|
||||
losses = {}
|
||||
# batch up the examples using spaCy's minibatch
|
||||
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
|
||||
batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
|
||||
for batch in batches:
|
||||
texts, annotations = zip(*batch)
|
||||
nlp.update(texts, annotations, sgd=optimizer, losses=losses)
|
||||
nlp.update(batch, sgd=optimizer, losses=losses)
|
||||
print("Losses", losses)
|
||||
|
||||
# test the trained model
|
||||
|
|
|
@ -2,89 +2,86 @@
|
|||
# coding: utf8
|
||||
"""Train a convolutional neural network text classifier on the
|
||||
IMDB dataset, using the TextCategorizer component. The dataset will be loaded
|
||||
automatically via Thinc's built-in dataset loader. The model is added to
|
||||
automatically via the package `ml_datasets`. The model is added to
|
||||
spacy.pipeline, and predictions are available via `doc.cats`. For more details,
|
||||
see the documentation:
|
||||
* Training: https://spacy.io/usage/training
|
||||
|
||||
Compatible with: spaCy v2.0.0+
|
||||
Compatible with: spaCy v3.0.0+
|
||||
"""
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
import plac
|
||||
import random
|
||||
from pathlib import Path
|
||||
import thinc.extra.datasets
|
||||
from ml_datasets import loaders
|
||||
|
||||
import spacy
|
||||
from spacy import util
|
||||
from spacy.util import minibatch, compounding
|
||||
from spacy.gold import Example
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
|
||||
config_path=("Path to config file", "positional", None, Path),
|
||||
output_dir=("Optional output directory", "option", "o", Path),
|
||||
n_texts=("Number of texts to train from", "option", "t", int),
|
||||
n_iter=("Number of training iterations", "option", "n", int),
|
||||
init_tok2vec=("Pretrained tok2vec weights", "option", "t2v", Path),
|
||||
dataset=("Dataset to train on (default: imdb)", "option", "d", str),
|
||||
threshold=("Min. number of instances for a given label (default 20)", "option", "m", int)
|
||||
)
|
||||
def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None):
|
||||
def main(config_path, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None, dataset="imdb", threshold=20):
|
||||
if not config_path or not config_path.exists():
|
||||
raise ValueError(f"Config file not found at {config_path}")
|
||||
|
||||
spacy.util.fix_random_seed()
|
||||
if output_dir is not None:
|
||||
output_dir = Path(output_dir)
|
||||
if not output_dir.exists():
|
||||
output_dir.mkdir()
|
||||
|
||||
if model is not None:
|
||||
nlp = spacy.load(model) # load existing spaCy model
|
||||
print("Loaded model '%s'" % model)
|
||||
else:
|
||||
nlp = spacy.blank("en") # create blank Language class
|
||||
print("Created blank 'en' model")
|
||||
print(f"Loading nlp model from {config_path}")
|
||||
nlp_config = util.load_config(config_path, create_objects=False)["nlp"]
|
||||
nlp = util.load_model_from_config(nlp_config)
|
||||
|
||||
# add the text classifier to the pipeline if it doesn't exist
|
||||
# nlp.create_pipe works for built-ins that are registered with spaCy
|
||||
# ensure the nlp object was defined with a textcat component
|
||||
if "textcat" not in nlp.pipe_names:
|
||||
textcat = nlp.create_pipe(
|
||||
"textcat", config={"exclusive_classes": True, "architecture": "simple_cnn"}
|
||||
)
|
||||
nlp.add_pipe(textcat, last=True)
|
||||
# otherwise, get it, so we can add labels to it
|
||||
else:
|
||||
textcat = nlp.get_pipe("textcat")
|
||||
raise ValueError(f"The nlp definition in the config does not contain a textcat component")
|
||||
|
||||
# add label to text classifier
|
||||
textcat.add_label("POSITIVE")
|
||||
textcat.add_label("NEGATIVE")
|
||||
textcat = nlp.get_pipe("textcat")
|
||||
|
||||
# load the IMDB dataset
|
||||
print("Loading IMDB data...")
|
||||
(train_texts, train_cats), (dev_texts, dev_cats) = load_data()
|
||||
train_texts = train_texts[:n_texts]
|
||||
train_cats = train_cats[:n_texts]
|
||||
# load the dataset
|
||||
print(f"Loading dataset {dataset} ...")
|
||||
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(dataset=dataset, threshold=threshold, limit=n_texts)
|
||||
print(
|
||||
"Using {} examples ({} training, {} evaluation)".format(
|
||||
n_texts, len(train_texts), len(dev_texts)
|
||||
)
|
||||
)
|
||||
train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))
|
||||
train_examples = []
|
||||
for text, cats in zip(train_texts, train_cats):
|
||||
doc = nlp.make_doc(text)
|
||||
example = Example.from_dict(doc, {"cats": cats})
|
||||
for cat in cats:
|
||||
textcat.add_label(cat)
|
||||
train_examples.append(example)
|
||||
|
||||
# get names of other pipes to disable them during training
|
||||
pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
|
||||
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
|
||||
with nlp.disable_pipes(*other_pipes): # only train textcat
|
||||
with nlp.select_pipes(enable="textcat"): # only train textcat
|
||||
optimizer = nlp.begin_training()
|
||||
if init_tok2vec is not None:
|
||||
with init_tok2vec.open("rb") as file_:
|
||||
textcat.model.tok2vec.from_bytes(file_.read())
|
||||
textcat.model.get_ref("tok2vec").from_bytes(file_.read())
|
||||
print("Training the model...")
|
||||
print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
|
||||
batch_sizes = compounding(4.0, 32.0, 1.001)
|
||||
for i in range(n_iter):
|
||||
losses = {}
|
||||
# batch up the examples using spaCy's minibatch
|
||||
random.shuffle(train_data)
|
||||
batches = minibatch(train_data, size=batch_sizes)
|
||||
random.shuffle(train_examples)
|
||||
batches = minibatch(train_examples, size=batch_sizes)
|
||||
for batch in batches:
|
||||
texts, annotations = zip(*batch)
|
||||
nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
|
||||
nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)
|
||||
with textcat.model.use_params(optimizer.averages):
|
||||
# evaluate on the dev data split off in load_data()
|
||||
scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
|
||||
|
@ -97,7 +94,7 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None
|
|||
)
|
||||
)
|
||||
|
||||
# test the trained model
|
||||
# test the trained model (only makes sense for sentiment analysis)
|
||||
test_text = "This movie sucked"
|
||||
doc = nlp(test_text)
|
||||
print(test_text, doc.cats)
|
||||
|
@ -114,14 +111,48 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None
|
|||
print(test_text, doc2.cats)
|
||||
|
||||
|
||||
def load_data(limit=0, split=0.8):
|
||||
"""Load data from the IMDB dataset."""
|
||||
def load_data(dataset, threshold, limit=0, split=0.8):
|
||||
"""Load data from the provided dataset."""
|
||||
# Partition off part of the train data for evaluation
|
||||
train_data, _ = thinc.extra.datasets.imdb()
|
||||
data_loader = loaders.get(dataset)
|
||||
train_data, _ = data_loader(limit=int(limit/split))
|
||||
random.shuffle(train_data)
|
||||
train_data = train_data[-limit:]
|
||||
texts, labels = zip(*train_data)
|
||||
cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
|
||||
|
||||
unique_labels = set()
|
||||
for label_set in labels:
|
||||
if isinstance(label_set, int) or isinstance(label_set, str):
|
||||
unique_labels.add(label_set)
|
||||
elif isinstance(label_set, list) or isinstance(label_set, set):
|
||||
unique_labels.update(label_set)
|
||||
unique_labels = sorted(unique_labels)
|
||||
print(f"# of unique_labels: {len(unique_labels)}")
|
||||
|
||||
count_values_train = dict()
|
||||
for text, annot_list in train_data:
|
||||
if isinstance(annot_list, int) or isinstance(annot_list, str):
|
||||
count_values_train[annot_list] = count_values_train.get(annot_list, 0) + 1
|
||||
else:
|
||||
for annot in annot_list:
|
||||
count_values_train[annot] = count_values_train.get(annot, 0) + 1
|
||||
for value, count in sorted(count_values_train.items(), key=lambda item: item[1]):
|
||||
if count < threshold:
|
||||
unique_labels.remove(value)
|
||||
|
||||
print(f"# of unique_labels after filtering with threshold {threshold}: {len(unique_labels)}")
|
||||
|
||||
if unique_labels == {0, 1}:
|
||||
cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
|
||||
else:
|
||||
cats = []
|
||||
for y in labels:
|
||||
if isinstance(y, str) or isinstance(y, int):
|
||||
cats.append({str(label): (label == y) for label in unique_labels})
|
||||
elif isinstance(y, set):
|
||||
cats.append({str(label): (label in y) for label in unique_labels})
|
||||
else:
|
||||
raise ValueError(f"Unrecognised type of labels: {type(y)}")
|
||||
|
||||
split = int(len(train_data) * split)
|
||||
return (texts[:split], cats[:split]), (texts[split:], cats[split:])
|
||||
|
||||
|
|
19
examples/training/train_textcat_config.cfg
Normal file
19
examples/training/train_textcat_config.cfg
Normal file
|
@ -0,0 +1,19 @@
|
|||
[nlp]
|
||||
lang = "en"
|
||||
|
||||
[nlp.pipeline.textcat]
|
||||
factory = "textcat"
|
||||
|
||||
[nlp.pipeline.textcat.model]
|
||||
@architectures = "spacy.TextCatCNN.v1"
|
||||
exclusive_classes = false
|
||||
|
||||
[nlp.pipeline.textcat.model.tok2vec]
|
||||
@architectures = "spacy.HashEmbedCNN.v1"
|
||||
pretrained_vectors = null
|
||||
width = 96
|
||||
depth = 4
|
||||
embed_size = 2000
|
||||
window_size = 1
|
||||
maxout_pieces = 3
|
||||
subword_features = true
|
9
fabfile.py
vendored
9
fabfile.py
vendored
|
@ -1,9 +1,6 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
import contextlib
|
||||
from pathlib import Path
|
||||
from fabric.api import local, lcd, env, settings, prefix
|
||||
from fabric.api import local, lcd
|
||||
from os import path, environ
|
||||
import shutil
|
||||
import sys
|
||||
|
@ -82,9 +79,7 @@ def pex():
|
|||
with virtualenv(VENV_DIR) as venv_local:
|
||||
with lcd(path.dirname(__file__)):
|
||||
sha = local("git rev-parse --short HEAD", capture=True)
|
||||
venv_local(
|
||||
"pex dist/*.whl -e spacy -o dist/spacy-%s.pex" % sha, direct=True
|
||||
)
|
||||
venv_local(f"pex dist/*.whl -e spacy -o dist/spacy-{sha}.pex", direct=True)
|
||||
|
||||
|
||||
def clean():
|
||||
|
|
|
@ -38,6 +38,13 @@ redirects = [
|
|||
{from = "/docs/usage/showcase", to = "/universe", force = true},
|
||||
{from = "/tutorials/load-new-word-vectors", to = "/usage/vectors-similarity#custom", force = true},
|
||||
{from = "/tutorials", to = "/usage/examples", force = true},
|
||||
# Old documentation pages (v2.x)
|
||||
{from = "/usage/adding-languages", to = "/usage/linguistic-features", force = true},
|
||||
{from = "/usage/vectors-similarity", to = "/usage/vectors-embeddings", force = true},
|
||||
{from = "/api/goldparse", to = "/api/top-level", force = true},
|
||||
{from = "/api/goldcorpus", to = "/api/corpus", force = true},
|
||||
{from = "/api/annotation", to = "/api/data-formats", force = true},
|
||||
{from = "/usage/examples", to = "/usage/projects", force = true},
|
||||
# Rewrite all other docs pages to /
|
||||
{from = "/docs/*", to = "/:splat"},
|
||||
# Updated documentation pages
|
||||
|
|
|
@ -6,6 +6,8 @@ requires = [
|
|||
"cymem>=2.0.2,<2.1.0",
|
||||
"preshed>=3.0.2,<3.1.0",
|
||||
"murmurhash>=0.28.0,<1.1.0",
|
||||
"thinc==7.4.1",
|
||||
"thinc>=8.0.0a17,<8.0.0a20",
|
||||
"blis>=0.4.0,<0.5.0",
|
||||
"pytokenizations"
|
||||
]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
|
|
@ -1,20 +1,24 @@
|
|||
# Our libraries
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc==7.4.1
|
||||
thinc>=8.0.0a17,<8.0.0a20
|
||||
blis>=0.4.0,<0.5.0
|
||||
ml_datasets>=0.1.1
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
wasabi>=0.4.0,<1.1.0
|
||||
srsly>=1.0.2,<1.1.0
|
||||
wasabi>=0.7.0,<1.1.0
|
||||
srsly>=2.1.0,<3.0.0
|
||||
catalogue>=0.0.7,<1.1.0
|
||||
typer>=0.3.0,<0.4.0
|
||||
# Third party dependencies
|
||||
numpy>=1.15.0
|
||||
requests>=2.13.0,<3.0.0
|
||||
plac>=0.9.6,<1.2.0
|
||||
pathlib==1.0.1; python_version < "3.4"
|
||||
tqdm>=4.38.0,<5.0.0
|
||||
# Optional dependencies
|
||||
jsonschema>=2.6.0,<3.1.0
|
||||
pydantic>=1.3.0,<2.0.0
|
||||
pytokenizations
|
||||
# Official Python utilities
|
||||
setuptools
|
||||
packaging
|
||||
importlib_metadata>=0.20; python_version < "3.8"
|
||||
# Development dependencies
|
||||
cython>=0.25
|
||||
pytest>=4.6.5
|
||||
|
|
30
setup.cfg
30
setup.cfg
|
@ -16,10 +16,7 @@ classifiers =
|
|||
Operating System :: MacOS :: MacOS X
|
||||
Operating System :: Microsoft :: Windows
|
||||
Programming Language :: Cython
|
||||
Programming Language :: Python :: 2
|
||||
Programming Language :: Python :: 2.7
|
||||
Programming Language :: Python :: 3
|
||||
Programming Language :: Python :: 3.5
|
||||
Programming Language :: Python :: 3.6
|
||||
Programming Language :: Python :: 3.7
|
||||
Programming Language :: Python :: 3.8
|
||||
|
@ -28,34 +25,41 @@ classifiers =
|
|||
[options]
|
||||
zip_safe = false
|
||||
include_package_data = true
|
||||
scripts =
|
||||
bin/spacy
|
||||
python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*
|
||||
python_requires = >=3.6
|
||||
setup_requires =
|
||||
wheel
|
||||
cython>=0.25
|
||||
numpy>=1.15.0
|
||||
# We also need our Cython packages here to compile against
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
thinc==7.4.1
|
||||
thinc>=8.0.0a17,<8.0.0a20
|
||||
install_requires =
|
||||
# Our libraries
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc==7.4.1
|
||||
thinc>=8.0.0a17,<8.0.0a20
|
||||
blis>=0.4.0,<0.5.0
|
||||
wasabi>=0.4.0,<1.1.0
|
||||
srsly>=1.0.2,<1.1.0
|
||||
wasabi>=0.7.0,<1.1.0
|
||||
srsly>=2.1.0,<3.0.0
|
||||
catalogue>=0.0.7,<1.1.0
|
||||
typer>=0.3.0,<0.4.0
|
||||
# Third-party dependencies
|
||||
tqdm>=4.38.0,<5.0.0
|
||||
setuptools
|
||||
numpy>=1.15.0
|
||||
plac>=0.9.6,<1.2.0
|
||||
requests>=2.13.0,<3.0.0
|
||||
pathlib==1.0.1; python_version < "3.4"
|
||||
pydantic>=1.3.0,<2.0.0
|
||||
pytokenizations
|
||||
# Official Python utilities
|
||||
setuptools
|
||||
packaging
|
||||
importlib_metadata>=0.20; python_version < "3.8"
|
||||
|
||||
[options.entry_points]
|
||||
console_scripts =
|
||||
spacy = spacy.cli:app
|
||||
|
||||
[options.extras_require]
|
||||
lookups =
|
||||
|
|
201
setup.py
201
setup.py
|
@ -1,36 +1,30 @@
|
|||
#!/usr/bin/env python
|
||||
from __future__ import print_function
|
||||
import io
|
||||
import os
|
||||
import subprocess
|
||||
from setuptools import Extension, setup, find_packages
|
||||
import sys
|
||||
import contextlib
|
||||
import platform
|
||||
from distutils.command.build_ext import build_ext
|
||||
from distutils.sysconfig import get_python_inc
|
||||
import distutils.util
|
||||
from distutils import ccompiler, msvccompiler
|
||||
from setuptools import Extension, setup, find_packages
|
||||
import numpy
|
||||
from pathlib import Path
|
||||
import shutil
|
||||
from Cython.Build import cythonize
|
||||
from Cython.Compiler import Options
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
|
||||
def is_new_osx():
|
||||
"""Check whether we're on OSX >= 10.10"""
|
||||
name = distutils.util.get_platform()
|
||||
if sys.platform != "darwin":
|
||||
return False
|
||||
elif name.startswith("macosx-10"):
|
||||
minor_version = int(name.split("-")[1].split(".")[1])
|
||||
if minor_version >= 7:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
else:
|
||||
return False
|
||||
ROOT = Path(__file__).parent
|
||||
PACKAGE_ROOT = ROOT / "spacy"
|
||||
|
||||
|
||||
# Preserve `__doc__` on functions and classes
|
||||
# http://docs.cython.org/en/latest/src/userguide/source_files_and_compilation.html#compiler-options
|
||||
Options.docstrings = True
|
||||
|
||||
PACKAGES = find_packages()
|
||||
|
||||
|
||||
MOD_NAMES = [
|
||||
"spacy.gold.example",
|
||||
"spacy.parts_of_speech",
|
||||
"spacy.strings",
|
||||
"spacy.lexeme",
|
||||
|
@ -45,11 +39,10 @@ MOD_NAMES = [
|
|||
"spacy.tokenizer",
|
||||
"spacy.syntax.nn_parser",
|
||||
"spacy.syntax._parser_model",
|
||||
"spacy.syntax._beam_utils",
|
||||
"spacy.syntax.nonproj",
|
||||
"spacy.syntax.transition_system",
|
||||
"spacy.syntax.arc_eager",
|
||||
"spacy.gold",
|
||||
"spacy.gold.gold_io",
|
||||
"spacy.tokens.doc",
|
||||
"spacy.tokens.span",
|
||||
"spacy.tokens.token",
|
||||
|
@ -62,16 +55,37 @@ MOD_NAMES = [
|
|||
"spacy.symbols",
|
||||
"spacy.vectors",
|
||||
]
|
||||
|
||||
|
||||
COMPILE_OPTIONS = {
|
||||
"msvc": ["/Ox", "/EHsc"],
|
||||
"mingw32": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
|
||||
"other": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
|
||||
}
|
||||
|
||||
|
||||
LINK_OPTIONS = {"msvc": [], "mingw32": [], "other": []}
|
||||
COMPILER_DIRECTIVES = {
|
||||
"language_level": -3,
|
||||
"embedsignature": True,
|
||||
"annotation_typing": False,
|
||||
}
|
||||
# Files to copy into the package that are otherwise not included
|
||||
COPY_FILES = {
|
||||
ROOT / "setup.cfg": PACKAGE_ROOT / "tests" / "package",
|
||||
ROOT / "pyproject.toml": PACKAGE_ROOT / "tests" / "package",
|
||||
ROOT / "requirements.txt": PACKAGE_ROOT / "tests" / "package",
|
||||
}
|
||||
|
||||
|
||||
def is_new_osx():
|
||||
"""Check whether we're on OSX >= 10.7"""
|
||||
if sys.platform != "darwin":
|
||||
return False
|
||||
mac_ver = platform.mac_ver()[0]
|
||||
if mac_ver.startswith("10"):
|
||||
minor_version = int(mac_ver.split(".")[1])
|
||||
if minor_version >= 7:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
return False
|
||||
|
||||
|
||||
if is_new_osx():
|
||||
|
@ -104,20 +118,6 @@ class build_ext_subclass(build_ext, build_ext_options):
|
|||
build_ext.build_extensions(self)
|
||||
|
||||
|
||||
def generate_cython(root, source):
|
||||
print("Cythonizing sources")
|
||||
p = subprocess.call(
|
||||
[sys.executable, os.path.join(root, "bin", "cythonize.py"), source],
|
||||
env=os.environ,
|
||||
)
|
||||
if p != 0:
|
||||
raise RuntimeError("Running cythonize failed")
|
||||
|
||||
|
||||
def is_source_release(path):
|
||||
return os.path.exists(os.path.join(path, "PKG-INFO"))
|
||||
|
||||
|
||||
# Include the git version in the build (adapted from NumPy)
|
||||
# Copyright (c) 2005-2020, NumPy Developers.
|
||||
# BSD 3-Clause license, see licenses/3rd_party_licenses.txt
|
||||
|
@ -137,19 +137,19 @@ def write_git_info_py(filename="spacy/git_info.py"):
|
|||
return out
|
||||
|
||||
git_version = "Unknown"
|
||||
if os.path.exists(".git"):
|
||||
if Path(".git").exists():
|
||||
try:
|
||||
out = _minimal_ext_cmd(["git", "rev-parse", "--short", "HEAD"])
|
||||
git_version = out.strip().decode("ascii")
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
elif os.path.exists(filename):
|
||||
elif Path(filename).exists():
|
||||
# must be a source distribution, use existing version file
|
||||
try:
|
||||
a = open(filename, "r")
|
||||
lines = a.readlines()
|
||||
git_version = lines[-1].split('"')[1]
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
finally:
|
||||
a.close()
|
||||
|
@ -160,90 +160,59 @@ GIT_VERSION = "%(git_version)s"
|
|||
"""
|
||||
a = open(filename, "w")
|
||||
try:
|
||||
a.write(
|
||||
text % {"git_version": git_version,}
|
||||
)
|
||||
a.write(text % {"git_version": git_version})
|
||||
finally:
|
||||
a.close()
|
||||
|
||||
|
||||
def clean(path):
|
||||
for name in MOD_NAMES:
|
||||
name = name.replace(".", "/")
|
||||
for ext in [".so", ".html", ".cpp", ".c"]:
|
||||
file_path = os.path.join(path, name + ext)
|
||||
if os.path.exists(file_path):
|
||||
os.unlink(file_path)
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
def chdir(new_dir):
|
||||
old_dir = os.getcwd()
|
||||
try:
|
||||
os.chdir(new_dir)
|
||||
sys.path.insert(0, new_dir)
|
||||
yield
|
||||
finally:
|
||||
del sys.path[0]
|
||||
os.chdir(old_dir)
|
||||
for path in path.glob("**/*"):
|
||||
if path.is_file() and path.suffix in (".so", ".cpp", ".html"):
|
||||
print(f"Deleting {path.name}")
|
||||
path.unlink()
|
||||
|
||||
|
||||
def setup_package():
|
||||
write_git_info_py()
|
||||
|
||||
root = os.path.abspath(os.path.dirname(__file__))
|
||||
|
||||
if len(sys.argv) > 1 and sys.argv[1] == "clean":
|
||||
return clean(root)
|
||||
return clean(PACKAGE_ROOT)
|
||||
|
||||
with chdir(root):
|
||||
with io.open(os.path.join(root, "spacy", "about.py"), encoding="utf8") as f:
|
||||
about = {}
|
||||
exec(f.read(), about)
|
||||
with (PACKAGE_ROOT / "about.py").open("r") as f:
|
||||
about = {}
|
||||
exec(f.read(), about)
|
||||
|
||||
include_dirs = [
|
||||
get_python_inc(plat_specific=True),
|
||||
os.path.join(root, "include"),
|
||||
]
|
||||
for copy_file, target_dir in COPY_FILES.items():
|
||||
if copy_file.exists():
|
||||
shutil.copy(str(copy_file), str(target_dir))
|
||||
print(f"Copied {copy_file} -> {target_dir}")
|
||||
|
||||
if (
|
||||
ccompiler.new_compiler().compiler_type == "msvc"
|
||||
and msvccompiler.get_build_version() == 9
|
||||
):
|
||||
include_dirs.append(os.path.join(root, "include", "msvc9"))
|
||||
include_dirs = [
|
||||
get_python_inc(plat_specific=True),
|
||||
numpy.get_include(),
|
||||
str(ROOT / "include"),
|
||||
]
|
||||
if (
|
||||
ccompiler.new_compiler().compiler_type == "msvc"
|
||||
and msvccompiler.get_build_version() == 9
|
||||
):
|
||||
include_dirs.append(str(ROOT / "include" / "msvc9"))
|
||||
ext_modules = []
|
||||
for name in MOD_NAMES:
|
||||
mod_path = name.replace(".", "/") + ".pyx"
|
||||
ext = Extension(name, [mod_path], language="c++")
|
||||
ext_modules.append(ext)
|
||||
print("Cythonizing sources")
|
||||
ext_modules = cythonize(ext_modules, compiler_directives=COMPILER_DIRECTIVES)
|
||||
|
||||
ext_modules = []
|
||||
for mod_name in MOD_NAMES:
|
||||
mod_path = mod_name.replace(".", "/") + ".cpp"
|
||||
extra_link_args = []
|
||||
# ???
|
||||
# Imported from patch from @mikepb
|
||||
# See Issue #267. Running blind here...
|
||||
if sys.platform == "darwin":
|
||||
dylib_path = [".." for _ in range(mod_name.count("."))]
|
||||
dylib_path = "/".join(dylib_path)
|
||||
dylib_path = "@loader_path/%s/spacy/platform/darwin/lib" % dylib_path
|
||||
extra_link_args.append("-Wl,-rpath,%s" % dylib_path)
|
||||
ext_modules.append(
|
||||
Extension(
|
||||
mod_name,
|
||||
[mod_path],
|
||||
language="c++",
|
||||
include_dirs=include_dirs,
|
||||
extra_link_args=extra_link_args,
|
||||
)
|
||||
)
|
||||
|
||||
if not is_source_release(root):
|
||||
generate_cython(root, "spacy")
|
||||
|
||||
setup(
|
||||
name="spacy",
|
||||
packages=PACKAGES,
|
||||
version=about["__version__"],
|
||||
ext_modules=ext_modules,
|
||||
cmdclass={"build_ext": build_ext_subclass},
|
||||
)
|
||||
setup(
|
||||
name="spacy-nightly",
|
||||
packages=PACKAGES,
|
||||
version=about["__version__"],
|
||||
ext_modules=ext_modules,
|
||||
cmdclass={"build_ext": build_ext_subclass},
|
||||
include_dirs=include_dirs,
|
||||
package_data={"": ["*.pyx", "*.pxd", "*.pxi", "*.cpp"]},
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
@ -1,5 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
import warnings
|
||||
import sys
|
||||
|
||||
|
@ -7,10 +5,10 @@ warnings.filterwarnings("ignore", message="numpy.dtype size changed")
|
|||
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
|
||||
|
||||
# These are imported as part of the API
|
||||
from thinc.neural.util import prefer_gpu, require_gpu
|
||||
from thinc.api import prefer_gpu, require_gpu
|
||||
|
||||
from . import pipeline
|
||||
from .cli.info import info as cli_info
|
||||
from .cli.info import info
|
||||
from .glossary import explain
|
||||
from .about import __version__
|
||||
from .errors import Errors, Warnings
|
||||
|
@ -23,17 +21,13 @@ if sys.maxunicode == 65535:
|
|||
raise SystemError(Errors.E130)
|
||||
|
||||
|
||||
config = registry
|
||||
|
||||
|
||||
def load(name, **overrides):
|
||||
depr_path = overrides.get("path")
|
||||
if depr_path not in (True, False, None):
|
||||
warnings.warn(Warnings.W001.format(path=depr_path), DeprecationWarning)
|
||||
return util.load_model(name, **overrides)
|
||||
|
||||
|
||||
def blank(name, **kwargs):
|
||||
LangClass = util.get_lang_class(name)
|
||||
return LangClass(**kwargs)
|
||||
|
||||
|
||||
def info(model=None, markdown=False, silent=False):
|
||||
return cli_info(model, markdown, silent)
|
||||
|
|
|
@ -1,36 +1,4 @@
|
|||
# coding: utf8
|
||||
from __future__ import print_function
|
||||
|
||||
# NB! This breaks in plac on Python 2!!
|
||||
# from __future__ import unicode_literals
|
||||
|
||||
if __name__ == "__main__":
|
||||
import plac
|
||||
import sys
|
||||
from wasabi import msg
|
||||
from spacy.cli import download, link, info, package, train, pretrain, convert
|
||||
from spacy.cli import init_model, profile, evaluate, validate, debug_data
|
||||
from spacy.cli import setup_cli
|
||||
|
||||
commands = {
|
||||
"download": download,
|
||||
"link": link,
|
||||
"info": info,
|
||||
"train": train,
|
||||
"pretrain": pretrain,
|
||||
"debug-data": debug_data,
|
||||
"evaluate": evaluate,
|
||||
"convert": convert,
|
||||
"package": package,
|
||||
"init-model": init_model,
|
||||
"profile": profile,
|
||||
"validate": validate,
|
||||
}
|
||||
if len(sys.argv) == 1:
|
||||
msg.info("Available commands", ", ".join(commands), exits=1)
|
||||
command = sys.argv.pop(1)
|
||||
sys.argv[0] = "spacy %s" % command
|
||||
if command in commands:
|
||||
plac.call(commands[command], sys.argv[1:])
|
||||
else:
|
||||
available = "Available: {}".format(", ".join(commands))
|
||||
msg.fail("Unknown command: {}".format(command), available, exits=1)
|
||||
setup_cli()
|
||||
|
|
1004
spacy/_ml.py
1004
spacy/_ml.py
File diff suppressed because it is too large
Load Diff
|
@ -1,7 +1,7 @@
|
|||
# fmt: off
|
||||
__title__ = "spacy"
|
||||
__version__ = "2.3.2"
|
||||
__title__ = "spacy-nightly"
|
||||
__version__ = "3.0.0a4"
|
||||
__release__ = True
|
||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||
__shortcuts__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json"
|
||||
__projects__ = "https://github.com/explosion/spacy-boilerplates"
|
||||
|
|
|
@ -91,6 +91,7 @@ cdef enum attr_id_t:
|
|||
|
||||
LANG
|
||||
ENT_KB_ID = symbols.ENT_KB_ID
|
||||
MORPH
|
||||
ENT_ID = symbols.ENT_ID
|
||||
|
||||
IDX
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
IDS = {
|
||||
"": NULL_ATTR,
|
||||
|
@ -92,6 +89,7 @@ IDS = {
|
|||
"SPACY": SPACY,
|
||||
"PROB": PROB,
|
||||
"LANG": LANG,
|
||||
"MORPH": MORPH,
|
||||
"IDX": IDX
|
||||
}
|
||||
|
||||
|
|
|
@ -1,12 +1,32 @@
|
|||
from wasabi import msg
|
||||
|
||||
from ._util import app, setup_cli # noqa: F401
|
||||
|
||||
# These are the actual functions, NOT the wrapped CLI commands. The CLI commands
|
||||
# are registered automatically and won't have to be imported here.
|
||||
from .download import download # noqa: F401
|
||||
from .info import info # noqa: F401
|
||||
from .link import link # noqa: F401
|
||||
from .package import package # noqa: F401
|
||||
from .profile import profile # noqa: F401
|
||||
from .train import train # noqa: F401
|
||||
from .train import train_cli # noqa: F401
|
||||
from .pretrain import pretrain # noqa: F401
|
||||
from .debug_data import debug_data # noqa: F401
|
||||
from .debug_model import debug_model # noqa: F401
|
||||
from .evaluate import evaluate # noqa: F401
|
||||
from .convert import convert # noqa: F401
|
||||
from .init_model import init_model # noqa: F401
|
||||
from .validate import validate # noqa: F401
|
||||
from .project.clone import project_clone # noqa: F401
|
||||
from .project.assets import project_assets # noqa: F401
|
||||
from .project.run import project_run # noqa: F401
|
||||
from .project.dvc import project_update_dvc # noqa: F401
|
||||
|
||||
|
||||
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
|
||||
def link(*args, **kwargs):
|
||||
"""As of spaCy v3.0, model symlinks are deprecated. You can load models
|
||||
using their full names or from a directory path."""
|
||||
msg.warn(
|
||||
"As of spaCy v3.0, model symlinks are deprecated. You can load models "
|
||||
"using their full names or from a directory path."
|
||||
)
|
||||
|
|
|
@ -1,220 +0,0 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
# NB: This schema describes the new format of the training data, see #2928
|
||||
TRAINING_SCHEMA = {
|
||||
"$schema": "http://json-schema.org/draft-06/schema",
|
||||
"title": "Training data for spaCy models",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"text": {
|
||||
"title": "The text of the training example",
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
},
|
||||
"ents": {
|
||||
"title": "Named entity spans in the text",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"start": {
|
||||
"title": "Start character offset of the span",
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
},
|
||||
"end": {
|
||||
"title": "End character offset of the span",
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
},
|
||||
"label": {
|
||||
"title": "Entity label",
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
"pattern": "^[A-Z0-9]*$",
|
||||
},
|
||||
},
|
||||
"required": ["start", "end", "label"],
|
||||
},
|
||||
},
|
||||
"sents": {
|
||||
"title": "Sentence spans in the text",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"start": {
|
||||
"title": "Start character offset of the span",
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
},
|
||||
"end": {
|
||||
"title": "End character offset of the span",
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
},
|
||||
},
|
||||
"required": ["start", "end"],
|
||||
},
|
||||
},
|
||||
"cats": {
|
||||
"title": "Text categories for the text classifier",
|
||||
"type": "object",
|
||||
"patternProperties": {
|
||||
"*": {
|
||||
"title": "A text category",
|
||||
"oneOf": [
|
||||
{"type": "boolean"},
|
||||
{"type": "number", "minimum": 0},
|
||||
],
|
||||
}
|
||||
},
|
||||
"propertyNames": {"pattern": "^[A-Z0-9]*$", "minLength": 1},
|
||||
},
|
||||
"tokens": {
|
||||
"title": "The tokens in the text",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"minProperties": 1,
|
||||
"properties": {
|
||||
"id": {
|
||||
"title": "Token ID, usually token index",
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
},
|
||||
"start": {
|
||||
"title": "Start character offset of the token",
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
},
|
||||
"end": {
|
||||
"title": "End character offset of the token",
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
},
|
||||
"pos": {
|
||||
"title": "Coarse-grained part-of-speech tag",
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
},
|
||||
"tag": {
|
||||
"title": "Fine-grained part-of-speech tag",
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
},
|
||||
"dep": {
|
||||
"title": "Dependency label",
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
},
|
||||
"head": {
|
||||
"title": "Index of the token's head",
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
},
|
||||
},
|
||||
"required": ["start", "end"],
|
||||
},
|
||||
},
|
||||
"_": {"title": "Custom user space", "type": "object"},
|
||||
},
|
||||
"required": ["text"],
|
||||
},
|
||||
}
|
||||
|
||||
META_SCHEMA = {
|
||||
"$schema": "http://json-schema.org/draft-06/schema",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"lang": {
|
||||
"title": "Two-letter language code, e.g. 'en'",
|
||||
"type": "string",
|
||||
"minLength": 2,
|
||||
"maxLength": 2,
|
||||
"pattern": "^[a-z]*$",
|
||||
},
|
||||
"name": {
|
||||
"title": "Model name",
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
"pattern": "^[a-z_]*$",
|
||||
},
|
||||
"version": {
|
||||
"title": "Model version",
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
"pattern": "^[0-9a-z.-]*$",
|
||||
},
|
||||
"spacy_version": {
|
||||
"title": "Compatible spaCy version identifier",
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
"pattern": "^[0-9a-z.-><=]*$",
|
||||
},
|
||||
"parent_package": {
|
||||
"title": "Name of parent spaCy package, e.g. spacy or spacy-nightly",
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
"default": "spacy",
|
||||
},
|
||||
"pipeline": {
|
||||
"title": "Names of pipeline components",
|
||||
"type": "array",
|
||||
"items": {"type": "string", "minLength": 1},
|
||||
},
|
||||
"description": {"title": "Model description", "type": "string"},
|
||||
"license": {"title": "Model license", "type": "string"},
|
||||
"author": {"title": "Model author name", "type": "string"},
|
||||
"email": {"title": "Model author email", "type": "string", "format": "email"},
|
||||
"url": {"title": "Model author URL", "type": "string", "format": "uri"},
|
||||
"sources": {
|
||||
"title": "Training data sources",
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
},
|
||||
"vectors": {
|
||||
"title": "Included word vectors",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"keys": {
|
||||
"title": "Number of unique keys",
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
},
|
||||
"vectors": {
|
||||
"title": "Number of unique vectors",
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
},
|
||||
"width": {
|
||||
"title": "Number of dimensions",
|
||||
"type": "integer",
|
||||
"minimum": 0,
|
||||
},
|
||||
},
|
||||
},
|
||||
"accuracy": {
|
||||
"title": "Accuracy numbers",
|
||||
"type": "object",
|
||||
"patternProperties": {"*": {"type": "number", "minimum": 0.0}},
|
||||
},
|
||||
"speed": {
|
||||
"title": "Speed evaluation numbers",
|
||||
"type": "object",
|
||||
"patternProperties": {
|
||||
"*": {
|
||||
"oneOf": [
|
||||
{"type": "number", "minimum": 0.0},
|
||||
{"type": "integer", "minimum": 0},
|
||||
]
|
||||
}
|
||||
},
|
||||
},
|
||||
},
|
||||
"required": ["lang", "name", "version"],
|
||||
}
|
195
spacy/cli/_util.py
Normal file
195
spacy/cli/_util.py
Normal file
|
@ -0,0 +1,195 @@
|
|||
from typing import Dict, Any, Union, List, Optional
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
import srsly
|
||||
import hashlib
|
||||
import typer
|
||||
from typer.main import get_command
|
||||
from contextlib import contextmanager
|
||||
from thinc.config import ConfigValidationError
|
||||
from configparser import InterpolationError
|
||||
import sys
|
||||
|
||||
from ..schemas import ProjectConfigSchema, validate
|
||||
from ..util import import_file
|
||||
|
||||
|
||||
PROJECT_FILE = "project.yml"
|
||||
PROJECT_LOCK = "project.lock"
|
||||
COMMAND = "python -m spacy"
|
||||
NAME = "spacy"
|
||||
HELP = """spaCy Command-line Interface
|
||||
|
||||
DOCS: https://spacy.io/api/cli
|
||||
"""
|
||||
PROJECT_HELP = f"""Command-line interface for spaCy projects and templates.
|
||||
You'd typically start by cloning a project template to a local directory and
|
||||
fetching its assets like datasets etc. See the project's {PROJECT_FILE} for the
|
||||
available commands.
|
||||
"""
|
||||
DEBUG_HELP = """Suite of helpful commands for debugging and profiling. Includes
|
||||
commands to check and validate your config files, training and evaluation data,
|
||||
and custom model implementations.
|
||||
"""
|
||||
|
||||
# Wrappers for Typer's annotations. Initially created to set defaults and to
|
||||
# keep the names short, but not needed at the moment.
|
||||
Arg = typer.Argument
|
||||
Opt = typer.Option
|
||||
|
||||
app = typer.Typer(name=NAME, help=HELP)
|
||||
project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
|
||||
debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True)
|
||||
|
||||
app.add_typer(project_cli)
|
||||
app.add_typer(debug_cli)
|
||||
|
||||
|
||||
def setup_cli() -> None:
|
||||
# Ensure that the help messages always display the correct prompt
|
||||
command = get_command(app)
|
||||
command(prog_name=COMMAND)
|
||||
|
||||
|
||||
def parse_config_overrides(args: List[str]) -> Dict[str, Any]:
|
||||
"""Generate a dictionary of config overrides based on the extra arguments
|
||||
provided on the CLI, e.g. --training.batch_size to override
|
||||
"training.batch_size". Arguments without a "." are considered invalid,
|
||||
since the config only allows top-level sections to exist.
|
||||
|
||||
args (List[str]): The extra arguments from the command line.
|
||||
RETURNS (Dict[str, Any]): The parsed dict, keyed by nested config setting.
|
||||
"""
|
||||
result = {}
|
||||
while args:
|
||||
opt = args.pop(0)
|
||||
err = f"Invalid config override '{opt}'"
|
||||
if opt.startswith("--"): # new argument
|
||||
opt = opt.replace("--", "").replace("-", "_")
|
||||
if "." not in opt:
|
||||
msg.fail(f"{err}: can't override top-level section", exits=1)
|
||||
if not args or args[0].startswith("--"): # flag with no value
|
||||
value = "true"
|
||||
else:
|
||||
value = args.pop(0)
|
||||
# Just like we do in the config, we're calling json.loads on the
|
||||
# values. But since they come from the CLI, it'd b unintuitive to
|
||||
# explicitly mark strings with escaped quotes. So we're working
|
||||
# around that here by falling back to a string if parsing fails.
|
||||
# TODO: improve logic to handle simple types like list of strings?
|
||||
try:
|
||||
result[opt] = srsly.json_loads(value)
|
||||
except ValueError:
|
||||
result[opt] = str(value)
|
||||
else:
|
||||
msg.fail(f"{err}: options need to start with --", exits=1)
|
||||
return result
|
||||
|
||||
|
||||
def load_project_config(path: Path) -> Dict[str, Any]:
|
||||
"""Load the project.yml file from a directory and validate it. Also make
|
||||
sure that all directories defined in the config exist.
|
||||
|
||||
path (Path): The path to the project directory.
|
||||
RETURNS (Dict[str, Any]): The loaded project.yml.
|
||||
"""
|
||||
config_path = path / PROJECT_FILE
|
||||
if not config_path.exists():
|
||||
msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1)
|
||||
invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct."
|
||||
try:
|
||||
config = srsly.read_yaml(config_path)
|
||||
except ValueError as e:
|
||||
msg.fail(invalid_err, e, exits=1)
|
||||
errors = validate(ProjectConfigSchema, config)
|
||||
if errors:
|
||||
msg.fail(invalid_err, "\n".join(errors), exits=1)
|
||||
validate_project_commands(config)
|
||||
# Make sure directories defined in config exist
|
||||
for subdir in config.get("directories", []):
|
||||
dir_path = path / subdir
|
||||
if not dir_path.exists():
|
||||
dir_path.mkdir(parents=True)
|
||||
return config
|
||||
|
||||
|
||||
def validate_project_commands(config: Dict[str, Any]) -> None:
|
||||
"""Check that project commands and workflows are valid, don't contain
|
||||
duplicates, don't clash and only refer to commands that exist.
|
||||
|
||||
config (Dict[str, Any]): The loaded config.
|
||||
"""
|
||||
command_names = [cmd["name"] for cmd in config.get("commands", [])]
|
||||
workflows = config.get("workflows", {})
|
||||
duplicates = set([cmd for cmd in command_names if command_names.count(cmd) > 1])
|
||||
if duplicates:
|
||||
err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(duplicates)}"
|
||||
msg.fail(err, exits=1)
|
||||
for workflow_name, workflow_steps in workflows.items():
|
||||
if workflow_name in command_names:
|
||||
err = f"Can't use workflow name '{workflow_name}': name already exists as a command"
|
||||
msg.fail(err, exits=1)
|
||||
for step in workflow_steps:
|
||||
if step not in command_names:
|
||||
msg.fail(
|
||||
f"Unknown command specified in workflow '{workflow_name}': {step}",
|
||||
f"Workflows can only refer to commands defined in the 'commands' "
|
||||
f"section of the {PROJECT_FILE}.",
|
||||
exits=1,
|
||||
)
|
||||
|
||||
|
||||
def get_hash(data) -> str:
|
||||
"""Get the hash for a JSON-serializable object.
|
||||
|
||||
data: The data to hash.
|
||||
RETURNS (str): The hash.
|
||||
"""
|
||||
data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8")
|
||||
return hashlib.md5(data_str).hexdigest()
|
||||
|
||||
|
||||
def get_checksum(path: Union[Path, str]) -> str:
|
||||
"""Get the checksum for a file or directory given its file path. If a
|
||||
directory path is provided, this uses all files in that directory.
|
||||
|
||||
path (Union[Path, str]): The file or directory path.
|
||||
RETURNS (str): The checksum.
|
||||
"""
|
||||
path = Path(path)
|
||||
if path.is_file():
|
||||
return hashlib.md5(Path(path).read_bytes()).hexdigest()
|
||||
if path.is_dir():
|
||||
# TODO: this is currently pretty slow
|
||||
dir_checksum = hashlib.md5()
|
||||
for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()):
|
||||
dir_checksum.update(sub_file.read_bytes())
|
||||
return dir_checksum.hexdigest()
|
||||
raise ValueError(f"Can't get checksum for {path}: not a file or directory")
|
||||
|
||||
|
||||
@contextmanager
|
||||
def show_validation_error(title: str = "Config validation error"):
|
||||
"""Helper to show custom config validation errors on the CLI.
|
||||
|
||||
title (str): Title of the custom formatted error.
|
||||
"""
|
||||
try:
|
||||
yield
|
||||
except (ConfigValidationError, InterpolationError) as e:
|
||||
msg.fail(title, spaced=True)
|
||||
print(str(e).replace("Config validation error", "").strip())
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def import_code(code_path: Optional[Union[Path, str]]) -> None:
|
||||
"""Helper to import Python file provided in training commands / commands
|
||||
using the config. This makes custom registered functions available.
|
||||
"""
|
||||
if code_path is not None:
|
||||
if not Path(code_path).exists():
|
||||
msg.fail("Path to Python code not found", code_path, exits=1)
|
||||
try:
|
||||
import_file("python_code", code_path)
|
||||
except Exception as e:
|
||||
msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
|
|
@ -1,132 +1,163 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
from typing import Optional
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from wasabi import Printer
|
||||
import srsly
|
||||
import re
|
||||
import sys
|
||||
|
||||
from .converters import conllu2json, iob2json, conll_ner2json
|
||||
from .converters import ner_jsonl2json
|
||||
from ._util import app, Arg, Opt
|
||||
from ..gold import docs_to_json
|
||||
from ..tokens import DocBin
|
||||
from ..gold.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs
|
||||
|
||||
|
||||
# Converters are matched by file extension except for ner/iob, which are
|
||||
# matched by file extension and content. To add a converter, add a new
|
||||
# entry to this dict with the file extension mapped to the converter function
|
||||
# imported from /converters.
|
||||
|
||||
CONVERTERS = {
|
||||
"conllubio": conllu2json,
|
||||
"conllu": conllu2json,
|
||||
"conll": conllu2json,
|
||||
"ner": conll_ner2json,
|
||||
"iob": iob2json,
|
||||
"jsonl": ner_jsonl2json,
|
||||
"conllubio": conllu2docs,
|
||||
"conllu": conllu2docs,
|
||||
"conll": conllu2docs,
|
||||
"ner": conll_ner2docs,
|
||||
"iob": iob2docs,
|
||||
"json": json2docs,
|
||||
}
|
||||
|
||||
# File types
|
||||
FILE_TYPES = ("json", "jsonl", "msg")
|
||||
FILE_TYPES_STDOUT = ("json", "jsonl")
|
||||
|
||||
# File types that can be written to stdout
|
||||
FILE_TYPES_STDOUT = ("json",)
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
input_file=("Input file", "positional", None, str),
|
||||
output_dir=("Output directory. '-' for stdout.", "positional", None, str),
|
||||
file_type=("Type of data to produce: {}".format(FILE_TYPES), "option", "t", str),
|
||||
n_sents=("Number of sentences per doc (0 to disable)", "option", "n", int),
|
||||
seg_sents=("Segment sentences (for -c ner)", "flag", "s"),
|
||||
model=("Model for sentence segmentation (for -s)", "option", "b", str),
|
||||
converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str),
|
||||
lang=("Language (if tokenizer required)", "option", "l", str),
|
||||
morphology=("Enable appending morphology to tags", "flag", "m", bool),
|
||||
)
|
||||
def convert(
|
||||
input_file,
|
||||
output_dir="-",
|
||||
file_type="json",
|
||||
n_sents=1,
|
||||
seg_sents=False,
|
||||
model=None,
|
||||
morphology=False,
|
||||
converter="auto",
|
||||
lang=None,
|
||||
class FileTypes(str, Enum):
|
||||
json = "json"
|
||||
spacy = "spacy"
|
||||
|
||||
|
||||
@app.command("convert")
|
||||
def convert_cli(
|
||||
# fmt: off
|
||||
input_path: str = Arg(..., help="Input file or directory", exists=True),
|
||||
output_dir: Path = Arg("-", help="Output directory. '-' for stdout.", allow_dash=True, exists=True),
|
||||
file_type: FileTypes = Opt("spacy", "--file-type", "-t", help="Type of data to produce"),
|
||||
n_sents: int = Opt(1, "--n-sents", "-n", help="Number of sentences per doc (0 to disable)"),
|
||||
seg_sents: bool = Opt(False, "--seg-sents", "-s", help="Segment sentences (for -c ner)"),
|
||||
model: Optional[str] = Opt(None, "--model", "-b", help="Model for sentence segmentation (for -s)"),
|
||||
morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
|
||||
merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
|
||||
converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
|
||||
ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
|
||||
lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Convert files into JSON format for use with train command and other
|
||||
experiment management functions. If no output_dir is specified, the data
|
||||
Convert files into json or DocBin format for training. The resulting .spacy
|
||||
file can be used with the train command and other experiment management
|
||||
functions.
|
||||
|
||||
If no output_dir is specified and the output format is JSON, the data
|
||||
is written to stdout, so you can pipe them forward to a JSON file:
|
||||
$ spacy convert some_file.conllu > some_file.json
|
||||
$ spacy convert some_file.conllu --file-type json > some_file.json
|
||||
"""
|
||||
no_print = output_dir == "-"
|
||||
msg = Printer(no_print=no_print)
|
||||
input_path = Path(input_file)
|
||||
if file_type not in FILE_TYPES:
|
||||
msg.fail(
|
||||
"Unknown file type: '{}'".format(file_type),
|
||||
"Supported file types: '{}'".format(", ".join(FILE_TYPES)),
|
||||
exits=1,
|
||||
)
|
||||
if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
|
||||
# TODO: support msgpack via stdout in srsly?
|
||||
msg.fail(
|
||||
"Can't write .{} data to stdout.".format(file_type),
|
||||
"Please specify an output directory.",
|
||||
exits=1,
|
||||
)
|
||||
if not input_path.exists():
|
||||
msg.fail("Input file not found", input_path, exits=1)
|
||||
if output_dir != "-" and not Path(output_dir).exists():
|
||||
msg.fail("Output directory not found", output_dir, exits=1)
|
||||
input_data = input_path.open("r", encoding="utf-8").read()
|
||||
if converter == "auto":
|
||||
converter = input_path.suffix[1:]
|
||||
if converter == "ner" or converter == "iob":
|
||||
converter_autodetect = autodetect_ner_format(input_data)
|
||||
if converter_autodetect == "ner":
|
||||
msg.info("Auto-detected token-per-line NER format")
|
||||
converter = converter_autodetect
|
||||
elif converter_autodetect == "iob":
|
||||
msg.info("Auto-detected sentence-per-line NER format")
|
||||
converter = converter_autodetect
|
||||
else:
|
||||
msg.warn(
|
||||
"Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert"
|
||||
)
|
||||
if converter not in CONVERTERS:
|
||||
msg.fail("Can't find converter for {}".format(converter), exits=1)
|
||||
# Use converter function to convert data
|
||||
func = CONVERTERS[converter]
|
||||
data = func(
|
||||
input_data,
|
||||
if isinstance(file_type, FileTypes):
|
||||
# We get an instance of the FileTypes from the CLI so we need its string value
|
||||
file_type = file_type.value
|
||||
input_path = Path(input_path)
|
||||
output_dir = "-" if output_dir == Path("-") else output_dir
|
||||
cli_args = locals()
|
||||
silent = output_dir == "-"
|
||||
msg = Printer(no_print=silent)
|
||||
verify_cli_args(msg, **cli_args)
|
||||
converter = _get_converter(msg, converter, input_path)
|
||||
convert(
|
||||
input_path,
|
||||
output_dir,
|
||||
file_type=file_type,
|
||||
n_sents=n_sents,
|
||||
seg_sents=seg_sents,
|
||||
use_morphology=morphology,
|
||||
lang=lang,
|
||||
model=model,
|
||||
no_print=no_print,
|
||||
morphology=morphology,
|
||||
merge_subtokens=merge_subtokens,
|
||||
converter=converter,
|
||||
ner_map=ner_map,
|
||||
lang=lang,
|
||||
silent=silent,
|
||||
msg=msg,
|
||||
)
|
||||
if output_dir != "-":
|
||||
# Export data to a file
|
||||
suffix = ".{}".format(file_type)
|
||||
output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
|
||||
if file_type == "json":
|
||||
srsly.write_json(output_file, data)
|
||||
elif file_type == "jsonl":
|
||||
srsly.write_jsonl(output_file, data)
|
||||
elif file_type == "msg":
|
||||
srsly.write_msgpack(output_file, data)
|
||||
msg.good(
|
||||
"Generated output file ({} documents): {}".format(len(data), output_file)
|
||||
|
||||
|
||||
def convert(
|
||||
input_path: Path,
|
||||
output_dir: Path,
|
||||
*,
|
||||
file_type: str = "json",
|
||||
n_sents: int = 1,
|
||||
seg_sents: bool = False,
|
||||
model: Optional[str] = None,
|
||||
morphology: bool = False,
|
||||
merge_subtokens: bool = False,
|
||||
converter: str = "auto",
|
||||
ner_map: Optional[Path] = None,
|
||||
lang: Optional[str] = None,
|
||||
silent: bool = True,
|
||||
msg: Optional[Path] = None,
|
||||
) -> None:
|
||||
if not msg:
|
||||
msg = Printer(no_print=silent)
|
||||
ner_map = srsly.read_json(ner_map) if ner_map is not None else None
|
||||
|
||||
for input_loc in walk_directory(input_path):
|
||||
input_data = input_loc.open("r", encoding="utf-8").read()
|
||||
# Use converter function to convert data
|
||||
func = CONVERTERS[converter]
|
||||
docs = func(
|
||||
input_data,
|
||||
n_sents=n_sents,
|
||||
seg_sents=seg_sents,
|
||||
append_morphology=morphology,
|
||||
merge_subtokens=merge_subtokens,
|
||||
lang=lang,
|
||||
model=model,
|
||||
no_print=silent,
|
||||
ner_map=ner_map,
|
||||
)
|
||||
else:
|
||||
# Print to stdout
|
||||
if file_type == "json":
|
||||
srsly.write_json("-", data)
|
||||
elif file_type == "jsonl":
|
||||
srsly.write_jsonl("-", data)
|
||||
data = [docs_to_json(docs)]
|
||||
else:
|
||||
data = DocBin(docs=docs, store_user_data=True).to_bytes()
|
||||
if output_dir == "-":
|
||||
_print_docs_to_stdout(data, file_type)
|
||||
else:
|
||||
if input_loc != input_path:
|
||||
subpath = input_loc.relative_to(input_path)
|
||||
output_file = Path(output_dir) / subpath.with_suffix(f".{file_type}")
|
||||
else:
|
||||
output_file = Path(output_dir) / input_loc.parts[-1]
|
||||
output_file = output_file.with_suffix(f".{file_type}")
|
||||
_write_docs_to_file(data, output_file, file_type)
|
||||
msg.good(f"Generated output file ({len(docs)} documents): {output_file}")
|
||||
|
||||
|
||||
def autodetect_ner_format(input_data):
|
||||
def _print_docs_to_stdout(data, output_type):
|
||||
if output_type == "json":
|
||||
srsly.write_json("-", data)
|
||||
else:
|
||||
sys.stdout.buffer.write(data)
|
||||
|
||||
|
||||
def _write_docs_to_file(data, output_file, output_type):
|
||||
if not output_file.parent.exists():
|
||||
output_file.parent.mkdir(parents=True)
|
||||
if output_type == "json":
|
||||
srsly.write_json(output_file, data)
|
||||
else:
|
||||
with output_file.open("wb") as file_:
|
||||
file_.write(data)
|
||||
|
||||
|
||||
def autodetect_ner_format(input_data: str) -> str:
|
||||
# guess format from the first 20 lines
|
||||
lines = input_data.split("\n")[:20]
|
||||
format_guesses = {"ner": 0, "iob": 0}
|
||||
|
@ -143,3 +174,86 @@ def autodetect_ner_format(input_data):
|
|||
if format_guesses["ner"] == 0 and format_guesses["iob"] > 0:
|
||||
return "iob"
|
||||
return None
|
||||
|
||||
|
||||
def walk_directory(path):
|
||||
if not path.is_dir():
|
||||
return [path]
|
||||
paths = [path]
|
||||
locs = []
|
||||
seen = set()
|
||||
for path in paths:
|
||||
if str(path) in seen:
|
||||
continue
|
||||
seen.add(str(path))
|
||||
if path.parts[-1].startswith("."):
|
||||
continue
|
||||
elif path.is_dir():
|
||||
paths.extend(path.iterdir())
|
||||
else:
|
||||
locs.append(path)
|
||||
return locs
|
||||
|
||||
|
||||
def verify_cli_args(
|
||||
msg,
|
||||
input_path,
|
||||
output_dir,
|
||||
file_type,
|
||||
n_sents,
|
||||
seg_sents,
|
||||
model,
|
||||
morphology,
|
||||
merge_subtokens,
|
||||
converter,
|
||||
ner_map,
|
||||
lang,
|
||||
):
|
||||
input_path = Path(input_path)
|
||||
if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
|
||||
# TODO: support msgpack via stdout in srsly?
|
||||
msg.fail(
|
||||
f"Can't write .{file_type} data to stdout",
|
||||
"Please specify an output directory.",
|
||||
exits=1,
|
||||
)
|
||||
if not input_path.exists():
|
||||
msg.fail("Input file not found", input_path, exits=1)
|
||||
if output_dir != "-" and not Path(output_dir).exists():
|
||||
msg.fail("Output directory not found", output_dir, exits=1)
|
||||
if input_path.is_dir():
|
||||
input_locs = walk_directory(input_path)
|
||||
if len(input_locs) == 0:
|
||||
msg.fail("No input files in directory", input_path, exits=1)
|
||||
file_types = list(set([loc.suffix[1:] for loc in input_locs]))
|
||||
if len(file_types) >= 2:
|
||||
file_types = ",".join(file_types)
|
||||
msg.fail("All input files must be same type", file_types, exits=1)
|
||||
converter = _get_converter(msg, converter, input_path)
|
||||
if converter not in CONVERTERS:
|
||||
msg.fail(f"Can't find converter for {converter}", exits=1)
|
||||
return converter
|
||||
|
||||
|
||||
def _get_converter(msg, converter, input_path):
|
||||
if input_path.is_dir():
|
||||
input_path = walk_directory(input_path)[0]
|
||||
if converter == "auto":
|
||||
converter = input_path.suffix[1:]
|
||||
if converter == "ner" or converter == "iob":
|
||||
with input_path.open() as file_:
|
||||
input_data = file_.read()
|
||||
converter_autodetect = autodetect_ner_format(input_data)
|
||||
if converter_autodetect == "ner":
|
||||
msg.info("Auto-detected token-per-line NER format")
|
||||
converter = converter_autodetect
|
||||
elif converter_autodetect == "iob":
|
||||
msg.info("Auto-detected sentence-per-line NER format")
|
||||
converter = converter_autodetect
|
||||
else:
|
||||
msg.warn(
|
||||
"Can't automatically detect NER format. "
|
||||
"Conversion may not succeed. "
|
||||
"See https://spacy.io/api/cli#convert"
|
||||
)
|
||||
return converter
|
||||
|
|
|
@ -1,4 +0,0 @@
|
|||
from .conllu2json import conllu2json # noqa: F401
|
||||
from .iob2json import iob2json # noqa: F401
|
||||
from .conll_ner2json import conll_ner2json # noqa: F401
|
||||
from .jsonl2json import ner_jsonl2json # noqa: F401
|
|
@ -1,141 +0,0 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import re
|
||||
|
||||
from ...gold import iob_to_biluo
|
||||
|
||||
|
||||
def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
|
||||
"""
|
||||
Convert conllu files into JSON format for use with train cli.
|
||||
use_morphology parameter enables appending morphology to tags, which is
|
||||
useful for languages such as Spanish, where UD tags are not so rich.
|
||||
|
||||
Extract NER tags if available and convert them so that they follow
|
||||
BILUO and the Wikipedia scheme
|
||||
"""
|
||||
# by @dvsrepo, via #11 explosion/spacy-dev-resources
|
||||
# by @katarkor
|
||||
docs = []
|
||||
sentences = []
|
||||
conll_tuples = read_conllx(input_data, use_morphology=use_morphology)
|
||||
checked_for_ner = False
|
||||
has_ner_tags = False
|
||||
for i, (raw_text, tokens) in enumerate(conll_tuples):
|
||||
sentence, brackets = tokens[0]
|
||||
if not checked_for_ner:
|
||||
has_ner_tags = is_ner(sentence[5][0])
|
||||
checked_for_ner = True
|
||||
sentences.append(generate_sentence(sentence, has_ner_tags))
|
||||
# Real-sized documents could be extracted using the comments on the
|
||||
# conluu document
|
||||
if len(sentences) % n_sents == 0:
|
||||
doc = create_doc(sentences, i)
|
||||
docs.append(doc)
|
||||
sentences = []
|
||||
if sentences:
|
||||
doc = create_doc(sentences, i)
|
||||
docs.append(doc)
|
||||
return docs
|
||||
|
||||
|
||||
def is_ner(tag):
|
||||
"""
|
||||
Check the 10th column of the first token to determine if the file contains
|
||||
NER tags
|
||||
"""
|
||||
tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
|
||||
if tag_match:
|
||||
return True
|
||||
elif tag == "O":
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
|
||||
def read_conllx(input_data, use_morphology=False, n=0):
|
||||
i = 0
|
||||
for sent in input_data.strip().split("\n\n"):
|
||||
lines = sent.strip().split("\n")
|
||||
if lines:
|
||||
while lines[0].startswith("#"):
|
||||
lines.pop(0)
|
||||
tokens = []
|
||||
for line in lines:
|
||||
|
||||
parts = line.split("\t")
|
||||
id_, word, lemma, pos, tag, morph, head, dep, _1, iob = parts
|
||||
if "-" in id_ or "." in id_:
|
||||
continue
|
||||
try:
|
||||
id_ = int(id_) - 1
|
||||
head = (int(head) - 1) if head not in ["0", "_"] else id_
|
||||
dep = "ROOT" if dep == "root" else dep
|
||||
tag = pos if tag == "_" else tag
|
||||
tag = tag + "__" + morph if use_morphology else tag
|
||||
iob = iob if iob else "O"
|
||||
tokens.append((id_, word, tag, head, dep, iob))
|
||||
except: # noqa: E722
|
||||
print(line)
|
||||
raise
|
||||
tuples = [list(t) for t in zip(*tokens)]
|
||||
yield (None, [[tuples, []]])
|
||||
i += 1
|
||||
if n >= 1 and i >= n:
|
||||
break
|
||||
|
||||
|
||||
def simplify_tags(iob):
|
||||
"""
|
||||
Simplify tags obtained from the dataset in order to follow Wikipedia
|
||||
scheme (PER, LOC, ORG, MISC). 'PER', 'LOC' and 'ORG' keep their tags, while
|
||||
'GPE_LOC' is simplified to 'LOC', 'GPE_ORG' to 'ORG' and all remaining tags to
|
||||
'MISC'.
|
||||
"""
|
||||
new_iob = []
|
||||
for tag in iob:
|
||||
tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
|
||||
if tag_match:
|
||||
prefix = tag_match.group(1)
|
||||
suffix = tag_match.group(2)
|
||||
if suffix == "GPE_LOC":
|
||||
suffix = "LOC"
|
||||
elif suffix == "GPE_ORG":
|
||||
suffix = "ORG"
|
||||
elif suffix != "PER" and suffix != "LOC" and suffix != "ORG":
|
||||
suffix = "MISC"
|
||||
tag = prefix + "-" + suffix
|
||||
new_iob.append(tag)
|
||||
return new_iob
|
||||
|
||||
|
||||
def generate_sentence(sent, has_ner_tags):
|
||||
(id_, word, tag, head, dep, iob) = sent
|
||||
sentence = {}
|
||||
tokens = []
|
||||
if has_ner_tags:
|
||||
iob = simplify_tags(iob)
|
||||
biluo = iob_to_biluo(iob)
|
||||
for i, id in enumerate(id_):
|
||||
token = {}
|
||||
token["id"] = id
|
||||
token["orth"] = word[i]
|
||||
token["tag"] = tag[i]
|
||||
token["head"] = head[i] - id
|
||||
token["dep"] = dep[i]
|
||||
if has_ner_tags:
|
||||
token["ner"] = biluo[i]
|
||||
tokens.append(token)
|
||||
sentence["tokens"] = tokens
|
||||
return sentence
|
||||
|
||||
|
||||
def create_doc(sentences, id):
|
||||
doc = {}
|
||||
paragraph = {}
|
||||
doc["id"] = id
|
||||
doc["paragraphs"] = []
|
||||
paragraph["sentences"] = sentences
|
||||
doc["paragraphs"].append(paragraph)
|
||||
return doc
|
|
@ -1,68 +0,0 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from wasabi import Printer
|
||||
|
||||
from ...gold import iob_to_biluo
|
||||
from ...util import minibatch
|
||||
from .conll_ner2json import n_sents_info
|
||||
|
||||
|
||||
def iob2json(input_data, n_sents=10, no_print=False, *args, **kwargs):
|
||||
"""
|
||||
Convert IOB files with one sentence per line and tags separated with '|'
|
||||
into JSON format for use with train cli. IOB and IOB2 are accepted.
|
||||
|
||||
Sample formats:
|
||||
|
||||
I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O
|
||||
I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O
|
||||
I|PRP|O like|VBP|O London|NNP|I-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O
|
||||
I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O
|
||||
"""
|
||||
msg = Printer(no_print=no_print)
|
||||
docs = read_iob(input_data.split("\n"))
|
||||
if n_sents > 0:
|
||||
n_sents_info(msg, n_sents)
|
||||
docs = merge_sentences(docs, n_sents)
|
||||
return docs
|
||||
|
||||
|
||||
def read_iob(raw_sents):
|
||||
sentences = []
|
||||
for line in raw_sents:
|
||||
if not line.strip():
|
||||
continue
|
||||
tokens = [t.split("|") for t in line.split()]
|
||||
if len(tokens[0]) == 3:
|
||||
words, pos, iob = zip(*tokens)
|
||||
elif len(tokens[0]) == 2:
|
||||
words, iob = zip(*tokens)
|
||||
pos = ["-"] * len(words)
|
||||
else:
|
||||
raise ValueError(
|
||||
"The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert"
|
||||
)
|
||||
biluo = iob_to_biluo(iob)
|
||||
sentences.append(
|
||||
[
|
||||
{"orth": w, "tag": p, "ner": ent}
|
||||
for (w, p, ent) in zip(words, pos, biluo)
|
||||
]
|
||||
)
|
||||
sentences = [{"tokens": sent} for sent in sentences]
|
||||
paragraphs = [{"sentences": [sent]} for sent in sentences]
|
||||
docs = [{"id": i, "paragraphs": [para]} for i, para in enumerate(paragraphs)]
|
||||
return docs
|
||||
|
||||
|
||||
def merge_sentences(docs, n_sents):
|
||||
merged = []
|
||||
for group in minibatch(docs, size=n_sents):
|
||||
group = list(group)
|
||||
first = group.pop(0)
|
||||
to_extend = first["paragraphs"][0]["sentences"]
|
||||
for sent in group:
|
||||
to_extend.extend(sent["paragraphs"][0]["sentences"])
|
||||
merged.append(first)
|
||||
return merged
|
|
@ -1,53 +0,0 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import srsly
|
||||
|
||||
from ...gold import docs_to_json
|
||||
from ...util import get_lang_class, minibatch
|
||||
|
||||
|
||||
def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False, **_):
|
||||
if lang is None:
|
||||
raise ValueError("No --lang specified, but tokenization required")
|
||||
json_docs = []
|
||||
input_examples = [srsly.json_loads(line) for line in input_data.strip().split("\n")]
|
||||
nlp = get_lang_class(lang)()
|
||||
sentencizer = nlp.create_pipe("sentencizer")
|
||||
for i, batch in enumerate(minibatch(input_examples, size=n_sents)):
|
||||
docs = []
|
||||
for record in batch:
|
||||
raw_text = record["text"]
|
||||
if "entities" in record:
|
||||
ents = record["entities"]
|
||||
else:
|
||||
ents = record["spans"]
|
||||
ents = [(e["start"], e["end"], e["label"]) for e in ents]
|
||||
doc = nlp.make_doc(raw_text)
|
||||
sentencizer(doc)
|
||||
spans = [doc.char_span(s, e, label=L) for s, e, L in ents]
|
||||
doc.ents = _cleanup_spans(spans)
|
||||
docs.append(doc)
|
||||
json_docs.append(docs_to_json(docs, id=i))
|
||||
return json_docs
|
||||
|
||||
|
||||
def _cleanup_spans(spans):
|
||||
output = []
|
||||
seen = set()
|
||||
for span in spans:
|
||||
if span is not None:
|
||||
# Trim whitespace
|
||||
while len(span) and span[0].is_space:
|
||||
span = span[1:]
|
||||
while len(span) and span[-1].is_space:
|
||||
span = span[:-1]
|
||||
if not len(span):
|
||||
continue
|
||||
for i in range(span.start, span.end):
|
||||
if i in seen:
|
||||
break
|
||||
else:
|
||||
output.append(span)
|
||||
seen.update(range(span.start, span.end))
|
||||
return output
|
|
@ -1,16 +1,18 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
from typing import List, Sequence, Dict, Any, Tuple, Optional
|
||||
from pathlib import Path
|
||||
from collections import Counter
|
||||
import plac
|
||||
import sys
|
||||
import srsly
|
||||
from wasabi import Printer, MESSAGES
|
||||
from wasabi import Printer, MESSAGES, msg
|
||||
import typer
|
||||
|
||||
from ..gold import GoldCorpus
|
||||
from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
|
||||
from ._util import import_code, debug_cli
|
||||
from ..schemas import ConfigSchema
|
||||
from ..gold import Corpus, Example
|
||||
from ..syntax import nonproj
|
||||
from ..util import load_model, get_lang_class
|
||||
from ..language import Language
|
||||
from .. import util
|
||||
|
||||
|
||||
# Minimum number of expected occurrences of NER label in data to train new label
|
||||
|
@ -22,83 +24,137 @@ BLANK_MODEL_MIN_THRESHOLD = 100
|
|||
BLANK_MODEL_THRESHOLD = 2000
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
# fmt: off
|
||||
lang=("model language", "positional", None, str),
|
||||
train_path=("location of JSON-formatted training data", "positional", None, Path),
|
||||
dev_path=("location of JSON-formatted development data", "positional", None, Path),
|
||||
tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
|
||||
base_model=("name of model to update (optional)", "option", "b", str),
|
||||
pipeline=("Comma-separated names of pipeline components to train", "option", "p", str),
|
||||
ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool),
|
||||
verbose=("Print additional information and explanations", "flag", "V", bool),
|
||||
no_format=("Don't pretty-print the results", "flag", "NF", bool),
|
||||
# fmt: on
|
||||
@debug_cli.command(
|
||||
"config",
|
||||
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
||||
)
|
||||
def debug_data(
|
||||
lang,
|
||||
train_path,
|
||||
dev_path,
|
||||
tag_map_path=None,
|
||||
base_model=None,
|
||||
pipeline="tagger,parser,ner",
|
||||
ignore_warnings=False,
|
||||
verbose=False,
|
||||
no_format=False,
|
||||
def debug_config_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context, # This is only used to read additional arguments
|
||||
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
||||
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
# fmt: on
|
||||
):
|
||||
"""Debug a config.cfg file and show validation errors. The command will
|
||||
create all objects in the tree and validate them. Note that some config
|
||||
validation errors are blocking and will prevent the rest of the config from
|
||||
being resolved. This means that you may not see all validation errors at
|
||||
once and some issues are only shown once previous errors have been fixed.
|
||||
"""
|
||||
overrides = parse_config_overrides(ctx.args)
|
||||
import_code(code_path)
|
||||
with show_validation_error():
|
||||
util.load_config(
|
||||
config_path, create_objects=False, schema=ConfigSchema, overrides=overrides,
|
||||
)
|
||||
msg.good("Config is valid")
|
||||
|
||||
|
||||
@debug_cli.command(
|
||||
"data", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
||||
)
|
||||
@app.command(
|
||||
"debug-data",
|
||||
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
||||
hidden=True, # hide this from main CLI help but still allow it to work with warning
|
||||
)
|
||||
def debug_data_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context, # This is only used to read additional arguments
|
||||
train_path: Path = Arg(..., help="Location of JSON-formatted training data", exists=True),
|
||||
dev_path: Path = Arg(..., help="Location of JSON-formatted development data", exists=True),
|
||||
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
||||
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
|
||||
verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"),
|
||||
no_format: bool = Opt(False, "--no-format", "-NF", help="Don't pretty-print the results"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Analyze, debug and validate your training and development data, get useful
|
||||
stats, and find problems like invalid entity annotations, cyclic
|
||||
dependencies, low data labels and more.
|
||||
Analyze, debug and validate your training and development data. Outputs
|
||||
useful stats, and can help you find problems like invalid entity annotations,
|
||||
cyclic dependencies, low data labels and more.
|
||||
"""
|
||||
msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings)
|
||||
if ctx.command.name == "debug-data":
|
||||
msg.warn(
|
||||
"The debug-data command is now available via the 'debug data' "
|
||||
"subcommand (without the hyphen). You can run python -m spacy debug "
|
||||
"--help for an overview of the other available debugging commands."
|
||||
)
|
||||
overrides = parse_config_overrides(ctx.args)
|
||||
import_code(code_path)
|
||||
debug_data(
|
||||
train_path,
|
||||
dev_path,
|
||||
config_path,
|
||||
config_overrides=overrides,
|
||||
ignore_warnings=ignore_warnings,
|
||||
verbose=verbose,
|
||||
no_format=no_format,
|
||||
silent=False,
|
||||
)
|
||||
|
||||
|
||||
def debug_data(
|
||||
train_path: Path,
|
||||
dev_path: Path,
|
||||
config_path: Path,
|
||||
*,
|
||||
config_overrides: Dict[str, Any] = {},
|
||||
ignore_warnings: bool = False,
|
||||
verbose: bool = False,
|
||||
no_format: bool = True,
|
||||
silent: bool = True,
|
||||
):
|
||||
msg = Printer(
|
||||
no_print=silent, pretty=not no_format, ignore_warnings=ignore_warnings
|
||||
)
|
||||
# Make sure all files and paths exists if they are needed
|
||||
if not train_path.exists():
|
||||
msg.fail("Training data not found", train_path, exits=1)
|
||||
if not dev_path.exists():
|
||||
msg.fail("Development data not found", dev_path, exits=1)
|
||||
|
||||
if not config_path.exists():
|
||||
msg.fail("Config file not found", config_path, exists=1)
|
||||
with show_validation_error():
|
||||
config = util.load_config(
|
||||
config_path,
|
||||
create_objects=False,
|
||||
schema=ConfigSchema,
|
||||
overrides=config_overrides,
|
||||
)
|
||||
nlp = util.load_model_from_config(config["nlp"])
|
||||
lang = config["nlp"]["lang"]
|
||||
base_model = config["nlp"]["base_model"]
|
||||
pipeline = list(config["nlp"]["pipeline"].keys())
|
||||
tag_map_path = util.ensure_path(config["training"]["tag_map"])
|
||||
tag_map = {}
|
||||
if tag_map_path is not None:
|
||||
tag_map = srsly.read_json(tag_map_path)
|
||||
|
||||
# Initialize the model and pipeline
|
||||
pipeline = [p.strip() for p in pipeline.split(",")]
|
||||
if base_model:
|
||||
nlp = load_model(base_model)
|
||||
else:
|
||||
lang_cls = get_lang_class(lang)
|
||||
nlp = lang_cls()
|
||||
morph_rules_path = util.ensure_path(config["training"]["morph_rules"])
|
||||
morph_rules = {}
|
||||
if morph_rules_path is not None:
|
||||
morph_rules = srsly.read_json(morph_rules_path)
|
||||
# Replace tag map with provided mapping
|
||||
nlp.vocab.morphology.load_tag_map(tag_map)
|
||||
# Load morph rules
|
||||
nlp.vocab.morphology.load_morph_exceptions(morph_rules)
|
||||
|
||||
msg.divider("Data format validation")
|
||||
|
||||
# TODO: Validate data format using the JSON schema
|
||||
# TODO: update once the new format is ready
|
||||
# TODO: move validation to GoldCorpus in order to be able to load from dir
|
||||
msg.divider("Data file validation")
|
||||
|
||||
# Create the gold corpus to be able to better analyze data
|
||||
loading_train_error_message = ""
|
||||
loading_dev_error_message = ""
|
||||
with msg.loading("Loading corpus..."):
|
||||
corpus = GoldCorpus(train_path, dev_path)
|
||||
corpus = Corpus(train_path, dev_path)
|
||||
try:
|
||||
train_docs = list(corpus.train_docs(nlp))
|
||||
train_docs_unpreprocessed = list(
|
||||
corpus.train_docs_without_preprocessing(nlp)
|
||||
)
|
||||
train_dataset = list(corpus.train_dataset(nlp))
|
||||
except ValueError as e:
|
||||
loading_train_error_message = "Training data cannot be loaded: {}".format(
|
||||
str(e)
|
||||
)
|
||||
loading_train_error_message = f"Training data cannot be loaded: {e}"
|
||||
try:
|
||||
dev_docs = list(corpus.dev_docs(nlp))
|
||||
dev_dataset = list(corpus.dev_dataset(nlp))
|
||||
except ValueError as e:
|
||||
loading_dev_error_message = "Development data cannot be loaded: {}".format(
|
||||
str(e)
|
||||
)
|
||||
loading_dev_error_message = f"Development data cannot be loaded: {e}"
|
||||
if loading_train_error_message or loading_dev_error_message:
|
||||
if loading_train_error_message:
|
||||
msg.fail(loading_train_error_message)
|
||||
|
@ -107,82 +163,68 @@ def debug_data(
|
|||
sys.exit(1)
|
||||
msg.good("Corpus is loadable")
|
||||
|
||||
# Create all gold data here to avoid iterating over the train_docs constantly
|
||||
gold_train_data = _compile_gold(train_docs, pipeline, nlp)
|
||||
# Create all gold data here to avoid iterating over the train_dataset constantly
|
||||
gold_train_data = _compile_gold(train_dataset, pipeline, nlp, make_proj=True)
|
||||
gold_train_unpreprocessed_data = _compile_gold(
|
||||
train_docs_unpreprocessed, pipeline, nlp
|
||||
train_dataset, pipeline, nlp, make_proj=False
|
||||
)
|
||||
gold_dev_data = _compile_gold(dev_docs, pipeline, nlp)
|
||||
gold_dev_data = _compile_gold(dev_dataset, pipeline, nlp, make_proj=True)
|
||||
|
||||
train_texts = gold_train_data["texts"]
|
||||
dev_texts = gold_dev_data["texts"]
|
||||
|
||||
msg.divider("Training stats")
|
||||
msg.text("Training pipeline: {}".format(", ".join(pipeline)))
|
||||
msg.text(f"Training pipeline: {', '.join(pipeline)}")
|
||||
for pipe in [p for p in pipeline if p not in nlp.factories]:
|
||||
msg.fail("Pipeline component '{}' not available in factories".format(pipe))
|
||||
msg.fail(f"Pipeline component '{pipe}' not available in factories")
|
||||
if base_model:
|
||||
msg.text("Starting with base model '{}'".format(base_model))
|
||||
msg.text(f"Starting with base model '{base_model}'")
|
||||
else:
|
||||
msg.text("Starting with blank model '{}'".format(lang))
|
||||
msg.text("{} training docs".format(len(train_docs)))
|
||||
msg.text("{} evaluation docs".format(len(dev_docs)))
|
||||
msg.text(f"Starting with blank model '{lang}'")
|
||||
msg.text(f"{len(train_dataset)} training docs")
|
||||
msg.text(f"{len(dev_dataset)} evaluation docs")
|
||||
|
||||
if not len(dev_docs):
|
||||
if not len(gold_dev_data):
|
||||
msg.fail("No evaluation docs")
|
||||
overlap = len(train_texts.intersection(dev_texts))
|
||||
if overlap:
|
||||
msg.warn("{} training examples also in evaluation data".format(overlap))
|
||||
msg.warn(f"{overlap} training examples also in evaluation data")
|
||||
else:
|
||||
msg.good("No overlap between training and evaluation data")
|
||||
if not base_model and len(train_docs) < BLANK_MODEL_THRESHOLD:
|
||||
text = "Low number of examples to train from a blank model ({})".format(
|
||||
len(train_docs)
|
||||
if not base_model and len(train_dataset) < BLANK_MODEL_THRESHOLD:
|
||||
text = (
|
||||
f"Low number of examples to train from a blank model ({len(train_dataset)})"
|
||||
)
|
||||
if len(train_docs) < BLANK_MODEL_MIN_THRESHOLD:
|
||||
if len(train_dataset) < BLANK_MODEL_MIN_THRESHOLD:
|
||||
msg.fail(text)
|
||||
else:
|
||||
msg.warn(text)
|
||||
msg.text(
|
||||
"It's recommended to use at least {} examples (minimum {})".format(
|
||||
BLANK_MODEL_THRESHOLD, BLANK_MODEL_MIN_THRESHOLD
|
||||
),
|
||||
f"It's recommended to use at least {BLANK_MODEL_THRESHOLD} examples "
|
||||
f"(minimum {BLANK_MODEL_MIN_THRESHOLD})",
|
||||
show=verbose,
|
||||
)
|
||||
|
||||
msg.divider("Vocab & Vectors")
|
||||
n_words = gold_train_data["n_words"]
|
||||
msg.info(
|
||||
"{} total {} in the data ({} unique)".format(
|
||||
n_words, "word" if n_words == 1 else "words", len(gold_train_data["words"])
|
||||
)
|
||||
f"{n_words} total word(s) in the data ({len(gold_train_data['words'])} unique)"
|
||||
)
|
||||
if gold_train_data["n_misaligned_words"] > 0:
|
||||
msg.warn(
|
||||
"{} misaligned tokens in the training data".format(
|
||||
gold_train_data["n_misaligned_words"]
|
||||
)
|
||||
)
|
||||
n_misaligned = gold_train_data["n_misaligned_words"]
|
||||
msg.warn(f"{n_misaligned} misaligned tokens in the training data")
|
||||
if gold_dev_data["n_misaligned_words"] > 0:
|
||||
msg.warn(
|
||||
"{} misaligned tokens in the dev data".format(
|
||||
gold_dev_data["n_misaligned_words"]
|
||||
)
|
||||
)
|
||||
n_misaligned = gold_dev_data["n_misaligned_words"]
|
||||
msg.warn(f"{n_misaligned} misaligned tokens in the dev data")
|
||||
most_common_words = gold_train_data["words"].most_common(10)
|
||||
msg.text(
|
||||
"10 most common words: {}".format(
|
||||
_format_labels(most_common_words, counts=True)
|
||||
),
|
||||
f"10 most common words: {_format_labels(most_common_words, counts=True)}",
|
||||
show=verbose,
|
||||
)
|
||||
if len(nlp.vocab.vectors):
|
||||
msg.info(
|
||||
"{} vectors ({} unique keys, {} dimensions)".format(
|
||||
len(nlp.vocab.vectors),
|
||||
nlp.vocab.vectors.n_keys,
|
||||
nlp.vocab.vectors_length,
|
||||
)
|
||||
f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} "
|
||||
f"unique keys, {nlp.vocab.vectors_length} dimensions)"
|
||||
)
|
||||
n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
|
||||
msg.warn(
|
||||
|
@ -205,7 +247,7 @@ def debug_data(
|
|||
if "ner" in pipeline:
|
||||
# Get all unique NER labels present in the data
|
||||
labels = set(
|
||||
label for label in gold_train_data["ner"] if label not in ("O", "-")
|
||||
label for label in gold_train_data["ner"] if label not in ("O", "-", None)
|
||||
)
|
||||
label_counts = gold_train_data["ner"]
|
||||
model_labels = _get_labels_from_model(nlp, "ner")
|
||||
|
@ -218,19 +260,10 @@ def debug_data(
|
|||
|
||||
msg.divider("Named Entity Recognition")
|
||||
msg.info(
|
||||
"{} new {}, {} existing {}".format(
|
||||
len(new_labels),
|
||||
"label" if len(new_labels) == 1 else "labels",
|
||||
len(existing_labels),
|
||||
"label" if len(existing_labels) == 1 else "labels",
|
||||
)
|
||||
f"{len(new_labels)} new label(s), {len(existing_labels)} existing label(s)"
|
||||
)
|
||||
missing_values = label_counts["-"]
|
||||
msg.text(
|
||||
"{} missing {} (tokens with '-' label)".format(
|
||||
missing_values, "value" if missing_values == 1 else "values"
|
||||
)
|
||||
)
|
||||
msg.text(f"{missing_values} missing value(s) (tokens with '-' label)")
|
||||
for label in new_labels:
|
||||
if len(label) == 0:
|
||||
msg.fail("Empty label found in new labels")
|
||||
|
@ -241,43 +274,28 @@ def debug_data(
|
|||
if label != "-"
|
||||
]
|
||||
labels_with_counts = _format_labels(labels_with_counts, counts=True)
|
||||
msg.text("New: {}".format(labels_with_counts), show=verbose)
|
||||
msg.text(f"New: {labels_with_counts}", show=verbose)
|
||||
if existing_labels:
|
||||
msg.text(
|
||||
"Existing: {}".format(_format_labels(existing_labels)), show=verbose
|
||||
)
|
||||
|
||||
msg.text(f"Existing: {_format_labels(existing_labels)}", show=verbose)
|
||||
if gold_train_data["ws_ents"]:
|
||||
msg.fail(
|
||||
"{} invalid whitespace entity span(s)".format(
|
||||
gold_train_data["ws_ents"]
|
||||
)
|
||||
)
|
||||
msg.fail(f"{gold_train_data['ws_ents']} invalid whitespace entity spans")
|
||||
has_ws_ents_error = True
|
||||
|
||||
if gold_train_data["punct_ents"]:
|
||||
msg.warn(
|
||||
"{} entity span(s) with punctuation".format(
|
||||
gold_train_data["punct_ents"]
|
||||
)
|
||||
)
|
||||
msg.warn(f"{gold_train_data['punct_ents']} entity span(s) with punctuation")
|
||||
has_punct_ents_warning = True
|
||||
|
||||
for label in new_labels:
|
||||
if label_counts[label] <= NEW_LABEL_THRESHOLD:
|
||||
msg.warn(
|
||||
"Low number of examples for new label '{}' ({})".format(
|
||||
label, label_counts[label]
|
||||
)
|
||||
f"Low number of examples for new label '{label}' ({label_counts[label]})"
|
||||
)
|
||||
has_low_data_warning = True
|
||||
|
||||
with msg.loading("Analyzing label distribution..."):
|
||||
neg_docs = _get_examples_without_label(train_docs, label)
|
||||
neg_docs = _get_examples_without_label(train_dataset, label)
|
||||
if neg_docs == 0:
|
||||
msg.warn(
|
||||
"No examples for texts WITHOUT new label '{}'".format(label)
|
||||
)
|
||||
msg.warn(f"No examples for texts WITHOUT new label '{label}'")
|
||||
has_no_neg_warning = True
|
||||
|
||||
if not has_low_data_warning:
|
||||
|
@ -291,8 +309,8 @@ def debug_data(
|
|||
|
||||
if has_low_data_warning:
|
||||
msg.text(
|
||||
"To train a new entity type, your data should include at "
|
||||
"least {} instances of the new label".format(NEW_LABEL_THRESHOLD),
|
||||
f"To train a new entity type, your data should include at "
|
||||
f"least {NEW_LABEL_THRESHOLD} instances of the new label",
|
||||
show=verbose,
|
||||
)
|
||||
if has_no_neg_warning:
|
||||
|
@ -321,27 +339,21 @@ def debug_data(
|
|||
new_labels = [l for l in labels if l not in model_labels]
|
||||
existing_labels = [l for l in labels if l in model_labels]
|
||||
msg.info(
|
||||
"Text Classification: {} new label(s), {} existing label(s)".format(
|
||||
len(new_labels), len(existing_labels)
|
||||
)
|
||||
f"Text Classification: {len(new_labels)} new label(s), "
|
||||
f"{len(existing_labels)} existing label(s)"
|
||||
)
|
||||
if new_labels:
|
||||
labels_with_counts = _format_labels(
|
||||
gold_train_data["cats"].most_common(), counts=True
|
||||
)
|
||||
msg.text("New: {}".format(labels_with_counts), show=verbose)
|
||||
msg.text(f"New: {labels_with_counts}", show=verbose)
|
||||
if existing_labels:
|
||||
msg.text(
|
||||
"Existing: {}".format(_format_labels(existing_labels)), show=verbose
|
||||
)
|
||||
msg.text(f"Existing: {_format_labels(existing_labels)}", show=verbose)
|
||||
if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
|
||||
msg.fail(
|
||||
"The train and dev labels are not the same. "
|
||||
"Train labels: {}. "
|
||||
"Dev labels: {}.".format(
|
||||
_format_labels(gold_train_data["cats"]),
|
||||
_format_labels(gold_dev_data["cats"]),
|
||||
)
|
||||
f"The train and dev labels are not the same. "
|
||||
f"Train labels: {_format_labels(gold_train_data['cats'])}. "
|
||||
f"Dev labels: {_format_labels(gold_dev_data['cats'])}."
|
||||
)
|
||||
if gold_train_data["n_cats_multilabel"] > 0:
|
||||
msg.info(
|
||||
|
@ -371,27 +383,16 @@ def debug_data(
|
|||
msg.divider("Part-of-speech Tagging")
|
||||
labels = [label for label in gold_train_data["tags"]]
|
||||
tag_map = nlp.vocab.morphology.tag_map
|
||||
msg.info(
|
||||
"{} {} in data ({} {} in tag map)".format(
|
||||
len(labels),
|
||||
"label" if len(labels) == 1 else "labels",
|
||||
len(tag_map),
|
||||
"label" if len(tag_map) == 1 else "labels",
|
||||
)
|
||||
)
|
||||
msg.info(f"{len(labels)} label(s) in data ({len(tag_map)} label(s) in tag map)")
|
||||
labels_with_counts = _format_labels(
|
||||
gold_train_data["tags"].most_common(), counts=True
|
||||
)
|
||||
msg.text(labels_with_counts, show=verbose)
|
||||
non_tagmap = [l for l in labels if l not in tag_map]
|
||||
if not non_tagmap:
|
||||
msg.good("All labels present in tag map for language '{}'".format(nlp.lang))
|
||||
msg.good(f"All labels present in tag map for language '{nlp.lang}'")
|
||||
for label in non_tagmap:
|
||||
msg.fail(
|
||||
"Label '{}' not found in tag map for language '{}'".format(
|
||||
label, nlp.lang
|
||||
)
|
||||
)
|
||||
msg.fail(f"Label '{label}' not found in tag map for language '{nlp.lang}'")
|
||||
|
||||
if "parser" in pipeline:
|
||||
has_low_data_warning = False
|
||||
|
@ -399,21 +400,18 @@ def debug_data(
|
|||
|
||||
# profile sentence length
|
||||
msg.info(
|
||||
"Found {} sentence{} with an average length of {:.1f} words.".format(
|
||||
gold_train_data["n_sents"],
|
||||
"s" if len(train_docs) > 1 else "",
|
||||
gold_train_data["n_words"] / gold_train_data["n_sents"],
|
||||
)
|
||||
f"Found {gold_train_data['n_sents']} sentence(s) with an average "
|
||||
f"length of {gold_train_data['n_words'] / gold_train_data['n_sents']:.1f} words."
|
||||
)
|
||||
|
||||
# check for documents with multiple sentences
|
||||
sents_per_doc = gold_train_data["n_sents"] / len(gold_train_data["texts"])
|
||||
if sents_per_doc < 1.1:
|
||||
msg.warn(
|
||||
"The training data contains {:.2f} sentences per "
|
||||
"document. When there are very few documents containing more "
|
||||
"than one sentence, the parser will not learn how to segment "
|
||||
"longer texts into sentences.".format(sents_per_doc)
|
||||
f"The training data contains {sents_per_doc:.2f} sentences per "
|
||||
f"document. When there are very few documents containing more "
|
||||
f"than one sentence, the parser will not learn how to segment "
|
||||
f"longer texts into sentences."
|
||||
)
|
||||
|
||||
# profile labels
|
||||
|
@ -424,32 +422,13 @@ def debug_data(
|
|||
labels_dev = [label for label in gold_dev_data["deps"]]
|
||||
|
||||
if gold_train_unpreprocessed_data["n_nonproj"] > 0:
|
||||
msg.info(
|
||||
"Found {} nonprojective train sentence{}".format(
|
||||
gold_train_unpreprocessed_data["n_nonproj"],
|
||||
"s" if gold_train_unpreprocessed_data["n_nonproj"] > 1 else "",
|
||||
)
|
||||
)
|
||||
n_nonproj = gold_train_unpreprocessed_data["n_nonproj"]
|
||||
msg.info(f"Found {n_nonproj} nonprojective train sentence(s)")
|
||||
if gold_dev_data["n_nonproj"] > 0:
|
||||
msg.info(
|
||||
"Found {} nonprojective dev sentence{}".format(
|
||||
gold_dev_data["n_nonproj"],
|
||||
"s" if gold_dev_data["n_nonproj"] > 1 else "",
|
||||
)
|
||||
)
|
||||
|
||||
msg.info(
|
||||
"{} {} in train data".format(
|
||||
len(labels_train_unpreprocessed),
|
||||
"label" if len(labels_train) == 1 else "labels",
|
||||
)
|
||||
)
|
||||
msg.info(
|
||||
"{} {} in projectivized train data".format(
|
||||
len(labels_train), "label" if len(labels_train) == 1 else "labels"
|
||||
)
|
||||
)
|
||||
|
||||
n_nonproj = gold_dev_data["n_nonproj"]
|
||||
msg.info(f"Found {n_nonproj} nonprojective dev sentence(s)")
|
||||
msg.info(f"{len(labels_train_unpreprocessed)} label(s) in train data")
|
||||
msg.info(f"{len(labels_train)} label(s) in projectivized train data")
|
||||
labels_with_counts = _format_labels(
|
||||
gold_train_unpreprocessed_data["deps"].most_common(), counts=True
|
||||
)
|
||||
|
@ -459,9 +438,8 @@ def debug_data(
|
|||
for label in gold_train_unpreprocessed_data["deps"]:
|
||||
if gold_train_unpreprocessed_data["deps"][label] <= DEP_LABEL_THRESHOLD:
|
||||
msg.warn(
|
||||
"Low number of examples for label '{}' ({})".format(
|
||||
label, gold_train_unpreprocessed_data["deps"][label]
|
||||
)
|
||||
f"Low number of examples for label '{label}' "
|
||||
f"({gold_train_unpreprocessed_data['deps'][label]})"
|
||||
)
|
||||
has_low_data_warning = True
|
||||
|
||||
|
@ -470,22 +448,19 @@ def debug_data(
|
|||
for label in gold_train_data["deps"]:
|
||||
if gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD and "||" in label:
|
||||
rare_projectivized_labels.append(
|
||||
"{}: {}".format(label, str(gold_train_data["deps"][label]))
|
||||
f"{label}: {gold_train_data['deps'][label]}"
|
||||
)
|
||||
|
||||
if len(rare_projectivized_labels) > 0:
|
||||
msg.warn(
|
||||
"Low number of examples for {} label{} in the "
|
||||
"projectivized dependency trees used for training. You may "
|
||||
"want to projectivize labels such as punct before "
|
||||
"training in order to improve parser performance.".format(
|
||||
len(rare_projectivized_labels),
|
||||
"s" if len(rare_projectivized_labels) > 1 else "",
|
||||
)
|
||||
f"Low number of examples for {len(rare_projectivized_labels)} "
|
||||
"label(s) in the projectivized dependency trees used for "
|
||||
"training. You may want to projectivize labels such as punct "
|
||||
"before training in order to improve parser performance."
|
||||
)
|
||||
msg.warn(
|
||||
"Projectivized labels with low numbers of examples: "
|
||||
"{}".format("\n".join(rare_projectivized_labels)),
|
||||
f"Projectivized labels with low numbers of examples: ",
|
||||
", ".join(rare_projectivized_labels),
|
||||
show=verbose,
|
||||
)
|
||||
has_low_data_warning = True
|
||||
|
@ -493,50 +468,44 @@ def debug_data(
|
|||
# labels only in train
|
||||
if set(labels_train) - set(labels_dev):
|
||||
msg.warn(
|
||||
"The following labels were found only in the train data: "
|
||||
"{}".format(", ".join(set(labels_train) - set(labels_dev))),
|
||||
"The following labels were found only in the train data:",
|
||||
", ".join(set(labels_train) - set(labels_dev)),
|
||||
show=verbose,
|
||||
)
|
||||
|
||||
# labels only in dev
|
||||
if set(labels_dev) - set(labels_train):
|
||||
msg.warn(
|
||||
"The following labels were found only in the dev data: "
|
||||
+ ", ".join(set(labels_dev) - set(labels_train)),
|
||||
"The following labels were found only in the dev data:",
|
||||
", ".join(set(labels_dev) - set(labels_train)),
|
||||
show=verbose,
|
||||
)
|
||||
|
||||
if has_low_data_warning:
|
||||
msg.text(
|
||||
"To train a parser, your data should include at "
|
||||
"least {} instances of each label.".format(DEP_LABEL_THRESHOLD),
|
||||
f"To train a parser, your data should include at "
|
||||
f"least {DEP_LABEL_THRESHOLD} instances of each label.",
|
||||
show=verbose,
|
||||
)
|
||||
|
||||
# multiple root labels
|
||||
if len(gold_train_unpreprocessed_data["roots"]) > 1:
|
||||
msg.warn(
|
||||
"Multiple root labels ({}) ".format(
|
||||
", ".join(gold_train_unpreprocessed_data["roots"])
|
||||
)
|
||||
+ "found in training data. spaCy's parser uses a single root "
|
||||
"label ROOT so this distinction will not be available."
|
||||
f"Multiple root labels "
|
||||
f"({', '.join(gold_train_unpreprocessed_data['roots'])}) "
|
||||
f"found in training data. spaCy's parser uses a single root "
|
||||
f"label ROOT so this distinction will not be available."
|
||||
)
|
||||
|
||||
# these should not happen, but just in case
|
||||
if gold_train_data["n_nonproj"] > 0:
|
||||
msg.fail(
|
||||
"Found {} nonprojective projectivized train sentence{}".format(
|
||||
gold_train_data["n_nonproj"],
|
||||
"s" if gold_train_data["n_nonproj"] > 1 else "",
|
||||
)
|
||||
f"Found {gold_train_data['n_nonproj']} nonprojective "
|
||||
f"projectivized train sentence(s)"
|
||||
)
|
||||
if gold_train_data["n_cycles"] > 0:
|
||||
msg.fail(
|
||||
"Found {} projectivized train sentence{} with cycles".format(
|
||||
gold_train_data["n_cycles"],
|
||||
"s" if gold_train_data["n_cycles"] > 1 else "",
|
||||
)
|
||||
f"Found {gold_train_data['n_cycles']} projectivized train sentence(s) with cycles"
|
||||
)
|
||||
|
||||
msg.divider("Summary")
|
||||
|
@ -544,42 +513,36 @@ def debug_data(
|
|||
warn_counts = msg.counts[MESSAGES.WARN]
|
||||
fail_counts = msg.counts[MESSAGES.FAIL]
|
||||
if good_counts:
|
||||
msg.good(
|
||||
"{} {} passed".format(
|
||||
good_counts, "check" if good_counts == 1 else "checks"
|
||||
)
|
||||
)
|
||||
msg.good(f"{good_counts} {'check' if good_counts == 1 else 'checks'} passed")
|
||||
if warn_counts:
|
||||
msg.warn(
|
||||
"{} {}".format(warn_counts, "warning" if warn_counts == 1 else "warnings")
|
||||
)
|
||||
if fail_counts:
|
||||
msg.fail("{} {}".format(fail_counts, "error" if fail_counts == 1 else "errors"))
|
||||
|
||||
msg.warn(f"{warn_counts} {'warning' if warn_counts == 1 else 'warnings'}")
|
||||
if fail_counts:
|
||||
msg.fail(f"{fail_counts} {'error' if fail_counts == 1 else 'errors'}")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def _load_file(file_path, msg):
|
||||
def _load_file(file_path: Path, msg: Printer) -> None:
|
||||
file_name = file_path.parts[-1]
|
||||
if file_path.suffix == ".json":
|
||||
with msg.loading("Loading {}...".format(file_name)):
|
||||
with msg.loading(f"Loading {file_name}..."):
|
||||
data = srsly.read_json(file_path)
|
||||
msg.good("Loaded {}".format(file_name))
|
||||
msg.good(f"Loaded {file_name}")
|
||||
return data
|
||||
elif file_path.suffix == ".jsonl":
|
||||
with msg.loading("Loading {}...".format(file_name)):
|
||||
with msg.loading(f"Loading {file_name}..."):
|
||||
data = srsly.read_jsonl(file_path)
|
||||
msg.good("Loaded {}".format(file_name))
|
||||
msg.good(f"Loaded {file_name}")
|
||||
return data
|
||||
msg.fail(
|
||||
"Can't load file extension {}".format(file_path.suffix),
|
||||
f"Can't load file extension {file_path.suffix}",
|
||||
"Expected .json or .jsonl",
|
||||
exits=1,
|
||||
)
|
||||
|
||||
|
||||
def _compile_gold(train_docs, pipeline, nlp):
|
||||
def _compile_gold(
|
||||
examples: Sequence[Example], pipeline: List[str], nlp: Language, make_proj: bool
|
||||
) -> Dict[str, Any]:
|
||||
data = {
|
||||
"ner": Counter(),
|
||||
"cats": Counter(),
|
||||
|
@ -598,18 +561,20 @@ def _compile_gold(train_docs, pipeline, nlp):
|
|||
"n_cats_multilabel": 0,
|
||||
"texts": set(),
|
||||
}
|
||||
for doc, gold in train_docs:
|
||||
valid_words = [x for x in gold.words if x is not None]
|
||||
for eg in examples:
|
||||
gold = eg.reference
|
||||
doc = eg.predicted
|
||||
valid_words = [x for x in gold if x is not None]
|
||||
data["words"].update(valid_words)
|
||||
data["n_words"] += len(valid_words)
|
||||
data["n_misaligned_words"] += len(gold.words) - len(valid_words)
|
||||
data["n_misaligned_words"] += len(gold) - len(valid_words)
|
||||
data["texts"].add(doc.text)
|
||||
if len(nlp.vocab.vectors):
|
||||
for word in valid_words:
|
||||
if nlp.vocab.strings[word] not in nlp.vocab.vectors:
|
||||
data["words_missing_vectors"].update([word])
|
||||
if "ner" in pipeline:
|
||||
for i, label in enumerate(gold.ner):
|
||||
for i, label in enumerate(eg.get_aligned_ner()):
|
||||
if label is None:
|
||||
continue
|
||||
if label.startswith(("B-", "U-", "L-")) and doc[i].is_space:
|
||||
|
@ -635,40 +600,42 @@ def _compile_gold(train_docs, pipeline, nlp):
|
|||
if list(gold.cats.values()).count(1.0) != 1:
|
||||
data["n_cats_multilabel"] += 1
|
||||
if "tagger" in pipeline:
|
||||
data["tags"].update([x for x in gold.tags if x is not None])
|
||||
tags = eg.get_aligned("TAG", as_string=True)
|
||||
data["tags"].update([x for x in tags if x is not None])
|
||||
if "parser" in pipeline:
|
||||
data["deps"].update([x for x in gold.labels if x is not None])
|
||||
for i, (dep, head) in enumerate(zip(gold.labels, gold.heads)):
|
||||
aligned_heads, aligned_deps = eg.get_aligned_parse(projectivize=make_proj)
|
||||
data["deps"].update([x for x in aligned_deps if x is not None])
|
||||
for i, (dep, head) in enumerate(zip(aligned_deps, aligned_heads)):
|
||||
if head == i:
|
||||
data["roots"].update([dep])
|
||||
data["n_sents"] += 1
|
||||
if nonproj.is_nonproj_tree(gold.heads):
|
||||
if nonproj.is_nonproj_tree(aligned_heads):
|
||||
data["n_nonproj"] += 1
|
||||
if nonproj.contains_cycle(gold.heads):
|
||||
if nonproj.contains_cycle(aligned_heads):
|
||||
data["n_cycles"] += 1
|
||||
return data
|
||||
|
||||
|
||||
def _format_labels(labels, counts=False):
|
||||
def _format_labels(labels: List[Tuple[str, int]], counts: bool = False) -> str:
|
||||
if counts:
|
||||
return ", ".join(["'{}' ({})".format(l, c) for l, c in labels])
|
||||
return ", ".join(["'{}'".format(l) for l in labels])
|
||||
return ", ".join([f"'{l}' ({c})" for l, c in labels])
|
||||
return ", ".join([f"'{l}'" for l in labels])
|
||||
|
||||
|
||||
def _get_examples_without_label(data, label):
|
||||
def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
|
||||
count = 0
|
||||
for doc, gold in data:
|
||||
for eg in data:
|
||||
labels = [
|
||||
label.split("-")[1]
|
||||
for label in gold.ner
|
||||
if label is not None and label not in ("O", "-")
|
||||
for label in eg.get_aligned_ner()
|
||||
if label not in ("O", "-", None)
|
||||
]
|
||||
if label not in labels:
|
||||
count += 1
|
||||
return count
|
||||
|
||||
|
||||
def _get_labels_from_model(nlp, pipe_name):
|
||||
def _get_labels_from_model(nlp: Language, pipe_name: str) -> Sequence[str]:
|
||||
if pipe_name not in nlp.pipe_names:
|
||||
return set()
|
||||
pipe = nlp.get_pipe(pipe_name)
|
||||
|
|
168
spacy/cli/debug_model.py
Normal file
168
spacy/cli/debug_model.py
Normal file
|
@ -0,0 +1,168 @@
|
|||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
|
||||
|
||||
from ._util import Arg, Opt, debug_cli
|
||||
from .. import util
|
||||
from ..lang.en import English
|
||||
|
||||
|
||||
@debug_cli.command("model")
|
||||
def debug_model_cli(
|
||||
# fmt: off
|
||||
config_path: Path = Arg(..., help="Path to config file", exists=True),
|
||||
layers: str = Opt("", "--layers", "-l", help="Comma-separated names of pipeline components to train"),
|
||||
dimensions: bool = Opt(False, "--dimensions", "-DIM", help="Show dimensions"),
|
||||
parameters: bool = Opt(False, "--parameters", "-PAR", help="Show parameters"),
|
||||
gradients: bool = Opt(False, "--gradients", "-GRAD", help="Show gradients"),
|
||||
attributes: bool = Opt(False, "--attributes", "-ATTR", help="Show attributes"),
|
||||
P0: bool = Opt(False, "--print-step0", "-P0", help="Print model before training"),
|
||||
P1: bool = Opt(False, "--print-step1", "-P1", help="Print model after initialization"),
|
||||
P2: bool = Opt(False, "--print-step2", "-P2", help="Print model after training"),
|
||||
P3: bool = Opt(True, "--print-step3", "-P3", help="Print final predictions"),
|
||||
use_gpu: int = Opt(-1, "--use-gpu", "-g", help="Use GPU"),
|
||||
seed: int = Opt(None, "--seed", "-s", help="Use GPU"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Analyze a Thinc model implementation. Includes checks for internal structure
|
||||
and activations during training.
|
||||
"""
|
||||
print_settings = {
|
||||
"dimensions": dimensions,
|
||||
"parameters": parameters,
|
||||
"gradients": gradients,
|
||||
"attributes": attributes,
|
||||
"layers": [int(x.strip()) for x in layers.split(",")] if layers else [],
|
||||
"print_before_training": P0,
|
||||
"print_after_init": P1,
|
||||
"print_after_training": P2,
|
||||
"print_prediction": P3,
|
||||
}
|
||||
|
||||
if seed is not None:
|
||||
msg.info(f"Fixing random seed: {seed}")
|
||||
fix_random_seed(seed)
|
||||
if use_gpu >= 0:
|
||||
msg.info(f"Using GPU: {use_gpu}")
|
||||
require_gpu(use_gpu)
|
||||
else:
|
||||
msg.info(f"Using CPU")
|
||||
|
||||
debug_model(
|
||||
config_path, print_settings=print_settings,
|
||||
)
|
||||
|
||||
|
||||
def debug_model(config_path: Path, *, print_settings=None):
|
||||
if print_settings is None:
|
||||
print_settings = {}
|
||||
|
||||
model = util.load_config(config_path, create_objects=True)["model"]
|
||||
|
||||
# STEP 0: Printing before training
|
||||
msg.info(f"Analysing model with ID {model.id}")
|
||||
if print_settings.get("print_before_training"):
|
||||
msg.info(f"Before training:")
|
||||
_print_model(model, print_settings)
|
||||
|
||||
# STEP 1: Initializing the model and printing again
|
||||
model.initialize(X=_get_docs(), Y=_get_output(model.ops.xp))
|
||||
if print_settings.get("print_after_init"):
|
||||
msg.info(f"After initialization:")
|
||||
_print_model(model, print_settings)
|
||||
|
||||
# STEP 2: Updating the model and printing again
|
||||
optimizer = Adam(0.001)
|
||||
set_dropout_rate(model, 0.2)
|
||||
for e in range(3):
|
||||
Y, get_dX = model.begin_update(_get_docs())
|
||||
dY = get_gradient(model, Y)
|
||||
get_dX(dY)
|
||||
model.finish_update(optimizer)
|
||||
if print_settings.get("print_after_training"):
|
||||
msg.info(f"After training:")
|
||||
_print_model(model, print_settings)
|
||||
|
||||
# STEP 3: the final prediction
|
||||
prediction = model.predict(_get_docs())
|
||||
if print_settings.get("print_prediction"):
|
||||
msg.info(f"Prediction:", str(prediction))
|
||||
|
||||
|
||||
def get_gradient(model, Y):
|
||||
goldY = _get_output(model.ops.xp)
|
||||
return Y - goldY
|
||||
|
||||
|
||||
def _sentences():
|
||||
return [
|
||||
"Apple is looking at buying U.K. startup for $1 billion",
|
||||
"Autonomous cars shift insurance liability toward manufacturers",
|
||||
"San Francisco considers banning sidewalk delivery robots",
|
||||
"London is a big city in the United Kingdom.",
|
||||
]
|
||||
|
||||
|
||||
def _get_docs():
|
||||
nlp = English()
|
||||
return list(nlp.pipe(_sentences()))
|
||||
|
||||
|
||||
def _get_output(xp):
|
||||
return xp.asarray(
|
||||
[
|
||||
xp.asarray([i + 10, i + 20, i + 30], dtype="float32")
|
||||
for i, _ in enumerate(_get_docs())
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def _print_model(model, print_settings):
|
||||
layers = print_settings.get("layers", "")
|
||||
parameters = print_settings.get("parameters", False)
|
||||
dimensions = print_settings.get("dimensions", False)
|
||||
gradients = print_settings.get("gradients", False)
|
||||
attributes = print_settings.get("attributes", False)
|
||||
|
||||
for i, node in enumerate(model.walk()):
|
||||
if not layers or i in layers:
|
||||
msg.info(f"Layer {i}: model ID {node.id}: '{node.name}'")
|
||||
|
||||
if dimensions:
|
||||
for name in node.dim_names:
|
||||
if node.has_dim(name):
|
||||
msg.info(f" - dim {name}: {node.get_dim(name)}")
|
||||
else:
|
||||
msg.info(f" - dim {name}: {node.has_dim(name)}")
|
||||
|
||||
if parameters:
|
||||
for name in node.param_names:
|
||||
if node.has_param(name):
|
||||
print_value = _print_matrix(node.get_param(name))
|
||||
msg.info(f" - param {name}: {print_value}")
|
||||
else:
|
||||
msg.info(f" - param {name}: {node.has_param(name)}")
|
||||
if gradients:
|
||||
for name in node.param_names:
|
||||
if node.has_grad(name):
|
||||
print_value = _print_matrix(node.get_grad(name))
|
||||
msg.info(f" - grad {name}: {print_value}")
|
||||
else:
|
||||
msg.info(f" - grad {name}: {node.has_grad(name)}")
|
||||
if attributes:
|
||||
attrs = node.attrs
|
||||
for name, value in attrs.items():
|
||||
msg.info(f" - attr {name}: {value}")
|
||||
|
||||
|
||||
def _print_matrix(value):
|
||||
if value is None or isinstance(value, bool):
|
||||
return value
|
||||
result = str(value.shape) + " - sample: "
|
||||
sample_matrix = value
|
||||
for d in range(value.ndim - 1):
|
||||
sample_matrix = sample_matrix[0]
|
||||
sample_matrix = sample_matrix[0:5]
|
||||
result = result + str(sample_matrix)
|
||||
return result
|
|
@ -1,30 +1,54 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
from typing import Optional, Sequence
|
||||
import requests
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from wasabi import msg
|
||||
import typer
|
||||
|
||||
from .link import link
|
||||
from ..util import get_package_path
|
||||
from ._util import app, Arg, Opt
|
||||
from .. import about
|
||||
from ..util import is_package, get_base_version, run_command
|
||||
|
||||
# These are the old shortcuts we previously supported in spacy download. As of
|
||||
# v3, shortcuts are deprecated so we're not expecting to add anything to this
|
||||
# list. It only exists to show users warnings.
|
||||
OLD_SHORTCUTS = {
|
||||
"en": "en_core_web_sm",
|
||||
"de": "de_core_news_sm",
|
||||
"es": "es_core_news_sm",
|
||||
"pt": "pt_core_news_sm",
|
||||
"fr": "fr_core_news_sm",
|
||||
"it": "it_core_news_sm",
|
||||
"nl": "nl_core_news_sm",
|
||||
"el": "el_core_news_sm",
|
||||
"nb": "nb_core_news_sm",
|
||||
"lt": "lt_core_news_sm",
|
||||
"xx": "xx_ent_wiki_sm",
|
||||
}
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
model=("Model to download (shortcut or name)", "positional", None, str),
|
||||
direct=("Force direct download of name + version", "flag", "d", bool),
|
||||
pip_args=("Additional arguments to be passed to `pip install` on model install"),
|
||||
@app.command(
|
||||
"download",
|
||||
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
||||
)
|
||||
def download(model, direct=False, *pip_args):
|
||||
def download_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context,
|
||||
model: str = Arg(..., help="Name of model to download"),
|
||||
direct: bool = Opt(False, "--direct", "-d", "-D", help="Force direct download of name + version"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Download compatible model from default download path using pip. Model
|
||||
can be shortcut, model name or, if --direct flag is set, full model name
|
||||
with version. For direct downloads, the compatibility check will be skipped.
|
||||
Download compatible model from default download path using pip. If --direct
|
||||
flag is set, the command expects the full model name with version.
|
||||
For direct downloads, the compatibility check will be skipped. All
|
||||
additional arguments provided to this command will be passed to `pip install`
|
||||
on model installation.
|
||||
"""
|
||||
if not require_package("spacy") and "--no-deps" not in pip_args:
|
||||
download(model, direct, *ctx.args)
|
||||
|
||||
|
||||
def download(model: str, direct: bool = False, *pip_args) -> None:
|
||||
if not is_package("spacy") and "--no-deps" not in pip_args:
|
||||
msg.warn(
|
||||
"Skipping model package dependencies and setting `--no-deps`. "
|
||||
"You don't seem to have the spaCy package itself installed "
|
||||
|
@ -39,97 +63,59 @@ def download(model, direct=False, *pip_args):
|
|||
components = model.split("-")
|
||||
model_name = "".join(components[:-1])
|
||||
version = components[-1]
|
||||
dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args)
|
||||
download_model(dl_tpl.format(m=model_name, v=version), pip_args)
|
||||
else:
|
||||
shortcuts = get_json(about.__shortcuts__, "available shortcuts")
|
||||
model_name = shortcuts.get(model, model)
|
||||
model_name = model
|
||||
if model in OLD_SHORTCUTS:
|
||||
msg.warn(
|
||||
f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. "
|
||||
f"Please use the full model name '{OLD_SHORTCUTS[model]}' instead."
|
||||
)
|
||||
model_name = OLD_SHORTCUTS[model]
|
||||
compatibility = get_compatibility()
|
||||
version = get_version(model_name, compatibility)
|
||||
dl = download_model(dl_tpl.format(m=model_name, v=version), pip_args)
|
||||
if dl != 0: # if download subprocess doesn't return 0, exit
|
||||
sys.exit(dl)
|
||||
msg.good(
|
||||
"Download and installation successful",
|
||||
"You can now load the model via spacy.load('{}')".format(model_name),
|
||||
)
|
||||
# Only create symlink if the model is installed via a shortcut like 'en'.
|
||||
# There's no real advantage over an additional symlink for en_core_web_sm
|
||||
# and if anything, it's more error prone and causes more confusion.
|
||||
if model in shortcuts:
|
||||
try:
|
||||
# Get package path here because link uses
|
||||
# pip.get_installed_distributions() to check if model is a
|
||||
# package, which fails if model was just installed via
|
||||
# subprocess
|
||||
package_path = get_package_path(model_name)
|
||||
link(model_name, model, force=True, model_path=package_path)
|
||||
except: # noqa: E722
|
||||
# Dirty, but since spacy.download and the auto-linking is
|
||||
# mostly a convenience wrapper, it's best to show a success
|
||||
# message and loading instructions, even if linking fails.
|
||||
msg.warn(
|
||||
"Download successful but linking failed",
|
||||
"Creating a shortcut link for '{}' didn't work (maybe you "
|
||||
"don't have admin permissions?), but you can still load "
|
||||
"the model via its full package name: "
|
||||
"nlp = spacy.load('{}')".format(model, model_name),
|
||||
)
|
||||
# If a model is downloaded and then loaded within the same process, our
|
||||
# is_package check currently fails, because pkg_resources.working_set
|
||||
# is not refreshed automatically (see #3923). We're trying to work
|
||||
# around this here be requiring the package explicitly.
|
||||
require_package(model_name)
|
||||
download_model(dl_tpl.format(m=model_name, v=version), pip_args)
|
||||
msg.good(
|
||||
"Download and installation successful",
|
||||
f"You can now load the model via spacy.load('{model_name}')",
|
||||
)
|
||||
|
||||
|
||||
def require_package(name):
|
||||
try:
|
||||
import pkg_resources
|
||||
|
||||
pkg_resources.working_set.require(name)
|
||||
return True
|
||||
except: # noqa: E722
|
||||
return False
|
||||
|
||||
|
||||
def get_json(url, desc):
|
||||
r = requests.get(url)
|
||||
def get_compatibility() -> dict:
|
||||
version = get_base_version(about.__version__)
|
||||
r = requests.get(about.__compatibility__)
|
||||
if r.status_code != 200:
|
||||
msg.fail(
|
||||
"Server error ({})".format(r.status_code),
|
||||
"Couldn't fetch {}. Please find a model for your spaCy "
|
||||
"installation (v{}), and download it manually. For more "
|
||||
"details, see the documentation: "
|
||||
"https://spacy.io/usage/models".format(desc, about.__version__),
|
||||
f"Server error ({r.status_code})",
|
||||
f"Couldn't fetch compatibility table. Please find a model for your spaCy "
|
||||
f"installation (v{about.__version__}), and download it manually. "
|
||||
f"For more details, see the documentation: "
|
||||
f"https://spacy.io/usage/models",
|
||||
exits=1,
|
||||
)
|
||||
return r.json()
|
||||
|
||||
|
||||
def get_compatibility():
|
||||
version = about.__version__
|
||||
version = version.rsplit(".dev", 1)[0]
|
||||
comp_table = get_json(about.__compatibility__, "compatibility table")
|
||||
comp_table = r.json()
|
||||
comp = comp_table["spacy"]
|
||||
if version not in comp:
|
||||
msg.fail("No compatible models found for v{} of spaCy".format(version), exits=1)
|
||||
msg.fail(f"No compatible models found for v{version} of spaCy", exits=1)
|
||||
return comp[version]
|
||||
|
||||
|
||||
def get_version(model, comp):
|
||||
model = model.rsplit(".dev", 1)[0]
|
||||
def get_version(model: str, comp: dict) -> str:
|
||||
model = get_base_version(model)
|
||||
if model not in comp:
|
||||
msg.fail(
|
||||
"No compatible model found for '{}' "
|
||||
"(spaCy v{}).".format(model, about.__version__),
|
||||
f"No compatible model found for '{model}' (spaCy v{about.__version__})",
|
||||
exits=1,
|
||||
)
|
||||
return comp[model][0]
|
||||
|
||||
|
||||
def download_model(filename, user_pip_args=None):
|
||||
def download_model(
|
||||
filename: str, user_pip_args: Optional[Sequence[str]] = None
|
||||
) -> None:
|
||||
download_url = about.__download_url__ + "/" + filename
|
||||
pip_args = ["--no-cache-dir"]
|
||||
if user_pip_args:
|
||||
pip_args.extend(user_pip_args)
|
||||
cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url]
|
||||
return subprocess.call(cmd, env=os.environ.copy())
|
||||
run_command(cmd)
|
||||
|
|
|
@ -1,74 +1,111 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals, division, print_function
|
||||
|
||||
import plac
|
||||
from typing import Optional, List, Dict
|
||||
from timeit import default_timer as timer
|
||||
from wasabi import msg
|
||||
from wasabi import Printer
|
||||
from pathlib import Path
|
||||
import re
|
||||
import srsly
|
||||
from thinc.api import require_gpu, fix_random_seed
|
||||
|
||||
from ..gold import GoldCorpus
|
||||
from ..gold import Corpus
|
||||
from ..tokens import Doc
|
||||
from ._util import app, Arg, Opt
|
||||
from ..scorer import Scorer
|
||||
from .. import util
|
||||
from .. import displacy
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
model=("Model name or path", "positional", None, str),
|
||||
data_path=("Location of JSON-formatted evaluation data", "positional", None, str),
|
||||
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
||||
gpu_id=("Use GPU", "option", "g", int),
|
||||
displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str),
|
||||
displacy_limit=("Limit of parses to render as HTML", "option", "dl", int),
|
||||
return_scores=("Return dict containing model scores", "flag", "R", bool),
|
||||
)
|
||||
def evaluate(
|
||||
model,
|
||||
data_path,
|
||||
gpu_id=-1,
|
||||
gold_preproc=False,
|
||||
displacy_path=None,
|
||||
displacy_limit=25,
|
||||
return_scores=False,
|
||||
@app.command("evaluate")
|
||||
def evaluate_cli(
|
||||
# fmt: off
|
||||
model: str = Arg(..., help="Model name or path"),
|
||||
data_path: Path = Arg(..., help="Location of JSON-formatted evaluation data", exists=True),
|
||||
output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False),
|
||||
gpu_id: int = Opt(-1, "--gpu-id", "-g", help="Use GPU"),
|
||||
gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
|
||||
displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
|
||||
displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Evaluate a model. To render a sample of parses in a HTML file, set an
|
||||
output directory as the displacy_path argument.
|
||||
"""
|
||||
util.fix_random_seed()
|
||||
evaluate(
|
||||
model,
|
||||
data_path,
|
||||
output=output,
|
||||
gpu_id=gpu_id,
|
||||
gold_preproc=gold_preproc,
|
||||
displacy_path=displacy_path,
|
||||
displacy_limit=displacy_limit,
|
||||
silent=False,
|
||||
)
|
||||
|
||||
|
||||
def evaluate(
|
||||
model: str,
|
||||
data_path: Path,
|
||||
output: Optional[Path],
|
||||
gpu_id: int = -1,
|
||||
gold_preproc: bool = False,
|
||||
displacy_path: Optional[Path] = None,
|
||||
displacy_limit: int = 25,
|
||||
silent: bool = True,
|
||||
) -> Scorer:
|
||||
msg = Printer(no_print=silent, pretty=not silent)
|
||||
fix_random_seed()
|
||||
if gpu_id >= 0:
|
||||
util.use_gpu(gpu_id)
|
||||
require_gpu(gpu_id)
|
||||
util.set_env_log(False)
|
||||
data_path = util.ensure_path(data_path)
|
||||
output_path = util.ensure_path(output)
|
||||
displacy_path = util.ensure_path(displacy_path)
|
||||
if not data_path.exists():
|
||||
msg.fail("Evaluation data not found", data_path, exits=1)
|
||||
if displacy_path and not displacy_path.exists():
|
||||
msg.fail("Visualization output directory not found", displacy_path, exits=1)
|
||||
corpus = GoldCorpus(data_path, data_path)
|
||||
if model.startswith("blank:"):
|
||||
nlp = util.get_lang_class(model.replace("blank:", ""))()
|
||||
else:
|
||||
nlp = util.load_model(model)
|
||||
dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
|
||||
corpus = Corpus(data_path, data_path)
|
||||
nlp = util.load_model(model)
|
||||
dev_dataset = list(corpus.dev_dataset(nlp, gold_preproc=gold_preproc))
|
||||
begin = timer()
|
||||
scorer = nlp.evaluate(dev_docs, verbose=False)
|
||||
scorer = nlp.evaluate(dev_dataset, verbose=False)
|
||||
end = timer()
|
||||
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
|
||||
nwords = sum(len(ex.predicted) for ex in dev_dataset)
|
||||
results = {
|
||||
"Time": "%.2f s" % (end - begin),
|
||||
"Time": f"{end - begin:.2f} s",
|
||||
"Words": nwords,
|
||||
"Words/s": "%.0f" % (nwords / (end - begin)),
|
||||
"TOK": "%.2f" % scorer.token_acc,
|
||||
"POS": "%.2f" % scorer.tags_acc,
|
||||
"UAS": "%.2f" % scorer.uas,
|
||||
"LAS": "%.2f" % scorer.las,
|
||||
"NER P": "%.2f" % scorer.ents_p,
|
||||
"NER R": "%.2f" % scorer.ents_r,
|
||||
"NER F": "%.2f" % scorer.ents_f,
|
||||
"Textcat": "%.2f" % scorer.textcat_score,
|
||||
"Words/s": f"{nwords / (end - begin):.0f}",
|
||||
"TOK": f"{scorer.token_acc:.2f}",
|
||||
"TAG": f"{scorer.tags_acc:.2f}",
|
||||
"POS": f"{scorer.pos_acc:.2f}",
|
||||
"MORPH": f"{scorer.morphs_acc:.2f}",
|
||||
"UAS": f"{scorer.uas:.2f}",
|
||||
"LAS": f"{scorer.las:.2f}",
|
||||
"NER P": f"{scorer.ents_p:.2f}",
|
||||
"NER R": f"{scorer.ents_r:.2f}",
|
||||
"NER F": f"{scorer.ents_f:.2f}",
|
||||
"Textcat AUC": f"{scorer.textcat_auc:.2f}",
|
||||
"Textcat F": f"{scorer.textcat_f:.2f}",
|
||||
"Sent P": f"{scorer.sent_p:.2f}",
|
||||
"Sent R": f"{scorer.sent_r:.2f}",
|
||||
"Sent F": f"{scorer.sent_f:.2f}",
|
||||
}
|
||||
data = {re.sub(r"[\s/]", "_", k.lower()): v for k, v in results.items()}
|
||||
|
||||
msg.table(results, title="Results")
|
||||
|
||||
if scorer.ents_per_type:
|
||||
data["ents_per_type"] = scorer.ents_per_type
|
||||
print_ents_per_type(msg, scorer.ents_per_type)
|
||||
if scorer.textcats_f_per_cat:
|
||||
data["textcats_f_per_cat"] = scorer.textcats_f_per_cat
|
||||
print_textcats_f_per_cat(msg, scorer.textcats_f_per_cat)
|
||||
if scorer.textcats_auc_per_cat:
|
||||
data["textcats_auc_per_cat"] = scorer.textcats_auc_per_cat
|
||||
print_textcats_auc_per_cat(msg, scorer.textcats_auc_per_cat)
|
||||
|
||||
if displacy_path:
|
||||
docs, golds = zip(*dev_docs)
|
||||
docs = [ex.predicted for ex in dev_dataset]
|
||||
render_deps = "parser" in nlp.meta.get("pipeline", [])
|
||||
render_ents = "ner" in nlp.meta.get("pipeline", [])
|
||||
render_parses(
|
||||
|
@ -79,12 +116,22 @@ def evaluate(
|
|||
deps=render_deps,
|
||||
ents=render_ents,
|
||||
)
|
||||
msg.good("Generated {} parses as HTML".format(displacy_limit), displacy_path)
|
||||
if return_scores:
|
||||
return scorer.scores
|
||||
msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
|
||||
|
||||
if output_path is not None:
|
||||
srsly.write_json(output_path, data)
|
||||
msg.good(f"Saved results to {output_path}")
|
||||
return data
|
||||
|
||||
|
||||
def render_parses(docs, output_path, model_name="", limit=250, deps=True, ents=True):
|
||||
def render_parses(
|
||||
docs: List[Doc],
|
||||
output_path: Path,
|
||||
model_name: str = "",
|
||||
limit: int = 250,
|
||||
deps: bool = True,
|
||||
ents: bool = True,
|
||||
):
|
||||
docs[0].user_data["title"] = model_name
|
||||
if ents:
|
||||
html = displacy.render(docs[:limit], style="ent", page=True)
|
||||
|
@ -96,3 +143,40 @@ def render_parses(docs, output_path, model_name="", limit=250, deps=True, ents=T
|
|||
)
|
||||
with (output_path / "parses.html").open("w", encoding="utf8") as file_:
|
||||
file_.write(html)
|
||||
|
||||
|
||||
def print_ents_per_type(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None:
|
||||
data = [
|
||||
(k, f"{v['p']:.2f}", f"{v['r']:.2f}", f"{v['f']:.2f}")
|
||||
for k, v in scores.items()
|
||||
]
|
||||
msg.table(
|
||||
data,
|
||||
header=("", "P", "R", "F"),
|
||||
aligns=("l", "r", "r", "r"),
|
||||
title="NER (per type)",
|
||||
)
|
||||
|
||||
|
||||
def print_textcats_f_per_cat(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None:
|
||||
data = [
|
||||
(k, f"{v['p']:.2f}", f"{v['r']:.2f}", f"{v['f']:.2f}")
|
||||
for k, v in scores.items()
|
||||
]
|
||||
msg.table(
|
||||
data,
|
||||
header=("", "P", "R", "F"),
|
||||
aligns=("l", "r", "r", "r"),
|
||||
title="Textcat F (per type)",
|
||||
)
|
||||
|
||||
|
||||
def print_textcats_auc_per_cat(
|
||||
msg: Printer, scores: Dict[str, Dict[str, float]]
|
||||
) -> None:
|
||||
msg.table(
|
||||
[(k, f"{v['roc_auc_score']:.2f}") for k, v in scores.items()],
|
||||
header=("", "ROC AUC"),
|
||||
aligns=("l", "r"),
|
||||
title="Textcat ROC AUC (per label)",
|
||||
)
|
||||
|
|
|
@ -1,92 +1,109 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
from typing import Optional, Dict, Any, Union
|
||||
import platform
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
from wasabi import Printer
|
||||
import srsly
|
||||
|
||||
from ..compat import path2str, basestring_, unicode_
|
||||
from ._util import app, Arg, Opt
|
||||
from .. import util
|
||||
from .. import about
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
model=("Optional shortcut link of model", "positional", None, str),
|
||||
markdown=("Generate Markdown for GitHub issues", "flag", "md", str),
|
||||
silent=("Don't print anything (just return)", "flag", "s"),
|
||||
)
|
||||
def info(model=None, markdown=False, silent=False):
|
||||
@app.command("info")
|
||||
def info_cli(
|
||||
# fmt: off
|
||||
model: Optional[str] = Arg(None, help="Optional model name"),
|
||||
markdown: bool = Opt(False, "--markdown", "-md", help="Generate Markdown for GitHub issues"),
|
||||
silent: bool = Opt(False, "--silent", "-s", "-S", help="Don't print anything (just return)"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Print info about spaCy installation. If a model shortcut link is
|
||||
speficied as an argument, print model information. Flag --markdown
|
||||
prints details in Markdown for easy copy-pasting to GitHub issues.
|
||||
Print info about spaCy installation. If a model is speficied as an argument,
|
||||
print model information. Flag --markdown prints details in Markdown for easy
|
||||
copy-pasting to GitHub issues.
|
||||
"""
|
||||
info(model, markdown=markdown, silent=silent)
|
||||
|
||||
|
||||
def info(
|
||||
model: Optional[str] = None, *, markdown: bool = False, silent: bool = True
|
||||
) -> Union[str, dict]:
|
||||
msg = Printer(no_print=silent, pretty=not silent)
|
||||
if model:
|
||||
if util.is_package(model):
|
||||
model_path = util.get_package_path(model)
|
||||
else:
|
||||
model_path = util.get_data_path() / model
|
||||
meta_path = model_path / "meta.json"
|
||||
if not meta_path.is_file():
|
||||
msg.fail("Can't find model meta.json", meta_path, exits=1)
|
||||
meta = srsly.read_json(meta_path)
|
||||
if model_path.resolve() != model_path:
|
||||
meta["link"] = path2str(model_path)
|
||||
meta["source"] = path2str(model_path.resolve())
|
||||
else:
|
||||
meta["source"] = path2str(model_path)
|
||||
title = f"Info about model '{model}'"
|
||||
data = info_model(model, silent=silent)
|
||||
else:
|
||||
title = "Info about spaCy"
|
||||
data = info_spacy()
|
||||
raw_data = {k.lower().replace(" ", "_"): v for k, v in data.items()}
|
||||
if "Models" in data and isinstance(data["Models"], dict):
|
||||
data["Models"] = ", ".join(f"{n} ({v})" for n, v in data["Models"].items())
|
||||
markdown_data = get_markdown(data, title=title)
|
||||
if markdown:
|
||||
if not silent:
|
||||
title = "Info about model '{}'".format(model)
|
||||
model_meta = {
|
||||
k: v for k, v in meta.items() if k not in ("accuracy", "speed")
|
||||
}
|
||||
if markdown:
|
||||
print_markdown(model_meta, title=title)
|
||||
else:
|
||||
msg.table(model_meta, title=title)
|
||||
return meta
|
||||
data = {
|
||||
print(markdown_data)
|
||||
return markdown_data
|
||||
if not silent:
|
||||
table_data = dict(data)
|
||||
msg.table(table_data, title=title)
|
||||
return raw_data
|
||||
|
||||
|
||||
def info_spacy() -> Dict[str, any]:
|
||||
"""Generate info about the current spaCy intallation.
|
||||
|
||||
RETURNS (dict): The spaCy info.
|
||||
"""
|
||||
all_models = {}
|
||||
for pkg_name in util.get_installed_models():
|
||||
package = pkg_name.replace("-", "_")
|
||||
all_models[package] = util.get_package_version(pkg_name)
|
||||
return {
|
||||
"spaCy version": about.__version__,
|
||||
"Location": path2str(Path(__file__).parent.parent),
|
||||
"Location": str(Path(__file__).parent.parent),
|
||||
"Platform": platform.platform(),
|
||||
"Python version": platform.python_version(),
|
||||
"Models": list_models(),
|
||||
"Models": all_models,
|
||||
}
|
||||
if not silent:
|
||||
title = "Info about spaCy"
|
||||
if markdown:
|
||||
print_markdown(data, title=title)
|
||||
else:
|
||||
msg.table(data, title=title)
|
||||
return data
|
||||
|
||||
|
||||
def list_models():
|
||||
def exclude_dir(dir_name):
|
||||
# exclude common cache directories and hidden directories
|
||||
exclude = ("cache", "pycache", "__pycache__")
|
||||
return dir_name in exclude or dir_name.startswith(".")
|
||||
def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
|
||||
"""Generate info about a specific model.
|
||||
|
||||
data_path = util.get_data_path()
|
||||
if data_path:
|
||||
models = [f.parts[-1] for f in data_path.iterdir() if f.is_dir()]
|
||||
return ", ".join([m for m in models if not exclude_dir(m)])
|
||||
return "-"
|
||||
model (str): Model name of path.
|
||||
silent (bool): Don't print anything, just return.
|
||||
RETURNS (dict): The model meta.
|
||||
"""
|
||||
msg = Printer(no_print=silent, pretty=not silent)
|
||||
if util.is_package(model):
|
||||
model_path = util.get_package_path(model)
|
||||
else:
|
||||
model_path = model
|
||||
meta_path = model_path / "meta.json"
|
||||
if not meta_path.is_file():
|
||||
msg.fail("Can't find model meta.json", meta_path, exits=1)
|
||||
meta = srsly.read_json(meta_path)
|
||||
if model_path.resolve() != model_path:
|
||||
meta["link"] = str(model_path)
|
||||
meta["source"] = str(model_path.resolve())
|
||||
else:
|
||||
meta["source"] = str(model_path)
|
||||
return {k: v for k, v in meta.items() if k not in ("accuracy", "speed")}
|
||||
|
||||
|
||||
def print_markdown(data, title=None):
|
||||
"""Print data in GitHub-flavoured Markdown format for issues etc.
|
||||
def get_markdown(data: Dict[str, Any], title: Optional[str] = None) -> str:
|
||||
"""Get data in GitHub-flavoured Markdown format for issues etc.
|
||||
|
||||
data (dict or list of tuples): Label/value pairs.
|
||||
title (unicode or None): Title, will be rendered as headline 2.
|
||||
title (str / None): Title, will be rendered as headline 2.
|
||||
RETURNS (str): The Markdown string.
|
||||
"""
|
||||
markdown = []
|
||||
for key, value in data.items():
|
||||
if isinstance(value, basestring_) and Path(value).exists():
|
||||
if isinstance(value, str) and Path(value).exists():
|
||||
continue
|
||||
markdown.append("* **{}:** {}".format(key, unicode_(value)))
|
||||
markdown.append(f"* **{key}:** {value}")
|
||||
result = "\n{}\n".format("\n".join(markdown))
|
||||
if title:
|
||||
print("\n## {}".format(title))
|
||||
print("\n{}\n".format("\n".join(markdown)))
|
||||
result = f"\n## {title}\n{result}"
|
||||
return result
|
||||
|
|
|
@ -1,7 +1,4 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
from typing import Optional, List, Dict, Any, Union, IO
|
||||
import math
|
||||
from tqdm import tqdm
|
||||
import numpy
|
||||
|
@ -13,14 +10,15 @@ import gzip
|
|||
import zipfile
|
||||
import srsly
|
||||
import warnings
|
||||
from wasabi import msg
|
||||
from wasabi import Printer
|
||||
|
||||
from ._util import app, Arg, Opt
|
||||
from ..vectors import Vectors
|
||||
from ..errors import Errors, Warnings
|
||||
from ..language import Language
|
||||
from ..util import ensure_path, get_lang_class, load_model, OOV_RANK
|
||||
from ..lookups import Lookups
|
||||
|
||||
|
||||
try:
|
||||
import ftfy
|
||||
except ImportError:
|
||||
|
@ -30,49 +28,60 @@ except ImportError:
|
|||
DEFAULT_OOV_PROB = -20
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
lang=("Model language", "positional", None, str),
|
||||
output_dir=("Model output directory", "positional", None, Path),
|
||||
freqs_loc=("Location of words frequencies file", "option", "f", Path),
|
||||
jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path),
|
||||
clusters_loc=("Optional location of brown clusters data", "option", "c", str),
|
||||
vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str),
|
||||
truncate_vectors=(
|
||||
"Optional number of vectors to truncate to when reading in vectors file",
|
||||
"option",
|
||||
"t",
|
||||
int,
|
||||
),
|
||||
prune_vectors=("Optional number of vectors to prune to", "option", "V", int),
|
||||
vectors_name=(
|
||||
"Optional name for the word vectors, e.g. en_core_web_lg.vectors",
|
||||
"option",
|
||||
"vn",
|
||||
str,
|
||||
),
|
||||
model_name=("Optional name for the model meta", "option", "mn", str),
|
||||
omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool),
|
||||
base_model=("Base model (for languages with custom tokenizers)", "option", "b", str),
|
||||
)
|
||||
def init_model(
|
||||
lang,
|
||||
output_dir,
|
||||
freqs_loc=None,
|
||||
clusters_loc=None,
|
||||
jsonl_loc=None,
|
||||
vectors_loc=None,
|
||||
truncate_vectors=0,
|
||||
prune_vectors=-1,
|
||||
vectors_name=None,
|
||||
model_name=None,
|
||||
omit_extra_lookups=False,
|
||||
base_model=None,
|
||||
@app.command("init-model")
|
||||
def init_model_cli(
|
||||
# fmt: off
|
||||
lang: str = Arg(..., help="Model language"),
|
||||
output_dir: Path = Arg(..., help="Model output directory"),
|
||||
freqs_loc: Optional[Path] = Arg(None, help="Location of words frequencies file", exists=True),
|
||||
clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True),
|
||||
jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True),
|
||||
vectors_loc: Optional[Path] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format", exists=True),
|
||||
prune_vectors: int = Opt(-1, "--prune-vectors", "-V", help="Optional number of vectors to prune to"),
|
||||
truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
|
||||
vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
|
||||
model_name: Optional[str] = Opt(None, "--model-name", "-mn", help="Optional name for the model meta"),
|
||||
omit_extra_lookups: bool = Opt(False, "--omit-extra-lookups", "-OEL", help="Don't include extra lookups in model"),
|
||||
base_model: Optional[str] = Opt(None, "--base-model", "-b", help="Base model (for languages with custom tokenizers)")
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Create a new model from raw data, like word frequencies, Brown clusters
|
||||
and word vectors. If vectors are provided in Word2Vec format, they can
|
||||
be either a .txt or zipped as a .zip or .tar.gz.
|
||||
Create a new model from raw data. If vectors are provided in Word2Vec format,
|
||||
they can be either a .txt or zipped as a .zip or .tar.gz.
|
||||
"""
|
||||
init_model(
|
||||
lang,
|
||||
output_dir,
|
||||
freqs_loc=freqs_loc,
|
||||
clusters_loc=clusters_loc,
|
||||
jsonl_loc=jsonl_loc,
|
||||
vectors_loc=vectors_loc,
|
||||
prune_vectors=prune_vectors,
|
||||
truncate_vectors=truncate_vectors,
|
||||
vectors_name=vectors_name,
|
||||
model_name=model_name,
|
||||
omit_extra_lookups=omit_extra_lookups,
|
||||
base_model=base_model,
|
||||
silent=False,
|
||||
)
|
||||
|
||||
|
||||
def init_model(
|
||||
lang: str,
|
||||
output_dir: Path,
|
||||
freqs_loc: Optional[Path] = None,
|
||||
clusters_loc: Optional[Path] = None,
|
||||
jsonl_loc: Optional[Path] = None,
|
||||
vectors_loc: Optional[Path] = None,
|
||||
prune_vectors: int = -1,
|
||||
truncate_vectors: int = 0,
|
||||
vectors_name: Optional[str] = None,
|
||||
model_name: Optional[str] = None,
|
||||
omit_extra_lookups: bool = False,
|
||||
base_model: Optional[str] = None,
|
||||
silent: bool = True,
|
||||
) -> Language:
|
||||
msg = Printer(no_print=silent, pretty=not silent)
|
||||
if jsonl_loc is not None:
|
||||
if freqs_loc is not None or clusters_loc is not None:
|
||||
settings = ["-j"]
|
||||
|
@ -95,7 +104,7 @@ def init_model(
|
|||
freqs_loc = ensure_path(freqs_loc)
|
||||
if freqs_loc is not None and not freqs_loc.exists():
|
||||
msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
|
||||
lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)
|
||||
lex_attrs = read_attrs_from_deprecated(msg, freqs_loc, clusters_loc)
|
||||
|
||||
with msg.loading("Creating model..."):
|
||||
nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model)
|
||||
|
@ -110,12 +119,13 @@ def init_model(
|
|||
|
||||
msg.good("Successfully created model")
|
||||
if vectors_loc is not None:
|
||||
add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name)
|
||||
add_vectors(
|
||||
msg, nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name
|
||||
)
|
||||
vec_added = len(nlp.vocab.vectors)
|
||||
lex_added = len(nlp.vocab)
|
||||
msg.good(
|
||||
"Sucessfully compiled vocab",
|
||||
"{} entries, {} vectors".format(lex_added, vec_added),
|
||||
"Sucessfully compiled vocab", f"{lex_added} entries, {vec_added} vectors",
|
||||
)
|
||||
if not output_dir.exists():
|
||||
output_dir.mkdir()
|
||||
|
@ -123,7 +133,7 @@ def init_model(
|
|||
return nlp
|
||||
|
||||
|
||||
def open_file(loc):
|
||||
def open_file(loc: Union[str, Path]) -> IO:
|
||||
"""Handle .gz, .tar.gz or unzipped files"""
|
||||
loc = ensure_path(loc)
|
||||
if tarfile.is_tarfile(str(loc)):
|
||||
|
@ -139,7 +149,9 @@ def open_file(loc):
|
|||
return loc.open("r", encoding="utf8")
|
||||
|
||||
|
||||
def read_attrs_from_deprecated(freqs_loc, clusters_loc):
|
||||
def read_attrs_from_deprecated(
|
||||
msg: Printer, freqs_loc: Optional[Path], clusters_loc: Optional[Path]
|
||||
) -> List[Dict[str, Any]]:
|
||||
if freqs_loc is not None:
|
||||
with msg.loading("Counting frequencies..."):
|
||||
probs, _ = read_freqs(freqs_loc)
|
||||
|
@ -167,7 +179,12 @@ def read_attrs_from_deprecated(freqs_loc, clusters_loc):
|
|||
return lex_attrs
|
||||
|
||||
|
||||
def create_model(lang, lex_attrs, name=None, base_model=None):
|
||||
def create_model(
|
||||
lang: str,
|
||||
lex_attrs: List[Dict[str, Any]],
|
||||
name: Optional[str] = None,
|
||||
base_model: Optional[Union[str, Path]] = None,
|
||||
) -> Language:
|
||||
if base_model:
|
||||
nlp = load_model(base_model)
|
||||
# keep the tokenizer but remove any existing pipeline components due to
|
||||
|
@ -194,7 +211,14 @@ def create_model(lang, lex_attrs, name=None, base_model=None):
|
|||
return nlp
|
||||
|
||||
|
||||
def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
|
||||
def add_vectors(
|
||||
msg: Printer,
|
||||
nlp: Language,
|
||||
vectors_loc: Optional[Path],
|
||||
truncate_vectors: int,
|
||||
prune_vectors: int,
|
||||
name: Optional[str] = None,
|
||||
) -> None:
|
||||
vectors_loc = ensure_path(vectors_loc)
|
||||
if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
|
||||
nlp.vocab.vectors = Vectors(data=numpy.load(vectors_loc.open("rb")))
|
||||
|
@ -203,9 +227,11 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
|
|||
nlp.vocab.vectors.add(lex.orth, row=lex.rank)
|
||||
else:
|
||||
if vectors_loc:
|
||||
with msg.loading("Reading vectors from {}".format(vectors_loc)):
|
||||
vectors_data, vector_keys = read_vectors(vectors_loc, truncate_vectors)
|
||||
msg.good("Loaded vectors from {}".format(vectors_loc))
|
||||
with msg.loading(f"Reading vectors from {vectors_loc}"):
|
||||
vectors_data, vector_keys = read_vectors(
|
||||
msg, vectors_loc, truncate_vectors
|
||||
)
|
||||
msg.good(f"Loaded vectors from {vectors_loc}")
|
||||
else:
|
||||
vectors_data, vector_keys = (None, None)
|
||||
if vector_keys is not None:
|
||||
|
@ -215,7 +241,7 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
|
|||
if vectors_data is not None:
|
||||
nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
|
||||
if name is None:
|
||||
nlp.vocab.vectors.name = "%s_model.vectors" % nlp.meta["lang"]
|
||||
nlp.vocab.vectors.name = f"{nlp.meta['lang']}_model.vectors"
|
||||
else:
|
||||
nlp.vocab.vectors.name = name
|
||||
nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
|
||||
|
@ -223,7 +249,7 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
|
|||
nlp.vocab.prune_vectors(prune_vectors)
|
||||
|
||||
|
||||
def read_vectors(vectors_loc, truncate_vectors=0):
|
||||
def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int):
|
||||
f = open_file(vectors_loc)
|
||||
shape = tuple(int(size) for size in next(f).split())
|
||||
if truncate_vectors >= 1:
|
||||
|
@ -243,7 +269,9 @@ def read_vectors(vectors_loc, truncate_vectors=0):
|
|||
return vectors_data, vectors_keys
|
||||
|
||||
|
||||
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
|
||||
def read_freqs(
|
||||
freqs_loc: Path, max_length: int = 100, min_doc_freq: int = 5, min_freq: int = 50
|
||||
):
|
||||
counts = PreshCounter()
|
||||
total = 0
|
||||
with freqs_loc.open() as f:
|
||||
|
@ -265,14 +293,14 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
|
|||
word = literal_eval(key)
|
||||
except SyntaxError:
|
||||
# Take odd strings literally.
|
||||
word = literal_eval("'%s'" % key)
|
||||
word = literal_eval(f"'{key}'")
|
||||
smooth_count = counts.smoother(int(freq))
|
||||
probs[word] = math.log(smooth_count) - log_total
|
||||
oov_prob = math.log(counts.smoother(0)) - log_total
|
||||
return probs, oov_prob
|
||||
|
||||
|
||||
def read_clusters(clusters_loc):
|
||||
def read_clusters(clusters_loc: Path) -> dict:
|
||||
clusters = {}
|
||||
if ftfy is None:
|
||||
warnings.warn(Warnings.W004)
|
||||
|
|
|
@ -1,77 +0,0 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
|
||||
from ..compat import symlink_to, path2str
|
||||
from .. import util
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
origin=("package name or local path to model", "positional", None, str),
|
||||
link_name=("name of shortuct link to create", "positional", None, str),
|
||||
force=("force overwriting of existing link", "flag", "f", bool),
|
||||
)
|
||||
def link(origin, link_name, force=False, model_path=None):
|
||||
"""
|
||||
Create a symlink for models within the spacy/data directory. Accepts
|
||||
either the name of a pip package, or the local path to the model data
|
||||
directory. Linking models allows loading them via spacy.load(link_name).
|
||||
"""
|
||||
if util.is_package(origin):
|
||||
model_path = util.get_package_path(origin)
|
||||
else:
|
||||
model_path = Path(origin) if model_path is None else Path(model_path)
|
||||
if not model_path.exists():
|
||||
msg.fail(
|
||||
"Can't locate model data",
|
||||
"The data should be located in {}".format(path2str(model_path)),
|
||||
exits=1,
|
||||
)
|
||||
data_path = util.get_data_path()
|
||||
if not data_path or not data_path.exists():
|
||||
spacy_loc = Path(__file__).parent.parent
|
||||
msg.fail(
|
||||
"Can't find the spaCy data path to create model symlink",
|
||||
"Make sure a directory `/data` exists within your spaCy "
|
||||
"installation and try again. The data directory should be located "
|
||||
"here:".format(path=spacy_loc),
|
||||
exits=1,
|
||||
)
|
||||
link_path = util.get_data_path() / link_name
|
||||
if link_path.is_symlink() and not force:
|
||||
msg.fail(
|
||||
"Link '{}' already exists".format(link_name),
|
||||
"To overwrite an existing link, use the --force flag",
|
||||
exits=1,
|
||||
)
|
||||
elif link_path.is_symlink(): # does a symlink exist?
|
||||
# NB: It's important to check for is_symlink here and not for exists,
|
||||
# because invalid/outdated symlinks would return False otherwise.
|
||||
link_path.unlink()
|
||||
elif link_path.exists(): # does it exist otherwise?
|
||||
# NB: Check this last because valid symlinks also "exist".
|
||||
msg.fail(
|
||||
"Can't overwrite symlink '{}'".format(link_name),
|
||||
"This can happen if your data directory contains a directory or "
|
||||
"file of the same name.",
|
||||
exits=1,
|
||||
)
|
||||
details = "%s --> %s" % (path2str(model_path), path2str(link_path))
|
||||
try:
|
||||
symlink_to(link_path, model_path)
|
||||
except: # noqa: E722
|
||||
# This is quite dirty, but just making sure other errors are caught.
|
||||
msg.fail(
|
||||
"Couldn't link model to '{}'".format(link_name),
|
||||
"Creating a symlink in spacy/data failed. Make sure you have the "
|
||||
"required permissions and try re-running the command as admin, or "
|
||||
"use a virtualenv. You can still import the model as a module and "
|
||||
"call its load() method, or create the symlink manually.",
|
||||
)
|
||||
msg.text(details)
|
||||
raise
|
||||
msg.good("Linking successful", details)
|
||||
msg.text("You can now load the model via spacy.load('{}')".format(link_name))
|
|
@ -1,32 +1,57 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
from typing import Optional, Union, Any, Dict
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from wasabi import msg, get_raw_input
|
||||
from wasabi import Printer, get_raw_input
|
||||
import srsly
|
||||
import sys
|
||||
|
||||
from ..compat import path2str
|
||||
from ._util import app, Arg, Opt
|
||||
from ..schemas import validate, ModelMetaSchema
|
||||
from .. import util
|
||||
from .. import about
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
input_dir=("Directory with model data", "positional", None, str),
|
||||
output_dir=("Output parent directory", "positional", None, str),
|
||||
meta_path=("Path to meta.json", "option", "m", str),
|
||||
create_meta=("Create meta.json, even if one exists", "flag", "c", bool),
|
||||
force=("Force overwriting existing model in output directory", "flag", "f", bool),
|
||||
)
|
||||
def package(input_dir, output_dir, meta_path=None, create_meta=False, force=False):
|
||||
@app.command("package")
|
||||
def package_cli(
|
||||
# fmt: off
|
||||
input_dir: Path = Arg(..., help="Directory with model data", exists=True, file_okay=False),
|
||||
output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
|
||||
meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
|
||||
create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"),
|
||||
version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"),
|
||||
force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing model in output directory"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Generate Python package for model data, including meta and required
|
||||
installation files. A new directory will be created in the specified
|
||||
output directory, and model data will be copied over. If --create-meta is
|
||||
set and a meta.json already exists in the output directory, the existing
|
||||
values will be used as the defaults in the command-line prompt.
|
||||
Generate an installable Python package for a model. Includes model data,
|
||||
meta and required installation files. A new directory will be created in the
|
||||
specified output directory, and model data will be copied over. If
|
||||
--create-meta is set and a meta.json already exists in the output directory,
|
||||
the existing values will be used as the defaults in the command-line prompt.
|
||||
After packaging, "python setup.py sdist" is run in the package directory,
|
||||
which will create a .tar.gz archive that can be installed via "pip install".
|
||||
"""
|
||||
package(
|
||||
input_dir,
|
||||
output_dir,
|
||||
meta_path=meta_path,
|
||||
version=version,
|
||||
create_meta=create_meta,
|
||||
force=force,
|
||||
silent=False,
|
||||
)
|
||||
|
||||
|
||||
def package(
|
||||
input_dir: Path,
|
||||
output_dir: Path,
|
||||
meta_path: Optional[Path] = None,
|
||||
version: Optional[str] = None,
|
||||
create_meta: bool = False,
|
||||
force: bool = False,
|
||||
silent: bool = True,
|
||||
) -> None:
|
||||
msg = Printer(no_print=silent, pretty=not silent)
|
||||
input_path = util.ensure_path(input_dir)
|
||||
output_path = util.ensure_path(output_dir)
|
||||
meta_path = util.ensure_path(meta_path)
|
||||
|
@ -37,65 +62,69 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
|
|||
if meta_path and not meta_path.exists():
|
||||
msg.fail("Can't find model meta.json", meta_path, exits=1)
|
||||
|
||||
meta_path = meta_path or input_path / "meta.json"
|
||||
if meta_path.is_file():
|
||||
meta = srsly.read_json(meta_path)
|
||||
if not create_meta: # only print if user doesn't want to overwrite
|
||||
msg.good("Loaded meta.json from file", meta_path)
|
||||
else:
|
||||
meta = generate_meta(input_dir, meta, msg)
|
||||
for key in ("lang", "name", "version"):
|
||||
if key not in meta or meta[key] == "":
|
||||
msg.fail(
|
||||
"No '{}' setting found in meta.json".format(key),
|
||||
"This setting is required to build your package.",
|
||||
exits=1,
|
||||
)
|
||||
meta_path = meta_path or input_dir / "meta.json"
|
||||
if not meta_path.exists() or not meta_path.is_file():
|
||||
msg.fail("Can't load model meta.json", meta_path, exits=1)
|
||||
meta = srsly.read_json(meta_path)
|
||||
meta = get_meta(input_dir, meta)
|
||||
if version is not None:
|
||||
meta["version"] = version
|
||||
if not create_meta: # only print if user doesn't want to overwrite
|
||||
msg.good("Loaded meta.json from file", meta_path)
|
||||
else:
|
||||
meta = generate_meta(meta, msg)
|
||||
errors = validate(ModelMetaSchema, meta)
|
||||
if errors:
|
||||
msg.fail("Invalid model meta.json", "\n".join(errors), exits=1)
|
||||
model_name = meta["lang"] + "_" + meta["name"]
|
||||
model_name_v = model_name + "-" + meta["version"]
|
||||
main_path = output_path / model_name_v
|
||||
main_path = output_dir / model_name_v
|
||||
package_path = main_path / model_name
|
||||
|
||||
if package_path.exists():
|
||||
if force:
|
||||
shutil.rmtree(path2str(package_path))
|
||||
shutil.rmtree(str(package_path))
|
||||
else:
|
||||
msg.fail(
|
||||
"Package directory already exists",
|
||||
"Please delete the directory and try again, or use the "
|
||||
"`--force` flag to overwrite existing "
|
||||
"directories.".format(path=path2str(package_path)),
|
||||
"`--force` flag to overwrite existing directories.",
|
||||
exits=1,
|
||||
)
|
||||
Path.mkdir(package_path, parents=True)
|
||||
shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
|
||||
shutil.copytree(str(input_dir), str(package_path / model_name_v))
|
||||
create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
|
||||
create_file(main_path / "setup.py", TEMPLATE_SETUP)
|
||||
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
|
||||
create_file(package_path / "__init__.py", TEMPLATE_INIT)
|
||||
msg.good("Successfully created package '{}'".format(model_name_v), main_path)
|
||||
msg.text("To build the package, run `python setup.py sdist` in this directory.")
|
||||
msg.good(f"Successfully created package '{model_name_v}'", main_path)
|
||||
with util.working_dir(main_path):
|
||||
util.run_command([sys.executable, "setup.py", "sdist"])
|
||||
zip_file = main_path / "dist" / f"{model_name_v}.tar.gz"
|
||||
msg.good(f"Successfully created zipped Python package", zip_file)
|
||||
|
||||
|
||||
def create_file(file_path, contents):
|
||||
def create_file(file_path: Path, contents: str) -> None:
|
||||
file_path.touch()
|
||||
file_path.open("w", encoding="utf-8").write(contents)
|
||||
|
||||
|
||||
def generate_meta(model_path, existing_meta, msg):
|
||||
meta = existing_meta or {}
|
||||
settings = [
|
||||
("lang", "Model language", meta.get("lang", "en")),
|
||||
("name", "Model name", meta.get("name", "model")),
|
||||
("version", "Model version", meta.get("version", "0.0.0")),
|
||||
("spacy_version", "Required spaCy version", ">=%s,<3.0.0" % about.__version__),
|
||||
("description", "Model description", meta.get("description", False)),
|
||||
("author", "Author", meta.get("author", False)),
|
||||
("email", "Author email", meta.get("email", False)),
|
||||
("url", "Author website", meta.get("url", False)),
|
||||
("license", "License", meta.get("license", "CC BY-SA 3.0")),
|
||||
]
|
||||
def get_meta(
|
||||
model_path: Union[str, Path], existing_meta: Dict[str, Any]
|
||||
) -> Dict[str, Any]:
|
||||
meta = {
|
||||
"lang": "en",
|
||||
"name": "model",
|
||||
"version": "0.0.0",
|
||||
"description": None,
|
||||
"author": None,
|
||||
"email": None,
|
||||
"url": None,
|
||||
"license": "MIT",
|
||||
}
|
||||
meta.update(existing_meta)
|
||||
nlp = util.load_model_from_path(Path(model_path))
|
||||
meta["spacy_version"] = util.get_model_version_range(about.__version__)
|
||||
meta["pipeline"] = nlp.pipe_names
|
||||
meta["vectors"] = {
|
||||
"width": nlp.vocab.vectors_length,
|
||||
|
@ -103,6 +132,23 @@ def generate_meta(model_path, existing_meta, msg):
|
|||
"keys": nlp.vocab.vectors.n_keys,
|
||||
"name": nlp.vocab.vectors.name,
|
||||
}
|
||||
if about.__title__ != "spacy":
|
||||
meta["parent_package"] = about.__title__
|
||||
return meta
|
||||
|
||||
|
||||
def generate_meta(existing_meta: Dict[str, Any], msg: Printer) -> Dict[str, Any]:
|
||||
meta = existing_meta or {}
|
||||
settings = [
|
||||
("lang", "Model language", meta.get("lang", "en")),
|
||||
("name", "Model name", meta.get("name", "model")),
|
||||
("version", "Model version", meta.get("version", "0.0.0")),
|
||||
("description", "Model description", meta.get("description", None)),
|
||||
("author", "Author", meta.get("author", None)),
|
||||
("email", "Author email", meta.get("email", None)),
|
||||
("url", "Author website", meta.get("url", None)),
|
||||
("license", "License", meta.get("license", "MIT")),
|
||||
]
|
||||
msg.divider("Generating meta.json")
|
||||
msg.text(
|
||||
"Enter the package settings for your model. The following information "
|
||||
|
@ -111,16 +157,11 @@ def generate_meta(model_path, existing_meta, msg):
|
|||
for setting, desc, default in settings:
|
||||
response = get_raw_input(desc, default)
|
||||
meta[setting] = default if response == "" and default else response
|
||||
if about.__title__ != "spacy":
|
||||
meta["parent_package"] = about.__title__
|
||||
return meta
|
||||
|
||||
|
||||
TEMPLATE_SETUP = """
|
||||
#!/usr/bin/env python
|
||||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import io
|
||||
import json
|
||||
from os import path, walk
|
||||
|
@ -166,16 +207,17 @@ def setup_package():
|
|||
|
||||
setup(
|
||||
name=model_name,
|
||||
description=meta['description'],
|
||||
author=meta['author'],
|
||||
author_email=meta['email'],
|
||||
url=meta['url'],
|
||||
description=meta.get('description'),
|
||||
author=meta.get('author'),
|
||||
author_email=meta.get('email'),
|
||||
url=meta.get('url'),
|
||||
version=meta['version'],
|
||||
license=meta['license'],
|
||||
license=meta.get('license'),
|
||||
packages=[model_name],
|
||||
package_data={model_name: list_files(model_dir)},
|
||||
install_requires=list_requirements(meta),
|
||||
zip_safe=False,
|
||||
entry_points={'spacy_models': ['{m} = {m}'.format(m=model_name)]}
|
||||
)
|
||||
|
||||
|
||||
|
@ -190,9 +232,6 @@ include meta.json
|
|||
|
||||
|
||||
TEMPLATE_INIT = """
|
||||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from pathlib import Path
|
||||
from spacy.util import load_model_from_init_py, get_model_meta
|
||||
|
||||
|
|
|
@ -1,217 +1,150 @@
|
|||
# coding: utf8
|
||||
from __future__ import print_function, unicode_literals
|
||||
|
||||
import plac
|
||||
from typing import Optional, Dict, Any
|
||||
import random
|
||||
import numpy
|
||||
import time
|
||||
import re
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
from thinc.v2v import Affine, Maxout
|
||||
from thinc.misc import LayerNorm as LN
|
||||
from thinc.neural.util import prefer_gpu
|
||||
from thinc.api import use_pytorch_for_gpu_memory, require_gpu
|
||||
from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
|
||||
from thinc.api import CosineDistance, L2Distance
|
||||
from wasabi import msg
|
||||
import srsly
|
||||
from functools import partial
|
||||
import typer
|
||||
|
||||
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
|
||||
from ._util import import_code
|
||||
from ..schemas import ConfigSchema
|
||||
from ..errors import Errors
|
||||
from ..ml.models.multi_task import build_cloze_multi_task_model
|
||||
from ..ml.models.multi_task import build_cloze_characters_multi_task_model
|
||||
from ..tokens import Doc
|
||||
from ..attrs import ID, HEAD
|
||||
from .._ml import Tok2Vec, flatten, chain, create_default_optimizer
|
||||
from .._ml import masked_language_model, get_cossim_loss, get_characters_loss
|
||||
from .._ml import MultiSoftmax
|
||||
from .. import util
|
||||
from .train import _load_pretrained_tok2vec
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
texts_loc=(
|
||||
"Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the "
|
||||
"key 'tokens'",
|
||||
"positional",
|
||||
None,
|
||||
str,
|
||||
),
|
||||
vectors_model=("Name or path to spaCy model with vectors to learn from"),
|
||||
output_dir=("Directory to write models to on each epoch", "positional", None, str),
|
||||
width=("Width of CNN layers", "option", "cw", int),
|
||||
conv_depth=("Depth of CNN layers", "option", "cd", int),
|
||||
cnn_window=("Window size for CNN layers", "option", "cW", int),
|
||||
cnn_pieces=("Maxout size for CNN layers. 1 for Mish", "option", "cP", int),
|
||||
use_chars=("Whether to use character-based embedding", "flag", "chr", bool),
|
||||
sa_depth=("Depth of self-attention layers", "option", "sa", int),
|
||||
bilstm_depth=("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int),
|
||||
embed_rows=("Number of embedding rows", "option", "er", int),
|
||||
loss_func=(
|
||||
"Loss function to use for the objective. Either 'characters', 'L2' or 'cosine'",
|
||||
"option",
|
||||
"L",
|
||||
str,
|
||||
),
|
||||
use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
|
||||
dropout=("Dropout rate", "option", "d", float),
|
||||
batch_size=("Number of words per training batch", "option", "bs", int),
|
||||
max_length=(
|
||||
"Max words per example. Longer examples are discarded",
|
||||
"option",
|
||||
"xw",
|
||||
int,
|
||||
),
|
||||
min_length=(
|
||||
"Min words per example. Shorter examples are discarded",
|
||||
"option",
|
||||
"nw",
|
||||
int,
|
||||
),
|
||||
seed=("Seed for random number generators", "option", "s", int),
|
||||
n_iter=("Number of iterations to pretrain", "option", "i", int),
|
||||
n_save_every=("Save model every X batches.", "option", "se", int),
|
||||
init_tok2vec=(
|
||||
"Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.",
|
||||
"option",
|
||||
"t2v",
|
||||
Path,
|
||||
),
|
||||
epoch_start=(
|
||||
"The epoch to start counting at. Only relevant when using '--init-tok2vec' and the given weight file has been "
|
||||
"renamed. Prevents unintended overwriting of existing weight files.",
|
||||
"option",
|
||||
"es",
|
||||
int,
|
||||
),
|
||||
@app.command(
|
||||
"pretrain",
|
||||
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
|
||||
)
|
||||
def pretrain(
|
||||
texts_loc,
|
||||
vectors_model,
|
||||
output_dir,
|
||||
width=96,
|
||||
conv_depth=4,
|
||||
cnn_pieces=3,
|
||||
sa_depth=0,
|
||||
cnn_window=1,
|
||||
bilstm_depth=0,
|
||||
use_chars=False,
|
||||
embed_rows=2000,
|
||||
loss_func="cosine",
|
||||
use_vectors=False,
|
||||
dropout=0.2,
|
||||
n_iter=1000,
|
||||
batch_size=3000,
|
||||
max_length=500,
|
||||
min_length=5,
|
||||
seed=0,
|
||||
n_save_every=None,
|
||||
init_tok2vec=None,
|
||||
epoch_start=None,
|
||||
def pretrain_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context, # This is only used to read additional arguments
|
||||
texts_loc: Path = Arg(..., help="Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", exists=True),
|
||||
output_dir: Path = Arg(..., help="Directory to write models to on each epoch"),
|
||||
config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
|
||||
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
|
||||
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files."),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
|
||||
using an approximate language-modelling objective. Specifically, we load
|
||||
pretrained vectors, and train a component like a CNN, BiLSTM, etc to predict
|
||||
vectors which match the pretrained ones. The weights are saved to a directory
|
||||
after each epoch. You can then pass a path to one of these pretrained weights
|
||||
files to the 'spacy train' command.
|
||||
using an approximate language-modelling objective. Two objective types
|
||||
are available, vector-based and character-based.
|
||||
|
||||
In the vector-based objective, we load word vectors that have been trained
|
||||
using a word2vec-style distributional similarity algorithm, and train a
|
||||
component like a CNN, BiLSTM, etc to predict vectors which match the
|
||||
pretrained ones. The weights are saved to a directory after each epoch. You
|
||||
can then pass a path to one of these pretrained weights files to the
|
||||
'spacy train' command.
|
||||
|
||||
This technique may be especially helpful if you have little labelled data.
|
||||
However, it's still quite experimental, so your mileage may vary.
|
||||
|
||||
To load the weights back in during 'spacy train', you need to ensure
|
||||
all settings are the same between pretraining and training. The API and
|
||||
errors around this need some improvement.
|
||||
all settings are the same between pretraining and training. Ideally,
|
||||
this is done by using the same config file for both commands.
|
||||
"""
|
||||
config = dict(locals())
|
||||
for key in config:
|
||||
if isinstance(config[key], Path):
|
||||
config[key] = str(config[key])
|
||||
util.fix_random_seed(seed)
|
||||
overrides = parse_config_overrides(ctx.args)
|
||||
import_code(code_path)
|
||||
pretrain(
|
||||
texts_loc,
|
||||
output_dir,
|
||||
config_path,
|
||||
config_overrides=overrides,
|
||||
resume_path=resume_path,
|
||||
epoch_resume=epoch_resume,
|
||||
)
|
||||
|
||||
has_gpu = prefer_gpu(gpu_id=1)
|
||||
msg.info("Using GPU" if has_gpu else "Not using GPU")
|
||||
|
||||
output_dir = Path(output_dir)
|
||||
if output_dir.exists() and [p for p in output_dir.iterdir()]:
|
||||
msg.warn(
|
||||
"Output directory is not empty",
|
||||
"It is better to use an empty directory or refer to a new output path, "
|
||||
"then the new directory will be created for you.",
|
||||
def pretrain(
|
||||
texts_loc: Path,
|
||||
output_dir: Path,
|
||||
config_path: Path,
|
||||
config_overrides: Dict[str, Any] = {},
|
||||
resume_path: Optional[Path] = None,
|
||||
epoch_resume: Optional[int] = None,
|
||||
):
|
||||
verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume)
|
||||
msg.info(f"Loading config from: {config_path}")
|
||||
with show_validation_error():
|
||||
config = util.load_config(
|
||||
config_path,
|
||||
create_objects=False,
|
||||
validate=True,
|
||||
schema=ConfigSchema,
|
||||
overrides=config_overrides,
|
||||
)
|
||||
if not output_dir.exists():
|
||||
output_dir.mkdir()
|
||||
msg.good("Created output directory: {}".format(output_dir))
|
||||
srsly.write_json(output_dir / "config.json", config)
|
||||
msg.good("Saved settings to config.json")
|
||||
msg.good(f"Created output directory: {output_dir}")
|
||||
|
||||
use_gpu = config["training"]["use_gpu"]
|
||||
if use_gpu >= 0:
|
||||
msg.info("Using GPU")
|
||||
require_gpu(use_gpu)
|
||||
else:
|
||||
msg.info("Using CPU")
|
||||
|
||||
seed = config["pretraining"]["seed"]
|
||||
if seed is not None:
|
||||
fix_random_seed(seed)
|
||||
if use_gpu >= 0 and config["pretraining"]["use_pytorch_for_gpu_memory"]:
|
||||
use_pytorch_for_gpu_memory()
|
||||
|
||||
nlp_config = config["nlp"]
|
||||
srsly.write_json(output_dir / "config.json", config)
|
||||
msg.good("Saved config file in the output directory")
|
||||
|
||||
config = util.load_config(config_path, create_objects=True)
|
||||
nlp = util.load_model_from_config(nlp_config)
|
||||
pretrain_config = config["pretraining"]
|
||||
|
||||
# Load texts from file or stdin
|
||||
if texts_loc != "-": # reading from a file
|
||||
texts_loc = Path(texts_loc)
|
||||
if not texts_loc.exists():
|
||||
msg.fail("Input text file doesn't exist", texts_loc, exits=1)
|
||||
with msg.loading("Loading input texts..."):
|
||||
texts = list(srsly.read_jsonl(texts_loc))
|
||||
if not texts:
|
||||
msg.fail("Input file is empty", texts_loc, exits=1)
|
||||
msg.good("Loaded input texts")
|
||||
random.shuffle(texts)
|
||||
else: # reading from stdin
|
||||
msg.text("Reading input text from stdin...")
|
||||
msg.info("Reading input text from stdin...")
|
||||
texts = srsly.read_jsonl("-")
|
||||
|
||||
with msg.loading("Loading model '{}'...".format(vectors_model)):
|
||||
nlp = util.load_model(vectors_model)
|
||||
msg.good("Loaded model '{}'".format(vectors_model))
|
||||
pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
|
||||
model = create_pretraining_model(
|
||||
nlp,
|
||||
Tok2Vec(
|
||||
width,
|
||||
embed_rows,
|
||||
conv_depth=conv_depth,
|
||||
pretrained_vectors=pretrained_vectors,
|
||||
bilstm_depth=bilstm_depth, # Requires PyTorch. Experimental.
|
||||
subword_features=not use_chars, # Set to False for Chinese etc
|
||||
cnn_maxout_pieces=cnn_pieces, # If set to 1, use Mish activation.
|
||||
),
|
||||
objective=loss_func
|
||||
)
|
||||
# Load in pretrained weights
|
||||
if init_tok2vec is not None:
|
||||
components = _load_pretrained_tok2vec(nlp, init_tok2vec)
|
||||
msg.text("Loaded pretrained tok2vec for: {}".format(components))
|
||||
# Parse the epoch number from the given weight file
|
||||
model_name = re.search(r"model\d+\.bin", str(init_tok2vec))
|
||||
if model_name:
|
||||
# Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
|
||||
epoch_start = int(model_name.group(0)[5:][:-4]) + 1
|
||||
else:
|
||||
if not epoch_start:
|
||||
msg.fail(
|
||||
"You have to use the '--epoch-start' argument when using a renamed weight file for "
|
||||
"'--init-tok2vec'",
|
||||
exits=True,
|
||||
)
|
||||
elif epoch_start < 0:
|
||||
msg.fail(
|
||||
"The argument '--epoch-start' has to be greater or equal to 0. '%d' is invalid"
|
||||
% epoch_start,
|
||||
exits=True,
|
||||
)
|
||||
else:
|
||||
# Without '--init-tok2vec' the '--epoch-start' argument is ignored
|
||||
epoch_start = 0
|
||||
tok2vec_path = pretrain_config["tok2vec_model"]
|
||||
tok2vec = config
|
||||
for subpath in tok2vec_path.split("."):
|
||||
tok2vec = tok2vec.get(subpath)
|
||||
model = create_pretraining_model(nlp, tok2vec, pretrain_config)
|
||||
optimizer = pretrain_config["optimizer"]
|
||||
|
||||
# Load in pretrained weights to resume from
|
||||
if resume_path is not None:
|
||||
_resume_model(model, resume_path, epoch_resume)
|
||||
else:
|
||||
# Without '--resume-path' the '--epoch-resume' argument is ignored
|
||||
epoch_resume = 0
|
||||
|
||||
optimizer = create_default_optimizer(model.ops)
|
||||
tracker = ProgressTracker(frequency=10000)
|
||||
msg.divider("Pre-training tok2vec layer - starting at epoch %d" % epoch_start)
|
||||
msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
|
||||
row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
|
||||
msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
|
||||
|
||||
def _save_model(epoch, is_temp=False):
|
||||
is_temp_str = ".temp" if is_temp else ""
|
||||
with model.use_params(optimizer.averages):
|
||||
with (output_dir / ("model%d%s.bin" % (epoch, is_temp_str))).open(
|
||||
"wb"
|
||||
) as file_:
|
||||
file_.write(model.tok2vec.to_bytes())
|
||||
with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
|
||||
file_.write(model.get_ref("tok2vec").to_bytes())
|
||||
log = {
|
||||
"nr_word": tracker.nr_word,
|
||||
"loss": tracker.loss,
|
||||
|
@ -222,26 +155,26 @@ def pretrain(
|
|||
file_.write(srsly.json_dumps(log) + "\n")
|
||||
|
||||
skip_counter = 0
|
||||
for epoch in range(epoch_start, n_iter + epoch_start):
|
||||
for batch_id, batch in enumerate(
|
||||
util.minibatch_by_words(((text, None) for text in texts), size=batch_size)
|
||||
):
|
||||
objective = create_objective(pretrain_config["objective"])
|
||||
for epoch in range(epoch_resume, pretrain_config["max_epochs"]):
|
||||
batches = util.minibatch_by_words(texts, size=pretrain_config["batch_size"])
|
||||
for batch_id, batch in enumerate(batches):
|
||||
docs, count = make_docs(
|
||||
nlp,
|
||||
[text for (text, _) in batch],
|
||||
max_length=max_length,
|
||||
min_length=min_length,
|
||||
batch,
|
||||
max_length=pretrain_config["max_length"],
|
||||
min_length=pretrain_config["min_length"],
|
||||
)
|
||||
skip_counter += count
|
||||
loss = make_update(
|
||||
model, docs, optimizer, objective=loss_func, drop=dropout
|
||||
)
|
||||
loss = make_update(model, docs, optimizer, objective)
|
||||
progress = tracker.update(epoch, loss, docs)
|
||||
if progress:
|
||||
msg.row(progress, **row_settings)
|
||||
if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
|
||||
break
|
||||
if n_save_every and (batch_id % n_save_every == 0):
|
||||
if pretrain_config["n_save_every"] and (
|
||||
batch_id % pretrain_config["n_save_every"] == 0
|
||||
):
|
||||
_save_model(epoch, is_temp=True)
|
||||
_save_model(epoch)
|
||||
tracker.epoch_loss = 0.0
|
||||
|
@ -249,24 +182,36 @@ def pretrain(
|
|||
# Reshuffle the texts if texts were loaded from a file
|
||||
random.shuffle(texts)
|
||||
if skip_counter > 0:
|
||||
msg.warn("Skipped {count} empty values".format(count=str(skip_counter)))
|
||||
msg.warn(f"Skipped {skip_counter} empty values")
|
||||
msg.good("Successfully finished pretrain")
|
||||
|
||||
|
||||
def make_update(model, docs, optimizer, drop=0.0, objective="L2"):
|
||||
def _resume_model(model, resume_path, epoch_resume):
|
||||
msg.info(f"Resume training tok2vec from: {resume_path}")
|
||||
with resume_path.open("rb") as file_:
|
||||
weights_data = file_.read()
|
||||
model.get_ref("tok2vec").from_bytes(weights_data)
|
||||
# Parse the epoch number from the given weight file
|
||||
model_name = re.search(r"model\d+\.bin", str(resume_path))
|
||||
if model_name:
|
||||
# Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
|
||||
epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
|
||||
msg.info(f"Resuming from epoch: {epoch_resume}")
|
||||
else:
|
||||
msg.info(f"Resuming from epoch: {epoch_resume}")
|
||||
|
||||
|
||||
def make_update(model, docs, optimizer, objective_func):
|
||||
"""Perform an update over a single batch of documents.
|
||||
|
||||
docs (iterable): A batch of `Doc` objects.
|
||||
drop (float): The dropout rate.
|
||||
optimizer (callable): An optimizer.
|
||||
RETURNS loss: A float for the loss.
|
||||
"""
|
||||
predictions, backprop = model.begin_update(docs, drop=drop)
|
||||
if objective == "characters":
|
||||
loss, gradients = get_characters_loss(model.ops, docs, predictions)
|
||||
else:
|
||||
loss, gradients = get_vectors_loss(model.ops, docs, predictions, objective)
|
||||
backprop(gradients, sgd=optimizer)
|
||||
predictions, backprop = model.begin_update(docs)
|
||||
loss, gradients = objective_func(model.ops, docs, predictions)
|
||||
backprop(gradients)
|
||||
model.finish_update(optimizer)
|
||||
# Don't want to return a cupy object here
|
||||
# The gradients are modified in-place by the BERT MLM,
|
||||
# so we get an accurate loss
|
||||
|
@ -298,18 +243,43 @@ def make_docs(nlp, batch, min_length, max_length):
|
|||
heads = numpy.asarray(heads, dtype="uint64")
|
||||
heads = heads.reshape((len(doc), 1))
|
||||
doc = doc.from_array([HEAD], heads)
|
||||
if len(doc) >= min_length and len(doc) < max_length:
|
||||
if min_length <= len(doc) < max_length:
|
||||
docs.append(doc)
|
||||
return docs, skip_count
|
||||
|
||||
|
||||
def get_vectors_loss(ops, docs, prediction, objective="L2"):
|
||||
"""Compute a mean-squared error loss between the documents' vectors and
|
||||
the prediction.
|
||||
def create_objective(config):
|
||||
"""Create the objective for pretraining.
|
||||
|
||||
Note that this is ripe for customization! We could compute the vectors
|
||||
in some other word, e.g. with an LSTM language model, or use some other
|
||||
type of objective.
|
||||
We'd like to replace this with a registry function but it's tricky because
|
||||
we're also making a model choice based on this. For now we hard-code support
|
||||
for two types (characters, vectors). For characters you can specify
|
||||
n_characters, for vectors you can specify the loss.
|
||||
|
||||
Bleh.
|
||||
"""
|
||||
objective_type = config["type"]
|
||||
if objective_type == "characters":
|
||||
return partial(get_characters_loss, nr_char=config["n_characters"])
|
||||
elif objective_type == "vectors":
|
||||
if config["loss"] == "cosine":
|
||||
return partial(
|
||||
get_vectors_loss,
|
||||
distance=CosineDistance(normalize=True, ignore_zeros=True),
|
||||
)
|
||||
elif config["loss"] == "L2":
|
||||
return partial(
|
||||
get_vectors_loss, distance=L2Distance(normalize=True, ignore_zeros=True)
|
||||
)
|
||||
else:
|
||||
raise ValueError("Unexpected loss type", config["loss"])
|
||||
else:
|
||||
raise ValueError("Unexpected objective_type", objective_type)
|
||||
|
||||
|
||||
def get_vectors_loss(ops, docs, prediction, distance):
|
||||
"""Compute a loss based on a distance between the documents' vectors and
|
||||
the prediction.
|
||||
"""
|
||||
# The simplest way to implement this would be to vstack the
|
||||
# token.vector values, but that's a bit inefficient, especially on GPU.
|
||||
|
@ -317,47 +287,51 @@ def get_vectors_loss(ops, docs, prediction, objective="L2"):
|
|||
# and look them up all at once. This prevents data copying.
|
||||
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
|
||||
target = docs[0].vocab.vectors.data[ids]
|
||||
if objective == "L2":
|
||||
d_target = prediction - target
|
||||
loss = (d_target ** 2).sum()
|
||||
elif objective == "cosine":
|
||||
loss, d_target = get_cossim_loss(prediction, target)
|
||||
else:
|
||||
raise ValueError(Errors.E142.format(loss_func=objective))
|
||||
d_target, loss = distance(prediction, target)
|
||||
return loss, d_target
|
||||
|
||||
|
||||
def create_pretraining_model(nlp, tok2vec, objective="cosine", nr_char=10):
|
||||
def get_characters_loss(ops, docs, prediction, nr_char):
|
||||
"""Compute a loss based on a number of characters predicted from the docs."""
|
||||
target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
|
||||
target_ids = target_ids.reshape((-1,))
|
||||
target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
|
||||
target = target.reshape((-1, 256 * nr_char))
|
||||
diff = prediction - target
|
||||
loss = (diff ** 2).sum()
|
||||
d_target = diff / float(prediction.shape[0])
|
||||
return loss, d_target
|
||||
|
||||
|
||||
def create_pretraining_model(nlp, tok2vec, pretrain_config):
|
||||
"""Define a network for the pretraining. We simply add an output layer onto
|
||||
the tok2vec input model. The tok2vec input model needs to be a model that
|
||||
takes a batch of Doc objects (as a list), and returns a list of arrays.
|
||||
Each array in the output needs to have one row per token in the doc.
|
||||
The actual tok2vec layer is stored as a reference, and only this bit will be
|
||||
serialized to file and read back in when calling the 'train' command.
|
||||
"""
|
||||
if objective == "characters":
|
||||
out_sizes = [256] * nr_char
|
||||
output_layer = chain(
|
||||
LN(Maxout(300, pieces=3)),
|
||||
MultiSoftmax(out_sizes, 300)
|
||||
# TODO
|
||||
maxout_pieces = 3
|
||||
hidden_size = 300
|
||||
if pretrain_config["objective"]["type"] == "vectors":
|
||||
model = build_cloze_multi_task_model(
|
||||
nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
|
||||
)
|
||||
else:
|
||||
output_size = nlp.vocab.vectors.data.shape[1]
|
||||
output_layer = chain(
|
||||
LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0)
|
||||
elif pretrain_config["objective"]["type"] == "characters":
|
||||
model = build_cloze_characters_multi_task_model(
|
||||
nlp.vocab,
|
||||
tok2vec,
|
||||
hidden_size=hidden_size,
|
||||
maxout_pieces=maxout_pieces,
|
||||
nr_char=pretrain_config["objective"]["n_characters"],
|
||||
)
|
||||
# This is annoying, but the parser etc have the flatten step after
|
||||
# the tok2vec. To load the weights in cleanly, we need to match
|
||||
# the shape of the models' components exactly. So what we cann
|
||||
# "tok2vec" has to be the same set of processes as what the components do.
|
||||
tok2vec = chain(tok2vec, flatten)
|
||||
model = chain(tok2vec, output_layer)
|
||||
model = masked_language_model(nlp.vocab, model)
|
||||
model.tok2vec = tok2vec
|
||||
model.output_layer = output_layer
|
||||
model.begin_training([nlp.make_doc("Give it a doc to infer shapes")])
|
||||
model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
|
||||
set_dropout_rate(model, pretrain_config["dropout"])
|
||||
return model
|
||||
|
||||
|
||||
class ProgressTracker(object):
|
||||
class ProgressTracker:
|
||||
def __init__(self, frequency=1000000):
|
||||
self.loss = 0.0
|
||||
self.prev_loss = 0.0
|
||||
|
@ -403,3 +377,44 @@ def _smart_round(figure, width=10, max_decimal=4):
|
|||
n_decimal = min(n_decimal, max_decimal)
|
||||
format_str = "%." + str(n_decimal) + "f"
|
||||
return format_str % figure
|
||||
|
||||
|
||||
def verify_cli_args(texts_loc, output_dir, config_path, resume_path, epoch_resume):
|
||||
if not config_path or not config_path.exists():
|
||||
msg.fail("Config file not found", config_path, exits=1)
|
||||
if output_dir.exists() and [p for p in output_dir.iterdir()]:
|
||||
if resume_path:
|
||||
msg.warn(
|
||||
"Output directory is not empty. ",
|
||||
"If you're resuming a run from a previous model in this directory, "
|
||||
"the old models for the consecutive epochs will be overwritten "
|
||||
"with the new ones.",
|
||||
)
|
||||
else:
|
||||
msg.warn(
|
||||
"Output directory is not empty. ",
|
||||
"It is better to use an empty directory or refer to a new output path, "
|
||||
"then the new directory will be created for you.",
|
||||
)
|
||||
if texts_loc != "-": # reading from a file
|
||||
texts_loc = Path(texts_loc)
|
||||
if not texts_loc.exists():
|
||||
msg.fail("Input text file doesn't exist", texts_loc, exits=1)
|
||||
|
||||
for text in srsly.read_jsonl(texts_loc):
|
||||
break
|
||||
else:
|
||||
msg.fail("Input file is empty", texts_loc, exits=1)
|
||||
|
||||
if resume_path is not None:
|
||||
model_name = re.search(r"model\d+\.bin", str(resume_path))
|
||||
if not model_name and not epoch_resume:
|
||||
msg.fail(
|
||||
"You have to use the --epoch-resume setting when using a renamed weight file for --resume-path",
|
||||
exits=True,
|
||||
)
|
||||
elif not model_name and epoch_resume < 0:
|
||||
msg.fail(
|
||||
f"The argument --epoch-resume has to be greater or equal to 0. {epoch_resume} is invalid",
|
||||
exits=True,
|
||||
)
|
||||
|
|
|
@ -1,7 +1,4 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals, division, print_function
|
||||
|
||||
import plac
|
||||
from typing import Optional, Sequence, Union, Iterator
|
||||
import tqdm
|
||||
from pathlib import Path
|
||||
import srsly
|
||||
|
@ -9,36 +6,63 @@ import cProfile
|
|||
import pstats
|
||||
import sys
|
||||
import itertools
|
||||
import thinc.extra.datasets
|
||||
from wasabi import msg
|
||||
from wasabi import msg, Printer
|
||||
import typer
|
||||
|
||||
from ._util import app, debug_cli, Arg, Opt, NAME
|
||||
from ..language import Language
|
||||
from ..util import load_model
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
model=("Model to load", "positional", None, str),
|
||||
inputs=("Location of input file. '-' for stdin.", "positional", None, str),
|
||||
n_texts=("Maximum number of texts to use if available", "option", "n", int),
|
||||
)
|
||||
def profile(model, inputs=None, n_texts=10000):
|
||||
@debug_cli.command("profile")
|
||||
@app.command("profile", hidden=True)
|
||||
def profile_cli(
|
||||
# fmt: off
|
||||
ctx: typer.Context, # This is only used to read current calling context
|
||||
model: str = Arg(..., help="Model to load"),
|
||||
inputs: Optional[Path] = Arg(None, help="Location of input file. '-' for stdin.", exists=True, allow_dash=True),
|
||||
n_texts: int = Opt(10000, "--n-texts", "-n", help="Maximum number of texts to use if available"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Profile a spaCy pipeline, to find out which functions take the most time.
|
||||
Input should be formatted as one JSON object per line with a key "text".
|
||||
It can either be provided as a JSONL file, or be read from sys.sytdin.
|
||||
If no input file is specified, the IMDB dataset is loaded via Thinc.
|
||||
"""
|
||||
if ctx.parent.command.name == NAME: # called as top-level command
|
||||
msg.warn(
|
||||
"The profile command is now available via the 'debug profile' "
|
||||
"subcommand. You can run python -m spacy debug --help for an "
|
||||
"overview of the other available debugging commands."
|
||||
)
|
||||
profile(model, inputs=inputs, n_texts=n_texts)
|
||||
|
||||
|
||||
def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) -> None:
|
||||
|
||||
if inputs is not None:
|
||||
inputs = _read_inputs(inputs, msg)
|
||||
if inputs is None:
|
||||
try:
|
||||
import ml_datasets
|
||||
except ImportError:
|
||||
msg.fail(
|
||||
"This command, when run without an input file, "
|
||||
"requires the ml_datasets library to be installed: "
|
||||
"pip install ml_datasets",
|
||||
exits=1,
|
||||
)
|
||||
|
||||
n_inputs = 25000
|
||||
with msg.loading("Loading IMDB dataset via Thinc..."):
|
||||
imdb_train, _ = thinc.extra.datasets.imdb()
|
||||
imdb_train, _ = ml_datasets.imdb()
|
||||
inputs, _ = zip(*imdb_train)
|
||||
msg.info("Loaded IMDB dataset and using {} examples".format(n_inputs))
|
||||
msg.info(f"Loaded IMDB dataset and using {n_inputs} examples")
|
||||
inputs = inputs[:n_inputs]
|
||||
with msg.loading("Loading model '{}'...".format(model)):
|
||||
with msg.loading(f"Loading model '{model}'..."):
|
||||
nlp = load_model(model)
|
||||
msg.good("Loaded model '{}'".format(model))
|
||||
msg.good(f"Loaded model '{model}'")
|
||||
texts = list(itertools.islice(inputs, n_texts))
|
||||
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
|
||||
s = pstats.Stats("Profile.prof")
|
||||
|
@ -46,12 +70,12 @@ def profile(model, inputs=None, n_texts=10000):
|
|||
s.strip_dirs().sort_stats("time").print_stats()
|
||||
|
||||
|
||||
def parse_texts(nlp, texts):
|
||||
def parse_texts(nlp: Language, texts: Sequence[str]) -> None:
|
||||
for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
|
||||
pass
|
||||
|
||||
|
||||
def _read_inputs(loc, msg):
|
||||
def _read_inputs(loc: Union[Path, str], msg: Printer) -> Iterator[str]:
|
||||
if loc == "-":
|
||||
msg.info("Reading input from sys.stdin")
|
||||
file_ = sys.stdin
|
||||
|
@ -60,7 +84,7 @@ def _read_inputs(loc, msg):
|
|||
input_path = Path(loc)
|
||||
if not input_path.exists() or not input_path.is_file():
|
||||
msg.fail("Not a valid input data file", loc, exits=1)
|
||||
msg.info("Using data from {}".format(input_path.parts[-1]))
|
||||
msg.info(f"Using data from {input_path.parts[-1]}")
|
||||
file_ = input_path.open()
|
||||
for line in file_:
|
||||
data = srsly.json_loads(line)
|
||||
|
|
157
spacy/cli/project/assets.py
Normal file
157
spacy/cli/project/assets.py
Normal file
|
@ -0,0 +1,157 @@
|
|||
from typing import Optional
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
import requests
|
||||
import tqdm
|
||||
import re
|
||||
import shutil
|
||||
|
||||
from ...util import ensure_path, working_dir
|
||||
from .._util import project_cli, Arg, PROJECT_FILE, load_project_config, get_checksum
|
||||
|
||||
|
||||
# TODO: find a solution for caches
|
||||
# CACHES = [
|
||||
# Path.home() / ".torch",
|
||||
# Path.home() / ".caches" / "torch",
|
||||
# os.environ.get("TORCH_HOME"),
|
||||
# Path.home() / ".keras",
|
||||
# ]
|
||||
|
||||
|
||||
@project_cli.command("assets")
|
||||
def project_assets_cli(
|
||||
# fmt: off
|
||||
project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
|
||||
# fmt: on
|
||||
):
|
||||
"""Fetch project assets like datasets and pretrained weights. Assets are
|
||||
defined in the "assets" section of the project.yml. If a checksum is
|
||||
provided in the project.yml, the file is only downloaded if no local file
|
||||
with the same checksum exists.
|
||||
"""
|
||||
project_assets(project_dir)
|
||||
|
||||
|
||||
def project_assets(project_dir: Path) -> None:
|
||||
"""Fetch assets for a project using DVC if possible.
|
||||
|
||||
project_dir (Path): Path to project directory.
|
||||
"""
|
||||
project_path = ensure_path(project_dir)
|
||||
config = load_project_config(project_path)
|
||||
assets = config.get("assets", {})
|
||||
if not assets:
|
||||
msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
|
||||
msg.info(f"Fetching {len(assets)} asset(s)")
|
||||
variables = config.get("variables", {})
|
||||
for asset in assets:
|
||||
dest = asset["dest"].format(**variables)
|
||||
url = asset.get("url")
|
||||
checksum = asset.get("checksum")
|
||||
if not url:
|
||||
# project.yml defines asset without URL that the user has to place
|
||||
check_private_asset(dest, checksum)
|
||||
continue
|
||||
url = url.format(**variables)
|
||||
fetch_asset(project_path, url, dest, checksum)
|
||||
|
||||
|
||||
def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
|
||||
"""Check and validate assets without a URL (private assets that the user
|
||||
has to provide themselves) and give feedback about the checksum.
|
||||
|
||||
dest (Path): Desintation path of the asset.
|
||||
checksum (Optional[str]): Optional checksum of the expected file.
|
||||
"""
|
||||
if not Path(dest).exists():
|
||||
err = f"No URL provided for asset. You need to add this file yourself: {dest}"
|
||||
msg.warn(err)
|
||||
else:
|
||||
if checksum and checksum == get_checksum(dest):
|
||||
msg.good(f"Asset exists with matching checksum: {dest}")
|
||||
else:
|
||||
msg.fail(f"Asset available but with incorrect checksum: {dest}")
|
||||
|
||||
|
||||
def fetch_asset(
|
||||
project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
|
||||
) -> None:
|
||||
"""Fetch an asset from a given URL or path. If a checksum is provided and a
|
||||
local file exists, it's only re-downloaded if the checksum doesn't match.
|
||||
|
||||
project_path (Path): Path to project directory.
|
||||
url (str): URL or path to asset.
|
||||
checksum (Optional[str]): Optional expected checksum of local file.
|
||||
RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
|
||||
the asset failed.
|
||||
"""
|
||||
# TODO: add support for caches
|
||||
dest_path = (project_path / dest).resolve()
|
||||
if dest_path.exists() and checksum:
|
||||
# If there's already a file, check for checksum
|
||||
if checksum == get_checksum(dest_path):
|
||||
msg.good(f"Skipping download with matching checksum: {dest}")
|
||||
return dest_path
|
||||
# We might as well support the user here and create parent directories in
|
||||
# case the asset dir isn't listed as a dir to create in the project.yml
|
||||
if not dest_path.parent.exists():
|
||||
dest_path.parent.mkdir(parents=True)
|
||||
with working_dir(project_path):
|
||||
url = convert_asset_url(url)
|
||||
try:
|
||||
download_file(url, dest_path)
|
||||
msg.good(f"Downloaded asset {dest}")
|
||||
except requests.exceptions.RequestException as e:
|
||||
if Path(url).exists() and Path(url).is_file():
|
||||
# If it's a local file, copy to destination
|
||||
shutil.copy(url, str(dest_path))
|
||||
msg.good(f"Copied local asset {dest}")
|
||||
else:
|
||||
msg.fail(f"Download failed: {dest}", e)
|
||||
return
|
||||
if checksum and checksum != get_checksum(dest_path):
|
||||
msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
|
||||
|
||||
|
||||
def convert_asset_url(url: str) -> str:
|
||||
"""Check and convert the asset URL if needed.
|
||||
|
||||
url (str): The asset URL.
|
||||
RETURNS (str): The converted URL.
|
||||
"""
|
||||
# If the asset URL is a regular GitHub URL it's likely a mistake
|
||||
if re.match(r"(http(s?)):\/\/github.com", url):
|
||||
converted = url.replace("github.com", "raw.githubusercontent.com")
|
||||
converted = re.sub(r"/(tree|blob)/", "/", converted)
|
||||
msg.warn(
|
||||
"Downloading from a regular GitHub URL. This will only download "
|
||||
"the source of the page, not the actual file. Converting the URL "
|
||||
"to a raw URL.",
|
||||
converted,
|
||||
)
|
||||
return converted
|
||||
return url
|
||||
|
||||
|
||||
def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
|
||||
"""Download a file using requests.
|
||||
|
||||
url (str): The URL of the file.
|
||||
dest (Path): The destination path.
|
||||
chunk_size (int): The size of chunks to read/write.
|
||||
"""
|
||||
response = requests.get(url, stream=True)
|
||||
response.raise_for_status()
|
||||
total = int(response.headers.get("content-length", 0))
|
||||
progress_settings = {
|
||||
"total": total,
|
||||
"unit": "iB",
|
||||
"unit_scale": True,
|
||||
"unit_divisor": chunk_size,
|
||||
"leave": False,
|
||||
}
|
||||
with dest.open("wb") as f, tqdm.tqdm(**progress_settings) as bar:
|
||||
for data in response.iter_content(chunk_size=chunk_size):
|
||||
size = f.write(data)
|
||||
bar.update(size)
|
96
spacy/cli/project/clone.py
Normal file
96
spacy/cli/project/clone.py
Normal file
|
@ -0,0 +1,96 @@
|
|||
from typing import Optional
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
import subprocess
|
||||
import shutil
|
||||
import re
|
||||
|
||||
from ... import about
|
||||
from ...util import ensure_path, run_command, make_tempdir
|
||||
from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
|
||||
|
||||
|
||||
@project_cli.command("clone")
|
||||
def project_clone_cli(
|
||||
# fmt: off
|
||||
name: str = Arg(..., help="The name of the template to clone"),
|
||||
dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
|
||||
repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to clone from"),
|
||||
# fmt: on
|
||||
):
|
||||
"""Clone a project template from a repository. Calls into "git" and will
|
||||
only download the files from the given subdirectory. The GitHub repo
|
||||
defaults to the official spaCy template repo, but can be customized
|
||||
(including using a private repo).
|
||||
"""
|
||||
if dest is None:
|
||||
dest = Path.cwd() / name
|
||||
project_clone(name, dest, repo=repo)
|
||||
|
||||
|
||||
def project_clone(name: str, dest: Path, *, repo: str = about.__projects__) -> None:
|
||||
"""Clone a project template from a repository.
|
||||
|
||||
name (str): Name of subdirectory to clone.
|
||||
dest (Path): Destination path of cloned project.
|
||||
repo (str): URL of Git repo containing project templates.
|
||||
"""
|
||||
dest = ensure_path(dest)
|
||||
check_clone(name, dest, repo)
|
||||
project_dir = dest.resolve()
|
||||
repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo)
|
||||
# We're using Git and sparse checkout to only clone the files we need
|
||||
with make_tempdir() as tmp_dir:
|
||||
cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
|
||||
try:
|
||||
run_command(cmd)
|
||||
except subprocess.CalledProcessError:
|
||||
err = f"Could not clone the repo '{repo}' into the temp dir '{tmp_dir}'."
|
||||
msg.fail(err)
|
||||
with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
|
||||
f.write(name)
|
||||
try:
|
||||
run_command(["git", "-C", str(tmp_dir), "fetch"])
|
||||
run_command(["git", "-C", str(tmp_dir), "checkout"])
|
||||
except subprocess.CalledProcessError:
|
||||
err = f"Could not clone '{name}' from repo '{repo_name}'"
|
||||
msg.fail(err)
|
||||
# We need Path(name) to make sure we also support subdirectories
|
||||
shutil.move(str(tmp_dir / Path(name)), str(project_dir))
|
||||
msg.good(f"Cloned '{name}' from {repo_name}", project_dir)
|
||||
if not (project_dir / PROJECT_FILE).exists():
|
||||
msg.warn(f"No {PROJECT_FILE} found in directory")
|
||||
else:
|
||||
msg.good(f"Your project is now ready!")
|
||||
print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
|
||||
|
||||
|
||||
def check_clone(name: str, dest: Path, repo: str) -> None:
|
||||
"""Check and validate that the destination path can be used to clone. Will
|
||||
check that Git is available and that the destination path is suitable.
|
||||
|
||||
name (str): Name of the directory to clone from the repo.
|
||||
dest (Path): Local destination of cloned directory.
|
||||
repo (str): URL of the repo to clone from.
|
||||
"""
|
||||
try:
|
||||
subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
|
||||
except Exception:
|
||||
msg.fail(
|
||||
f"Cloning spaCy project templates requires Git and the 'git' command. ",
|
||||
f"To clone a project without Git, copy the files from the '{name}' "
|
||||
f"directory in the {repo} to {dest} manually and then run:",
|
||||
f"{COMMAND} project init {dest}",
|
||||
exits=1,
|
||||
)
|
||||
if not dest:
|
||||
msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
|
||||
if dest.exists():
|
||||
# Directory already exists (not allowed, clone needs to create it)
|
||||
msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
|
||||
if not dest.parent.exists():
|
||||
# We're not creating parents, parent dir should exist
|
||||
msg.fail(
|
||||
f"Can't clone project, parent directory doesn't exist: {dest.parent}",
|
||||
exits=1,
|
||||
)
|
208
spacy/cli/project/dvc.py
Normal file
208
spacy/cli/project/dvc.py
Normal file
|
@ -0,0 +1,208 @@
|
|||
"""This module contains helpers and subcommands for integrating spaCy projects
|
||||
with Data Version Controk (DVC). https://dvc.org"""
|
||||
from typing import Dict, Any, List, Optional
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
|
||||
from .._util import PROJECT_FILE, load_project_config, get_hash, project_cli
|
||||
from .._util import Arg, Opt, NAME, COMMAND
|
||||
from ...util import working_dir, split_command, join_command, run_command
|
||||
|
||||
|
||||
DVC_CONFIG = "dvc.yaml"
|
||||
DVC_DIR = ".dvc"
|
||||
UPDATE_COMMAND = "dvc"
|
||||
DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. If you've
|
||||
# edited your {PROJECT_FILE}, you can regenerate this file by running:
|
||||
# {COMMAND} project {UPDATE_COMMAND}"""
|
||||
|
||||
|
||||
@project_cli.command(UPDATE_COMMAND)
|
||||
def project_update_dvc_cli(
|
||||
# fmt: off
|
||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
||||
workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
|
||||
verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
|
||||
force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
|
||||
# fmt: on
|
||||
):
|
||||
"""Auto-generate Data Version Control (DVC) config. A DVC
|
||||
project can only define one pipeline, so you need to specify one workflow
|
||||
defined in the project.yml. If no workflow is specified, the first defined
|
||||
workflow is used. The DVC config will only be updated if the project.yml changed.
|
||||
"""
|
||||
project_update_dvc(project_dir, workflow, verbose=verbose, force=force)
|
||||
|
||||
|
||||
def project_update_dvc(
|
||||
project_dir: Path,
|
||||
workflow: Optional[str] = None,
|
||||
*,
|
||||
verbose: bool = False,
|
||||
force: bool = False,
|
||||
) -> None:
|
||||
"""Update the auto-generated Data Version Control (DVC) config file. A DVC
|
||||
project can only define one pipeline, so you need to specify one workflow
|
||||
defined in the project.yml. Will only update the file if the checksum changed.
|
||||
|
||||
project_dir (Path): The project directory.
|
||||
workflow (Optional[str]): Optional name of workflow defined in project.yml.
|
||||
If not set, the first workflow will be used.
|
||||
verbose (bool): Print more info.
|
||||
force (bool): Force update DVC config.
|
||||
"""
|
||||
config = load_project_config(project_dir)
|
||||
updated = update_dvc_config(
|
||||
project_dir, config, workflow, verbose=verbose, force=force
|
||||
)
|
||||
help_msg = "To execute the workflow with DVC, run: dvc repro"
|
||||
if updated:
|
||||
msg.good(f"Updated DVC config from {PROJECT_FILE}", help_msg)
|
||||
else:
|
||||
msg.info(f"No changes found in {PROJECT_FILE}, no update needed", help_msg)
|
||||
|
||||
|
||||
def update_dvc_config(
|
||||
path: Path,
|
||||
config: Dict[str, Any],
|
||||
workflow: Optional[str] = None,
|
||||
verbose: bool = False,
|
||||
silent: bool = False,
|
||||
force: bool = False,
|
||||
) -> bool:
|
||||
"""Re-run the DVC commands in dry mode and update dvc.yaml file in the
|
||||
project directory. The file is auto-generated based on the config. The
|
||||
first line of the auto-generated file specifies the hash of the config
|
||||
dict, so if any of the config values change, the DVC config is regenerated.
|
||||
|
||||
path (Path): The path to the project directory.
|
||||
config (Dict[str, Any]): The loaded project.yml.
|
||||
verbose (bool): Whether to print additional info (via DVC).
|
||||
silent (bool): Don't output anything (via DVC).
|
||||
force (bool): Force update, even if hashes match.
|
||||
RETURNS (bool): Whether the DVC config file was updated.
|
||||
"""
|
||||
ensure_dvc(path)
|
||||
workflows = config.get("workflows", {})
|
||||
workflow_names = list(workflows.keys())
|
||||
check_workflows(workflow_names, workflow)
|
||||
if not workflow:
|
||||
workflow = workflow_names[0]
|
||||
config_hash = get_hash(config)
|
||||
path = path.resolve()
|
||||
dvc_config_path = path / DVC_CONFIG
|
||||
if dvc_config_path.exists():
|
||||
# Check if the file was generated using the current config, if not, redo
|
||||
with dvc_config_path.open("r", encoding="utf8") as f:
|
||||
ref_hash = f.readline().strip().replace("# ", "")
|
||||
if ref_hash == config_hash and not force:
|
||||
return False # Nothing has changed in project.yml, don't need to update
|
||||
dvc_config_path.unlink()
|
||||
variables = config.get("variables", {})
|
||||
dvc_commands = []
|
||||
config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
|
||||
for name in workflows[workflow]:
|
||||
command = config_commands[name]
|
||||
deps = command.get("deps", [])
|
||||
outputs = command.get("outputs", [])
|
||||
outputs_no_cache = command.get("outputs_no_cache", [])
|
||||
if not deps and not outputs and not outputs_no_cache:
|
||||
continue
|
||||
# Default to the working dir as the project path since dvc.yaml is auto-generated
|
||||
# and we don't want arbitrary paths in there
|
||||
project_cmd = ["python", "-m", NAME, "project", "run", name]
|
||||
deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
|
||||
outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
|
||||
outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
|
||||
dvc_cmd = ["run", "-n", name, "-w", str(path), "--no-exec"]
|
||||
if command.get("no_skip"):
|
||||
dvc_cmd.append("--always-changed")
|
||||
full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
|
||||
dvc_commands.append(join_command(full_cmd))
|
||||
with working_dir(path):
|
||||
dvc_flags = {"--verbose": verbose, "--quiet": silent}
|
||||
run_dvc_commands(dvc_commands, variables, flags=dvc_flags)
|
||||
with dvc_config_path.open("r+", encoding="utf8") as f:
|
||||
content = f.read()
|
||||
f.seek(0, 0)
|
||||
f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
|
||||
return True
|
||||
|
||||
|
||||
def run_dvc_commands(
|
||||
commands: List[str] = tuple(),
|
||||
variables: Dict[str, str] = {},
|
||||
flags: Dict[str, bool] = {},
|
||||
) -> None:
|
||||
"""Run a sequence of DVC commands in a subprocess, in order.
|
||||
|
||||
commands (List[str]): The string commands without the leading "dvc".
|
||||
variables (Dict[str, str]): Dictionary of variable names, mapped to their
|
||||
values. Will be used to substitute format string variables in the
|
||||
commands.
|
||||
flags (Dict[str, bool]): Conditional flags to be added to command. Makes it
|
||||
easier to pass flags like --quiet that depend on a variable or
|
||||
command-line setting while avoiding lots of nested conditionals.
|
||||
"""
|
||||
for command in commands:
|
||||
# Substitute variables, e.g. "./{NAME}.json"
|
||||
command = command.format(**variables)
|
||||
command = split_command(command)
|
||||
dvc_command = ["dvc", *command]
|
||||
# Add the flags if they are set to True
|
||||
for flag, is_active in flags.items():
|
||||
if is_active:
|
||||
dvc_command.append(flag)
|
||||
run_command(dvc_command)
|
||||
|
||||
|
||||
def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
|
||||
"""Validate workflows provided in project.yml and check that a given
|
||||
workflow can be used to generate a DVC config.
|
||||
|
||||
workflows (List[str]): Names of the available workflows.
|
||||
workflow (Optional[str]): The name of the workflow to convert.
|
||||
"""
|
||||
if not workflows:
|
||||
msg.fail(
|
||||
f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, "
|
||||
f"define at least one list of commands.",
|
||||
exits=1,
|
||||
)
|
||||
if workflow is not None and workflow not in workflows:
|
||||
msg.fail(
|
||||
f"Workflow '{workflow}' not defined in {PROJECT_FILE}. "
|
||||
f"Available workflows: {', '.join(workflows)}",
|
||||
exits=1,
|
||||
)
|
||||
if not workflow:
|
||||
msg.warn(
|
||||
f"No workflow specified for DVC pipeline. Using the first workflow "
|
||||
f"defined in {PROJECT_FILE}: '{workflows[0]}'"
|
||||
)
|
||||
|
||||
|
||||
def ensure_dvc(project_dir: Path) -> None:
|
||||
"""Ensure that the "dvc" command is available and that the current project
|
||||
directory is an initialized DVC project.
|
||||
"""
|
||||
try:
|
||||
subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
|
||||
except Exception:
|
||||
msg.fail(
|
||||
"To use spaCy projects with DVC (Data Version Control), DVC needs "
|
||||
"to be installed and the 'dvc' command needs to be available",
|
||||
"You can install the Python package from pip (pip install dvc) or "
|
||||
"conda (conda install -c conda-forge dvc). For more details, see the "
|
||||
"documentation: https://dvc.org/doc/install",
|
||||
exits=1,
|
||||
)
|
||||
if not (project_dir / ".dvc").exists():
|
||||
msg.fail(
|
||||
"Project not initialized as a DVC project",
|
||||
"To initialize a DVC project, you can run 'dvc init' in the project "
|
||||
"directory. For more details, see the documentation: "
|
||||
"https://dvc.org/doc/command-reference/init",
|
||||
exits=1,
|
||||
)
|
265
spacy/cli/project/run.py
Normal file
265
spacy/cli/project/run.py
Normal file
|
@ -0,0 +1,265 @@
|
|||
from typing import Optional, List, Dict, Sequence, Any
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
import sys
|
||||
import srsly
|
||||
|
||||
from ...util import working_dir, run_command, split_command, is_cwd, join_command
|
||||
from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
|
||||
from .._util import get_checksum, project_cli, Arg, Opt, COMMAND
|
||||
|
||||
|
||||
@project_cli.command("run")
|
||||
def project_run_cli(
|
||||
# fmt: off
|
||||
subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
|
||||
project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
|
||||
force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
|
||||
dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute scripts"),
|
||||
show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
|
||||
# fmt: on
|
||||
):
|
||||
"""Run a named command or workflow defined in the project.yml. If a workflow
|
||||
name is specified, all commands in the workflow are run, in order. If
|
||||
commands define dependencies and/or outputs, they will only be re-run if
|
||||
state has changed.
|
||||
"""
|
||||
if show_help or not subcommand:
|
||||
print_run_help(project_dir, subcommand)
|
||||
else:
|
||||
project_run(project_dir, subcommand, force=force, dry=dry)
|
||||
|
||||
|
||||
def project_run(
|
||||
project_dir: Path, subcommand: str, *, force: bool = False, dry: bool = False
|
||||
) -> None:
|
||||
"""Run a named script defined in the project.yml. If the script is part
|
||||
of the default pipeline (defined in the "run" section), DVC is used to
|
||||
execute the command, so it can determine whether to rerun it. It then
|
||||
calls into "exec" to execute it.
|
||||
|
||||
project_dir (Path): Path to project directory.
|
||||
subcommand (str): Name of command to run.
|
||||
force (bool): Force re-running, even if nothing changed.
|
||||
dry (bool): Perform a dry run and don't execute commands.
|
||||
"""
|
||||
config = load_project_config(project_dir)
|
||||
variables = config.get("variables", {})
|
||||
commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
|
||||
workflows = config.get("workflows", {})
|
||||
validate_subcommand(commands.keys(), workflows.keys(), subcommand)
|
||||
if subcommand in workflows:
|
||||
msg.info(f"Running workflow '{subcommand}'")
|
||||
for cmd in workflows[subcommand]:
|
||||
project_run(project_dir, cmd, force=force, dry=dry)
|
||||
else:
|
||||
cmd = commands[subcommand]
|
||||
variables = config.get("variables", {})
|
||||
for dep in cmd.get("deps", []):
|
||||
dep = dep.format(**variables)
|
||||
if not (project_dir / dep).exists():
|
||||
err = f"Missing dependency specified by command '{subcommand}': {dep}"
|
||||
err_kwargs = {"exits": 1} if not dry else {}
|
||||
msg.fail(err, **err_kwargs)
|
||||
with working_dir(project_dir) as current_dir:
|
||||
rerun = check_rerun(current_dir, cmd, variables)
|
||||
if not rerun and not force:
|
||||
msg.info(f"Skipping '{cmd['name']}': nothing changed")
|
||||
else:
|
||||
msg.divider(subcommand)
|
||||
run_commands(cmd["script"], variables, dry=dry)
|
||||
if not dry:
|
||||
update_lockfile(current_dir, cmd, variables)
|
||||
|
||||
|
||||
def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
|
||||
"""Simulate a CLI help prompt using the info available in the project.yml.
|
||||
|
||||
project_dir (Path): The project directory.
|
||||
subcommand (Optional[str]): The subcommand or None. If a subcommand is
|
||||
provided, the subcommand help is shown. Otherwise, the top-level help
|
||||
and a list of available commands is printed.
|
||||
"""
|
||||
config = load_project_config(project_dir)
|
||||
config_commands = config.get("commands", [])
|
||||
commands = {cmd["name"]: cmd for cmd in config_commands}
|
||||
workflows = config.get("workflows", {})
|
||||
project_loc = "" if is_cwd(project_dir) else project_dir
|
||||
if subcommand:
|
||||
validate_subcommand(commands.keys(), workflows.keys(), subcommand)
|
||||
print(f"Usage: {COMMAND} project run {subcommand} {project_loc}")
|
||||
if subcommand in commands:
|
||||
help_text = commands[subcommand].get("help")
|
||||
if help_text:
|
||||
print(f"\n{help_text}\n")
|
||||
elif subcommand in workflows:
|
||||
steps = workflows[subcommand]
|
||||
print(f"\nWorkflow consisting of {len(steps)} commands:")
|
||||
steps_data = [
|
||||
(f"{i + 1}. {step}", commands[step].get("help", ""))
|
||||
for i, step in enumerate(steps)
|
||||
]
|
||||
msg.table(steps_data)
|
||||
help_cmd = f"{COMMAND} project run [COMMAND] {project_loc} --help"
|
||||
print(f"For command details, run: {help_cmd}")
|
||||
else:
|
||||
print("")
|
||||
if config_commands:
|
||||
print(f"Available commands in {PROJECT_FILE}")
|
||||
print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}")
|
||||
msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
|
||||
if workflows:
|
||||
print(f"Available workflows in {PROJECT_FILE}")
|
||||
print(f"Usage: {COMMAND} project run [WORKFLOW] {project_loc}")
|
||||
msg.table([(name, " -> ".join(steps)) for name, steps in workflows.items()])
|
||||
|
||||
|
||||
def run_commands(
|
||||
commands: List[str] = tuple(),
|
||||
variables: Dict[str, Any] = {},
|
||||
silent: bool = False,
|
||||
dry: bool = False,
|
||||
) -> None:
|
||||
"""Run a sequence of commands in a subprocess, in order.
|
||||
|
||||
commands (List[str]): The string commands.
|
||||
variables (Dict[str, Any]): Dictionary of variable names, mapped to their
|
||||
values. Will be used to substitute format string variables in the
|
||||
commands.
|
||||
silent (bool): Don't print the commands.
|
||||
dry (bool): Perform a dry run and don't execut anything.
|
||||
"""
|
||||
for command in commands:
|
||||
# Substitute variables, e.g. "./{NAME}.json"
|
||||
command = command.format(**variables)
|
||||
command = split_command(command)
|
||||
# Not sure if this is needed or a good idea. Motivation: users may often
|
||||
# use commands in their config that reference "python" and we want to
|
||||
# make sure that it's always executing the same Python that spaCy is
|
||||
# executed with and the pip in the same env, not some other Python/pip.
|
||||
# Also ensures cross-compatibility if user 1 writes "python3" (because
|
||||
# that's how it's set up on their system), and user 2 without the
|
||||
# shortcut tries to re-run the command.
|
||||
if len(command) and command[0] in ("python", "python3"):
|
||||
command[0] = sys.executable
|
||||
elif len(command) and command[0] in ("pip", "pip3"):
|
||||
command = [sys.executable, "-m", "pip", *command[1:]]
|
||||
if not silent:
|
||||
print(f"Running command: {join_command(command)}")
|
||||
if not dry:
|
||||
run_command(command)
|
||||
|
||||
|
||||
def validate_subcommand(
|
||||
commands: Sequence[str], workflows: Sequence[str], subcommand: str
|
||||
) -> None:
|
||||
"""Check that a subcommand is valid and defined. Raises an error otherwise.
|
||||
|
||||
commands (Sequence[str]): The available commands.
|
||||
subcommand (str): The subcommand.
|
||||
"""
|
||||
if not commands and not workflows:
|
||||
msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1)
|
||||
if subcommand not in commands and subcommand not in workflows:
|
||||
help_msg = []
|
||||
if commands:
|
||||
help_msg.append(f"Available commands: {', '.join(commands)}")
|
||||
if workflows:
|
||||
help_msg.append(f"Available workflows: {', '.join(workflows)}")
|
||||
msg.fail(
|
||||
f"Can't find command or workflow '{subcommand}' in {PROJECT_FILE}",
|
||||
". ".join(help_msg),
|
||||
exits=1,
|
||||
)
|
||||
|
||||
|
||||
def check_rerun(
|
||||
project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
|
||||
) -> bool:
|
||||
"""Check if a command should be rerun because its settings or inputs/outputs
|
||||
changed.
|
||||
|
||||
project_dir (Path): The current project directory.
|
||||
command (Dict[str, Any]): The command, as defined in the project.yml.
|
||||
variables (Dict[str, Any]): The variables defined in the project.yml.
|
||||
RETURNS (bool): Whether to re-run the command.
|
||||
"""
|
||||
lock_path = project_dir / PROJECT_LOCK
|
||||
if not lock_path.exists(): # We don't have a lockfile, run command
|
||||
return True
|
||||
data = srsly.read_yaml(lock_path)
|
||||
if command["name"] not in data: # We don't have info about this command
|
||||
return True
|
||||
entry = data[command["name"]]
|
||||
# Always run commands with no outputs (otherwise they'd always be skipped)
|
||||
if not entry.get("outs", []):
|
||||
return True
|
||||
# If the entry in the lockfile matches the lockfile entry that would be
|
||||
# generated from the current command, we don't rerun because it means that
|
||||
# all inputs/outputs, hashes and scripts are the same and nothing changed
|
||||
return get_hash(get_lock_entry(project_dir, command, variables)) != get_hash(entry)
|
||||
|
||||
|
||||
def update_lockfile(
|
||||
project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
|
||||
) -> None:
|
||||
"""Update the lockfile after running a command. Will create a lockfile if
|
||||
it doesn't yet exist and will add an entry for the current command, its
|
||||
script and dependencies/outputs.
|
||||
|
||||
project_dir (Path): The current project directory.
|
||||
command (Dict[str, Any]): The command, as defined in the project.yml.
|
||||
variables (Dict[str, Any]): The variables defined in the project.yml.
|
||||
"""
|
||||
lock_path = project_dir / PROJECT_LOCK
|
||||
if not lock_path.exists():
|
||||
srsly.write_yaml(lock_path, {})
|
||||
data = {}
|
||||
else:
|
||||
data = srsly.read_yaml(lock_path)
|
||||
data[command["name"]] = get_lock_entry(project_dir, command, variables)
|
||||
srsly.write_yaml(lock_path, data)
|
||||
|
||||
|
||||
def get_lock_entry(
|
||||
project_dir: Path, command: Dict[str, Any], variables: Dict[str, Any]
|
||||
) -> Dict[str, Any]:
|
||||
"""Get a lockfile entry for a given command. An entry includes the command,
|
||||
the script (command steps) and a list of dependencies and outputs with
|
||||
their paths and file hashes, if available. The format is based on the
|
||||
dvc.lock files, to keep things consistent.
|
||||
|
||||
project_dir (Path): The current project directory.
|
||||
command (Dict[str, Any]): The command, as defined in the project.yml.
|
||||
variables (Dict[str, Any]): The variables defined in the project.yml.
|
||||
RETURNS (Dict[str, Any]): The lockfile entry.
|
||||
"""
|
||||
deps = get_fileinfo(project_dir, command.get("deps", []), variables)
|
||||
outs = get_fileinfo(project_dir, command.get("outputs", []), variables)
|
||||
outs_nc = get_fileinfo(project_dir, command.get("outputs_no_cache", []), variables)
|
||||
return {
|
||||
"cmd": f"{COMMAND} run {command['name']}",
|
||||
"script": command["script"],
|
||||
"deps": deps,
|
||||
"outs": [*outs, *outs_nc],
|
||||
}
|
||||
|
||||
|
||||
def get_fileinfo(
|
||||
project_dir: Path, paths: List[str], variables: Dict[str, Any]
|
||||
) -> List[Dict[str, str]]:
|
||||
"""Generate the file information for a list of paths (dependencies, outputs).
|
||||
Includes the file path and the file's checksum.
|
||||
|
||||
project_dir (Path): The current project directory.
|
||||
paths (List[str]): The file paths.
|
||||
variables (Dict[str, Any]): The variables defined in the project.yml.
|
||||
RETURNS (List[Dict[str, str]]): The lockfile entry for a file.
|
||||
"""
|
||||
data = []
|
||||
for path in paths:
|
||||
path = path.format(**variables)
|
||||
file_path = project_dir / path
|
||||
md5 = get_checksum(file_path) if file_path.exists() else None
|
||||
data.append({"path": path, "md5": md5})
|
||||
return data
|
1225
spacy/cli/train.py
1225
spacy/cli/train.py
File diff suppressed because it is too large
Load Diff
|
@ -1,67 +1,49 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
from typing import Tuple
|
||||
from pathlib import Path
|
||||
import sys
|
||||
import requests
|
||||
import srsly
|
||||
from wasabi import msg
|
||||
from wasabi import msg, Printer
|
||||
|
||||
from ..compat import path2str
|
||||
from ..util import get_data_path
|
||||
from ._util import app
|
||||
from .. import about
|
||||
from ..util import get_package_version, get_installed_models, get_base_version
|
||||
from ..util import get_package_path, get_model_meta, is_compatible_version
|
||||
|
||||
|
||||
def validate():
|
||||
@app.command("validate")
|
||||
def validate_cli():
|
||||
"""
|
||||
Validate that the currently installed version of spaCy is compatible
|
||||
with the installed models. Should be run after `pip install -U spacy`.
|
||||
"""
|
||||
with msg.loading("Loading compatibility table..."):
|
||||
r = requests.get(about.__compatibility__)
|
||||
if r.status_code != 200:
|
||||
msg.fail(
|
||||
"Server error ({})".format(r.status_code),
|
||||
"Couldn't fetch compatibility table.",
|
||||
exits=1,
|
||||
)
|
||||
msg.good("Loaded compatibility table")
|
||||
compat = r.json()["spacy"]
|
||||
version = about.__version__
|
||||
version = version.rsplit(".dev", 1)[0]
|
||||
current_compat = compat.get(version)
|
||||
validate()
|
||||
|
||||
|
||||
def validate() -> None:
|
||||
model_pkgs, compat = get_model_pkgs()
|
||||
spacy_version = get_base_version(about.__version__)
|
||||
current_compat = compat.get(spacy_version, {})
|
||||
if not current_compat:
|
||||
msg.fail(
|
||||
"Can't find spaCy v{} in compatibility table".format(version),
|
||||
about.__compatibility__,
|
||||
exits=1,
|
||||
)
|
||||
all_models = set()
|
||||
for spacy_v, models in dict(compat).items():
|
||||
all_models.update(models.keys())
|
||||
for model, model_vs in models.items():
|
||||
compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
|
||||
model_links = get_model_links(current_compat)
|
||||
model_pkgs = get_model_pkgs(current_compat, all_models)
|
||||
incompat_links = {l for l, d in model_links.items() if not d["compat"]}
|
||||
msg.warn(f"No compatible models found for v{spacy_version} of spaCy")
|
||||
incompat_models = {d["name"] for _, d in model_pkgs.items() if not d["compat"]}
|
||||
incompat_models.update(
|
||||
[d["name"] for _, d in model_links.items() if not d["compat"]]
|
||||
)
|
||||
na_models = [m for m in incompat_models if m not in current_compat]
|
||||
update_models = [m for m in incompat_models if m in current_compat]
|
||||
spacy_dir = Path(__file__).parent.parent
|
||||
|
||||
msg.divider("Installed models (spaCy v{})".format(about.__version__))
|
||||
msg.info("spaCy installation: {}".format(path2str(spacy_dir)))
|
||||
msg.divider(f"Installed models (spaCy v{about.__version__})")
|
||||
msg.info(f"spaCy installation: {spacy_dir}")
|
||||
|
||||
if model_links or model_pkgs:
|
||||
header = ("TYPE", "NAME", "MODEL", "VERSION", "")
|
||||
if model_pkgs:
|
||||
header = ("NAME", "SPACY", "VERSION", "")
|
||||
rows = []
|
||||
for name, data in model_pkgs.items():
|
||||
rows.append(get_model_row(current_compat, name, data, msg))
|
||||
for name, data in model_links.items():
|
||||
rows.append(get_model_row(current_compat, name, data, msg, "link"))
|
||||
if data["compat"]:
|
||||
comp = msg.text("", color="green", icon="good", no_print=True)
|
||||
version = msg.text(data["version"], color="green", no_print=True)
|
||||
else:
|
||||
version = msg.text(data["version"], color="red", no_print=True)
|
||||
comp = f"--> {compat.get(data['name'], ['n/a'])[0]}"
|
||||
rows.append((data["name"], data["spacy"], version, comp))
|
||||
msg.table(rows, header=header)
|
||||
else:
|
||||
msg.text("No models found in your current environment.", exits=0)
|
||||
|
@ -71,78 +53,55 @@ def validate():
|
|||
cmd = "python -m spacy download {}"
|
||||
print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
|
||||
if na_models:
|
||||
msg.text(
|
||||
"The following models are not available for spaCy "
|
||||
"v{}: {}".format(about.__version__, ", ".join(na_models))
|
||||
msg.info(
|
||||
f"The following models are custom spaCy models or not "
|
||||
f"available for spaCy v{about.__version__}:",
|
||||
", ".join(na_models),
|
||||
)
|
||||
if incompat_links:
|
||||
msg.text(
|
||||
"You may also want to overwrite the incompatible links using the "
|
||||
"`python -m spacy link` command with `--force`, or remove them "
|
||||
"from the data directory. "
|
||||
"Data path: {path}".format(path=path2str(get_data_path()))
|
||||
)
|
||||
if incompat_models or incompat_links:
|
||||
if incompat_models:
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def get_model_links(compat):
|
||||
links = {}
|
||||
data_path = get_data_path()
|
||||
if data_path:
|
||||
models = [p for p in data_path.iterdir() if is_model_path(p)]
|
||||
for model in models:
|
||||
meta_path = Path(model) / "meta.json"
|
||||
if not meta_path.exists():
|
||||
continue
|
||||
meta = srsly.read_json(meta_path)
|
||||
link = model.parts[-1]
|
||||
name = meta["lang"] + "_" + meta["name"]
|
||||
links[link] = {
|
||||
"name": name,
|
||||
"version": meta["version"],
|
||||
"compat": is_compat(compat, name, meta["version"]),
|
||||
}
|
||||
return links
|
||||
|
||||
|
||||
def get_model_pkgs(compat, all_models):
|
||||
import pkg_resources
|
||||
|
||||
def get_model_pkgs(silent: bool = False) -> Tuple[dict, dict]:
|
||||
msg = Printer(no_print=silent, pretty=not silent)
|
||||
with msg.loading("Loading compatibility table..."):
|
||||
r = requests.get(about.__compatibility__)
|
||||
if r.status_code != 200:
|
||||
msg.fail(
|
||||
f"Server error ({r.status_code})",
|
||||
"Couldn't fetch compatibility table.",
|
||||
exits=1,
|
||||
)
|
||||
msg.good("Loaded compatibility table")
|
||||
compat = r.json()["spacy"]
|
||||
all_models = set()
|
||||
installed_models = get_installed_models()
|
||||
for spacy_v, models in dict(compat).items():
|
||||
all_models.update(models.keys())
|
||||
for model, model_vs in models.items():
|
||||
compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
|
||||
pkgs = {}
|
||||
for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
|
||||
for pkg_name in installed_models:
|
||||
package = pkg_name.replace("-", "_")
|
||||
if package in all_models:
|
||||
version = pkg_data.version
|
||||
pkgs[pkg_name] = {
|
||||
"name": package,
|
||||
"version": version,
|
||||
"compat": is_compat(compat, package, version),
|
||||
}
|
||||
return pkgs
|
||||
version = get_package_version(pkg_name)
|
||||
if package in compat:
|
||||
is_compat = version in compat[package]
|
||||
spacy_version = about.__version__
|
||||
else:
|
||||
model_path = get_package_path(package)
|
||||
model_meta = get_model_meta(model_path)
|
||||
spacy_version = model_meta.get("spacy_version", "n/a")
|
||||
is_compat = is_compatible_version(about.__version__, spacy_version)
|
||||
pkgs[pkg_name] = {
|
||||
"name": package,
|
||||
"version": version,
|
||||
"spacy": spacy_version,
|
||||
"compat": is_compat,
|
||||
}
|
||||
return pkgs, compat
|
||||
|
||||
|
||||
def get_model_row(compat, name, data, msg, model_type="package"):
|
||||
if data["compat"]:
|
||||
comp = msg.text("", color="green", icon="good", no_print=True)
|
||||
version = msg.text(data["version"], color="green", no_print=True)
|
||||
else:
|
||||
version = msg.text(data["version"], color="red", no_print=True)
|
||||
comp = "--> {}".format(compat.get(data["name"], ["n/a"])[0])
|
||||
return (model_type, name, data["name"], version, comp)
|
||||
|
||||
|
||||
def is_model_path(model_path):
|
||||
exclude = ["cache", "pycache", "__pycache__"]
|
||||
name = model_path.parts[-1]
|
||||
return model_path.is_dir() and name not in exclude and not name.startswith(".")
|
||||
|
||||
|
||||
def is_compat(compat, name, version):
|
||||
return name in compat and version in compat[name]
|
||||
|
||||
|
||||
def reformat_version(version):
|
||||
def reformat_version(version: str) -> str:
|
||||
"""Hack to reformat old versions ending on '-alpha' to match pip format."""
|
||||
if version.endswith("-alpha"):
|
||||
return version.replace("-alpha", "a0")
|
||||
|
|
129
spacy/compat.py
129
spacy/compat.py
|
@ -1,4 +1,3 @@
|
|||
# coding: utf8
|
||||
"""
|
||||
Helpers for Python and platform compatibility. To distinguish them from
|
||||
the builtin functions, replacement functions are suffixed with an underscore,
|
||||
|
@ -6,15 +5,9 @@ e.g. `unicode_`.
|
|||
|
||||
DOCS: https://spacy.io/api/top-level#compat
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import os
|
||||
import sys
|
||||
import itertools
|
||||
import ast
|
||||
import types
|
||||
|
||||
from thinc.neural.util import copy_array
|
||||
from thinc.util import copy_array
|
||||
|
||||
try:
|
||||
import cPickle as pickle
|
||||
|
@ -36,91 +29,23 @@ try:
|
|||
except ImportError:
|
||||
cupy = None
|
||||
|
||||
try:
|
||||
from thinc.neural.optimizers import Optimizer # noqa: F401
|
||||
except ImportError:
|
||||
from thinc.neural.optimizers import Adam as Optimizer # noqa: F401
|
||||
from thinc.api import Optimizer # noqa: F401
|
||||
|
||||
pickle = pickle
|
||||
copy_reg = copy_reg
|
||||
CudaStream = CudaStream
|
||||
cupy = cupy
|
||||
copy_array = copy_array
|
||||
izip = getattr(itertools, "izip", zip)
|
||||
|
||||
is_windows = sys.platform.startswith("win")
|
||||
is_linux = sys.platform.startswith("linux")
|
||||
is_osx = sys.platform == "darwin"
|
||||
|
||||
# See: https://github.com/benjaminp/six/blob/master/six.py
|
||||
is_python2 = sys.version_info[0] == 2
|
||||
is_python3 = sys.version_info[0] == 3
|
||||
is_python_pre_3_5 = is_python2 or (is_python3 and sys.version_info[1] < 5)
|
||||
|
||||
if is_python2:
|
||||
bytes_ = str
|
||||
unicode_ = unicode # noqa: F821
|
||||
basestring_ = basestring # noqa: F821
|
||||
input_ = raw_input # noqa: F821
|
||||
path2str = lambda path: str(path).decode("utf8")
|
||||
class_types = (type, types.ClassType)
|
||||
|
||||
elif is_python3:
|
||||
bytes_ = bytes
|
||||
unicode_ = str
|
||||
basestring_ = str
|
||||
input_ = input
|
||||
path2str = lambda path: str(path)
|
||||
class_types = (type, types.ClassType) if is_python_pre_3_5 else type
|
||||
|
||||
|
||||
def b_to_str(b_str):
|
||||
"""Convert a bytes object to a string.
|
||||
|
||||
b_str (bytes): The object to convert.
|
||||
RETURNS (unicode): The converted string.
|
||||
"""
|
||||
if is_python2:
|
||||
return b_str
|
||||
# Important: if no encoding is set, string becomes "b'...'"
|
||||
return str(b_str, encoding="utf8")
|
||||
|
||||
|
||||
def symlink_to(orig, dest):
|
||||
"""Create a symlink. Used for model shortcut links.
|
||||
|
||||
orig (unicode / Path): The origin path.
|
||||
dest (unicode / Path): The destination path of the symlink.
|
||||
"""
|
||||
if is_windows:
|
||||
import subprocess
|
||||
|
||||
subprocess.check_call(
|
||||
["mklink", "/d", path2str(orig), path2str(dest)], shell=True
|
||||
)
|
||||
else:
|
||||
orig.symlink_to(dest)
|
||||
|
||||
|
||||
def symlink_remove(link):
|
||||
"""Remove a symlink. Used for model shortcut links.
|
||||
|
||||
link (unicode / Path): The path to the symlink.
|
||||
"""
|
||||
# https://stackoverflow.com/q/26554135/6400719
|
||||
if os.path.isdir(path2str(link)) and is_windows:
|
||||
# this should only be on Py2.7 and windows
|
||||
os.rmdir(path2str(link))
|
||||
else:
|
||||
os.unlink(path2str(link))
|
||||
|
||||
|
||||
def is_config(python2=None, python3=None, windows=None, linux=None, osx=None):
|
||||
def is_config(windows=None, linux=None, osx=None, **kwargs):
|
||||
"""Check if a specific configuration of Python version and operating system
|
||||
matches the user's setup. Mostly used to display targeted error messages.
|
||||
|
||||
python2 (bool): spaCy is executed with Python 2.x.
|
||||
python3 (bool): spaCy is executed with Python 3.x.
|
||||
windows (bool): spaCy is executed on Windows.
|
||||
linux (bool): spaCy is executed on Linux.
|
||||
osx (bool): spaCy is executed on OS X or macOS.
|
||||
|
@ -129,53 +54,7 @@ def is_config(python2=None, python3=None, windows=None, linux=None, osx=None):
|
|||
DOCS: https://spacy.io/api/top-level#compat.is_config
|
||||
"""
|
||||
return (
|
||||
python2 in (None, is_python2)
|
||||
and python3 in (None, is_python3)
|
||||
and windows in (None, is_windows)
|
||||
windows in (None, is_windows)
|
||||
and linux in (None, is_linux)
|
||||
and osx in (None, is_osx)
|
||||
)
|
||||
|
||||
|
||||
def import_file(name, loc):
|
||||
"""Import module from a file. Used to load models from a directory.
|
||||
|
||||
name (unicode): Name of module to load.
|
||||
loc (unicode / Path): Path to the file.
|
||||
RETURNS: The loaded module.
|
||||
"""
|
||||
loc = path2str(loc)
|
||||
if is_python_pre_3_5:
|
||||
import imp
|
||||
|
||||
return imp.load_source(name, loc)
|
||||
else:
|
||||
import importlib.util
|
||||
|
||||
spec = importlib.util.spec_from_file_location(name, str(loc))
|
||||
module = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(module)
|
||||
return module
|
||||
|
||||
|
||||
def unescape_unicode(string):
|
||||
"""Python2.7's re module chokes when compiling patterns that have ranges
|
||||
between escaped unicode codepoints if the two codepoints are unrecognised
|
||||
in the unicode database. For instance:
|
||||
|
||||
re.compile('[\\uAA77-\\uAA79]').findall("hello")
|
||||
|
||||
Ends up matching every character (on Python 2). This problem doesn't occur
|
||||
if we're dealing with unicode literals.
|
||||
"""
|
||||
if string is None:
|
||||
return string
|
||||
# We only want to unescape the unicode, so we first must protect the other
|
||||
# backslashes.
|
||||
string = string.replace("\\", "\\\\")
|
||||
# Now we remove that protection for the unicode.
|
||||
string = string.replace("\\\\u", "\\u")
|
||||
string = string.replace("\\\\U", "\\U")
|
||||
# Now we unescape by evaling the string with the AST. This can't execute
|
||||
# code -- it only does the representational level.
|
||||
return ast.literal_eval("u'''" + string + "'''")
|
||||
|
|
|
@ -1,17 +1,13 @@
|
|||
# coding: utf8
|
||||
"""
|
||||
spaCy's built in visualization suite for dependencies and named entities.
|
||||
|
||||
DOCS: https://spacy.io/api/top-level#displacy
|
||||
USAGE: https://spacy.io/usage/visualizers
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import warnings
|
||||
|
||||
from .render import DependencyRenderer, EntityRenderer
|
||||
from ..tokens import Doc, Span
|
||||
from ..compat import b_to_str
|
||||
from ..errors import Errors, Warnings
|
||||
from ..util import is_in_jupyter
|
||||
|
||||
|
@ -26,13 +22,13 @@ def render(
|
|||
"""Render displaCy visualisation.
|
||||
|
||||
docs (list or Doc): Document(s) to visualise.
|
||||
style (unicode): Visualisation style, 'dep' or 'ent'.
|
||||
style (str): Visualisation style, 'dep' or 'ent'.
|
||||
page (bool): Render markup as full HTML page.
|
||||
minify (bool): Minify HTML markup.
|
||||
jupyter (bool): Override Jupyter auto-detection.
|
||||
options (dict): Visualiser-specific options, e.g. colors.
|
||||
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
||||
RETURNS (unicode): Rendered HTML markup.
|
||||
RETURNS (str): Rendered HTML markup.
|
||||
|
||||
DOCS: https://spacy.io/api/top-level#displacy.render
|
||||
USAGE: https://spacy.io/usage/visualizers
|
||||
|
@ -77,13 +73,13 @@ def serve(
|
|||
"""Serve displaCy visualisation.
|
||||
|
||||
docs (list or Doc): Document(s) to visualise.
|
||||
style (unicode): Visualisation style, 'dep' or 'ent'.
|
||||
style (str): Visualisation style, 'dep' or 'ent'.
|
||||
page (bool): Render markup as full HTML page.
|
||||
minify (bool): Minify HTML markup.
|
||||
options (dict): Visualiser-specific options, e.g. colors.
|
||||
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
||||
port (int): Port to serve visualisation.
|
||||
host (unicode): Host to serve visualisation.
|
||||
host (str): Host to serve visualisation.
|
||||
|
||||
DOCS: https://spacy.io/api/top-level#displacy.serve
|
||||
USAGE: https://spacy.io/usage/visualizers
|
||||
|
@ -95,20 +91,20 @@ def serve(
|
|||
|
||||
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
|
||||
httpd = simple_server.make_server(host, port, app)
|
||||
print("\nUsing the '{}' visualizer".format(style))
|
||||
print("Serving on http://{}:{} ...\n".format(host, port))
|
||||
print(f"\nUsing the '{style}' visualizer")
|
||||
print(f"Serving on http://{host}:{port} ...\n")
|
||||
try:
|
||||
httpd.serve_forever()
|
||||
except KeyboardInterrupt:
|
||||
print("Shutting down server on port {}.".format(port))
|
||||
print(f"Shutting down server on port {port}.")
|
||||
finally:
|
||||
httpd.server_close()
|
||||
|
||||
|
||||
def app(environ, start_response):
|
||||
# Headers and status need to be bytes in Python 2, see #1227
|
||||
headers = [(b_to_str(b"Content-type"), b_to_str(b"text/html; charset=utf-8"))]
|
||||
start_response(b_to_str(b"200 OK"), headers)
|
||||
headers = [("Content-type", "text/html; charset=utf-8")]
|
||||
start_response("200 OK", headers)
|
||||
res = _html["parsed"].encode(encoding="utf-8")
|
||||
return [res]
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import uuid
|
||||
|
||||
from .templates import (
|
||||
|
@ -19,7 +16,7 @@ DEFAULT_LANG = "en"
|
|||
DEFAULT_DIR = "ltr"
|
||||
|
||||
|
||||
class DependencyRenderer(object):
|
||||
class DependencyRenderer:
|
||||
"""Render dependency parses as SVGs."""
|
||||
|
||||
style = "dep"
|
||||
|
@ -50,7 +47,7 @@ class DependencyRenderer(object):
|
|||
parsed (list): Dependency parses to render.
|
||||
page (bool): Render parses wrapped as full HTML page.
|
||||
minify (bool): Minify HTML markup.
|
||||
RETURNS (unicode): Rendered SVG or HTML markup.
|
||||
RETURNS (str): Rendered SVG or HTML markup.
|
||||
"""
|
||||
# Create a random ID prefix to make sure parses don't receive the
|
||||
# same ID, even if they're identical
|
||||
|
@ -61,7 +58,7 @@ class DependencyRenderer(object):
|
|||
settings = p.get("settings", {})
|
||||
self.direction = settings.get("direction", DEFAULT_DIR)
|
||||
self.lang = settings.get("lang", DEFAULT_LANG)
|
||||
render_id = "{}-{}".format(id_prefix, i)
|
||||
render_id = f"{id_prefix}-{i}"
|
||||
svg = self.render_svg(render_id, p["words"], p["arcs"])
|
||||
rendered.append(svg)
|
||||
if page:
|
||||
|
@ -81,7 +78,7 @@ class DependencyRenderer(object):
|
|||
render_id (int): Unique ID, typically index of document.
|
||||
words (list): Individual words and their tags.
|
||||
arcs (list): Individual arcs and their start, end, direction and label.
|
||||
RETURNS (unicode): Rendered SVG markup.
|
||||
RETURNS (str): Rendered SVG markup.
|
||||
"""
|
||||
self.levels = self.get_levels(arcs)
|
||||
self.highest_level = len(self.levels)
|
||||
|
@ -115,10 +112,10 @@ class DependencyRenderer(object):
|
|||
):
|
||||
"""Render individual word.
|
||||
|
||||
text (unicode): Word text.
|
||||
tag (unicode): Part-of-speech tag.
|
||||
text (str): Word text.
|
||||
tag (str): Part-of-speech tag.
|
||||
i (int): Unique ID, typically word index.
|
||||
RETURNS (unicode): Rendered SVG markup.
|
||||
RETURNS (str): Rendered SVG markup.
|
||||
"""
|
||||
y = self.offset_y + self.word_spacing
|
||||
x = self.offset_x + i * self.distance
|
||||
|
@ -134,12 +131,12 @@ class DependencyRenderer(object):
|
|||
def render_arrow(self, label, start, end, direction, i):
|
||||
"""Render individual arrow.
|
||||
|
||||
label (unicode): Dependency label.
|
||||
label (str): Dependency label.
|
||||
start (int): Index of start word.
|
||||
end (int): Index of end word.
|
||||
direction (unicode): Arrow direction, 'left' or 'right'.
|
||||
direction (str): Arrow direction, 'left' or 'right'.
|
||||
i (int): Unique ID, typically arrow index.
|
||||
RETURNS (unicode): Rendered SVG markup.
|
||||
RETURNS (str): Rendered SVG markup.
|
||||
"""
|
||||
if start < 0 or end < 0:
|
||||
error_args = dict(start=start, end=end, label=label, dir=direction)
|
||||
|
@ -182,7 +179,7 @@ class DependencyRenderer(object):
|
|||
y (int): Y-coordinate of arrow start and end point.
|
||||
y_curve (int): Y-corrdinate of Cubic Bézier y_curve point.
|
||||
x_end (int): X-coordinate of arrow end point.
|
||||
RETURNS (unicode): Definition of the arc path ('d' attribute).
|
||||
RETURNS (str): Definition of the arc path ('d' attribute).
|
||||
"""
|
||||
template = "M{x},{y} C{x},{c} {e},{c} {e},{y}"
|
||||
if self.compact:
|
||||
|
@ -192,11 +189,11 @@ class DependencyRenderer(object):
|
|||
def get_arrowhead(self, direction, x, y, end):
|
||||
"""Render individual arrow head.
|
||||
|
||||
direction (unicode): Arrow direction, 'left' or 'right'.
|
||||
direction (str): Arrow direction, 'left' or 'right'.
|
||||
x (int): X-coordinate of arrow start point.
|
||||
y (int): Y-coordinate of arrow start and end point.
|
||||
end (int): X-coordinate of arrow end point.
|
||||
RETURNS (unicode): Definition of the arrow head path ('d' attribute).
|
||||
RETURNS (str): Definition of the arrow head path ('d' attribute).
|
||||
"""
|
||||
if direction == "left":
|
||||
pos1, pos2, pos3 = (x, x - self.arrow_width + 2, x + self.arrow_width - 2)
|
||||
|
@ -227,7 +224,7 @@ class DependencyRenderer(object):
|
|||
return sorted(list(levels))
|
||||
|
||||
|
||||
class EntityRenderer(object):
|
||||
class EntityRenderer:
|
||||
"""Render named entities as HTML."""
|
||||
|
||||
style = "ent"
|
||||
|
@ -282,7 +279,7 @@ class EntityRenderer(object):
|
|||
parsed (list): Dependency parses to render.
|
||||
page (bool): Render parses wrapped as full HTML page.
|
||||
minify (bool): Minify HTML markup.
|
||||
RETURNS (unicode): Rendered HTML markup.
|
||||
RETURNS (str): Rendered HTML markup.
|
||||
"""
|
||||
rendered = []
|
||||
for i, p in enumerate(parsed):
|
||||
|
@ -303,9 +300,9 @@ class EntityRenderer(object):
|
|||
def render_ents(self, text, spans, title):
|
||||
"""Render entities in text.
|
||||
|
||||
text (unicode): Original text.
|
||||
text (str): Original text.
|
||||
spans (list): Individual entity spans and their start, end and label.
|
||||
title (unicode or None): Document title set in Doc.user_data['title'].
|
||||
title (str / None): Document title set in Doc.user_data['title'].
|
||||
"""
|
||||
markup = ""
|
||||
offset = 0
|
||||
|
|
|
@ -1,7 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
# Setting explicit height and max-width: none on the SVG is required for
|
||||
# Jupyter to render it properly in a cell
|
||||
|
||||
|
|
279
spacy/errors.py
279
spacy/errors.py
|
@ -1,7 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
def add_codes(err_cls):
|
||||
"""Add error codes to string messages via class attribute names."""
|
||||
|
||||
|
@ -19,17 +15,7 @@ def add_codes(err_cls):
|
|||
# fmt: off
|
||||
|
||||
@add_codes
|
||||
class Warnings(object):
|
||||
W001 = ("As of spaCy v2.0, the keyword argument `path=` is deprecated. "
|
||||
"You can now call spacy.load with the path as its first argument, "
|
||||
"and the model's meta.json will be used to determine the language "
|
||||
"to load. For example:\nnlp = spacy.load('{path}')")
|
||||
W002 = ("Tokenizer.from_list is now deprecated. Create a new Doc object "
|
||||
"instead and pass in the strings as the `words` keyword argument, "
|
||||
"for example:\nfrom spacy.tokens import Doc\n"
|
||||
"doc = Doc(nlp.vocab, words=[...])")
|
||||
W003 = ("Positional arguments to Doc.merge are deprecated. Instead, use "
|
||||
"the keyword arguments, for example tag=, lemma= or ent_type=.")
|
||||
class Warnings:
|
||||
W004 = ("No text fixing enabled. Run `pip install ftfy` to enable fixing "
|
||||
"using ftfy.fix_text if necessary.")
|
||||
W005 = ("Doc object not parsed. This means displaCy won't be able to "
|
||||
|
@ -49,12 +35,6 @@ class Warnings(object):
|
|||
"use context-sensitive tensors. You can always add your own word "
|
||||
"vectors, or use one of the larger models instead if available.")
|
||||
W008 = ("Evaluating {obj}.similarity based on empty vectors.")
|
||||
W009 = ("Custom factory '{name}' provided by entry points of another "
|
||||
"package overwrites built-in factory.")
|
||||
W010 = ("As of v2.1.0, the PhraseMatcher doesn't have a phrase length "
|
||||
"limit anymore, so the max_length argument is now deprecated. "
|
||||
"If you did not specify this parameter, make sure you call the "
|
||||
"constructor with named arguments instead of positional ones.")
|
||||
W011 = ("It looks like you're calling displacy.serve from within a "
|
||||
"Jupyter notebook or a similar environment. This likely means "
|
||||
"you're already running a local web server, so there's no need to "
|
||||
|
@ -68,23 +48,9 @@ class Warnings(object):
|
|||
"components are applied. To only create tokenized Doc objects, "
|
||||
"try using `nlp.make_doc(text)` or process all texts as a stream "
|
||||
"using `list(nlp.tokenizer.pipe(all_texts))`.")
|
||||
W013 = ("As of v2.1.0, {obj}.merge is deprecated. Please use the more "
|
||||
"efficient and less error-prone Doc.retokenize context manager "
|
||||
"instead.")
|
||||
W014 = ("As of v2.1.0, the `disable` keyword argument on the serialization "
|
||||
"methods is and should be replaced with `exclude`. This makes it "
|
||||
"consistent with the other serializable objects.")
|
||||
W015 = ("As of v2.1.0, the use of keyword arguments to exclude fields from "
|
||||
"being serialized or deserialized is deprecated. Please use the "
|
||||
"`exclude` argument instead. For example: exclude=['{arg}'].")
|
||||
W016 = ("The keyword argument `n_threads` is now deprecated. As of v2.2.2, "
|
||||
"the argument `n_process` controls parallel inference via "
|
||||
"multiprocessing.")
|
||||
W017 = ("Alias '{alias}' already exists in the Knowledge Base.")
|
||||
W018 = ("Entity '{entity}' already exists in the Knowledge Base - "
|
||||
"ignoring the duplicate entry.")
|
||||
W019 = ("Changing vectors name from {old} to {new}, to avoid clash with "
|
||||
"previously loaded vectors. See Issue #3853.")
|
||||
W020 = ("Unnamed vectors. This won't allow multiple vectors models to be "
|
||||
"loaded. (Shape: {shape})")
|
||||
W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
|
||||
|
@ -93,10 +59,8 @@ class Warnings(object):
|
|||
"lemmatization rules or data. This means that the trained model "
|
||||
"may not be able to lemmatize correctly. If this is intentional "
|
||||
"or the language you're using doesn't have lemmatization data, "
|
||||
"please ignore this warning. If this is surprising, make sure you "
|
||||
"you can ignore this warning. If this is surprising, make sure you "
|
||||
"have the spacy-lookups-data package installed.")
|
||||
W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. "
|
||||
"'n_process' will be set to 1.")
|
||||
W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
|
||||
"the Knowledge Base.")
|
||||
W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
|
||||
|
@ -108,25 +72,11 @@ class Warnings(object):
|
|||
W028 = ("Doc.from_array was called with a vector of type '{type}', "
|
||||
"but is expecting one of type 'uint64' instead. This may result "
|
||||
"in problems with the vocab further on in the pipeline.")
|
||||
W029 = ("Unable to align tokens with entities from character offsets. "
|
||||
"Discarding entity annotation for the text: {text}.")
|
||||
W030 = ("Some entities could not be aligned in the text \"{text}\" with "
|
||||
"entities \"{entities}\". Use "
|
||||
"`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
|
||||
" to check the alignment. Misaligned entities (with BILUO tag '-') "
|
||||
"will be ignored during training.")
|
||||
W031 = ("Model '{model}' ({model_version}) requires spaCy {version} and "
|
||||
"is incompatible with the current spaCy version ({current}). This "
|
||||
"may lead to unexpected results or runtime errors. To resolve "
|
||||
"this, download a newer compatible model or retrain your custom "
|
||||
"model with the current spaCy version. For more details and "
|
||||
"available updates, run: python -m spacy validate")
|
||||
W032 = ("Unable to determine model compatibility for model '{model}' "
|
||||
"({model_version}) with the current spaCy version ({current}). "
|
||||
"This may lead to unexpected results or runtime errors. To resolve "
|
||||
"this, download a newer compatible model or retrain your custom "
|
||||
"model with the current spaCy version. For more details and "
|
||||
"available updates, run: python -m spacy validate")
|
||||
" to check the alignment. Misaligned entities ('-') will be "
|
||||
"ignored during training.")
|
||||
W033 = ("Training a new {model} using a model with no lexeme normalization "
|
||||
"table. This may degrade the performance of the model to some "
|
||||
"degree. If this is intentional or the language you're using "
|
||||
|
@ -135,9 +85,44 @@ class Warnings(object):
|
|||
"package installed. The languages with lexeme normalization tables "
|
||||
"are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.")
|
||||
|
||||
# TODO: fix numbering after merging develop into master
|
||||
W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
|
||||
W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
|
||||
W093 = ("Could not find any data to train the {name} on. Is your "
|
||||
"input data correctly formatted ?")
|
||||
W094 = ("Model '{model}' ({model_version}) specifies an under-constrained "
|
||||
"spaCy version requirement: {version}. This can lead to compatibility "
|
||||
"problems with older versions, or as new spaCy versions are "
|
||||
"released, because the model may say it's compatible when it's "
|
||||
'not. Consider changing the "spacy_version" in your meta.json to a '
|
||||
"version range, with a lower and upper pin. For example: {example}")
|
||||
W095 = ("Model '{model}' ({model_version}) requires spaCy {version} and is "
|
||||
"incompatible with the current version ({current}). This may lead "
|
||||
"to unexpected results or runtime errors. To resolve this, "
|
||||
"download a newer compatible model or retrain your custom model "
|
||||
"with the current spaCy version. For more details and available "
|
||||
"updates, run: python -m spacy validate")
|
||||
W096 = ("The method 'disable_pipes' has become deprecated - use 'select_pipes' "
|
||||
"instead.")
|
||||
W097 = ("No Model config was provided to create the '{name}' component, "
|
||||
"and no default configuration could be found either.")
|
||||
W098 = ("No Model config was provided to create the '{name}' component, "
|
||||
"so a default configuration was used.")
|
||||
W099 = ("Expected 'dict' type for the 'model' argument of pipe '{pipe}', "
|
||||
"but got '{type}' instead, so ignoring it.")
|
||||
W100 = ("Skipping unsupported morphological feature(s): '{feature}'. "
|
||||
"Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
|
||||
"string \"Field1=Value1,Value2|Field2=Value3\".")
|
||||
W101 = ("Skipping `Doc` custom extension '{name}' while merging docs.")
|
||||
W102 = ("Skipping unsupported user data '{key}: {value}' while merging docs.")
|
||||
W103 = ("Unknown {lang} word segmenter '{segmenter}'. Supported "
|
||||
"word segmenters: {supported}. Defaulting to {default}.")
|
||||
W104 = ("Skipping modifications for '{target}' segmenter. The current "
|
||||
"segmenter is '{current}'.")
|
||||
|
||||
|
||||
@add_codes
|
||||
class Errors(object):
|
||||
class Errors:
|
||||
E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
|
||||
E002 = ("Can't find factory for '{name}'. This usually happens when spaCy "
|
||||
"calls `nlp.create_pipe` with a component name that's not built "
|
||||
|
@ -156,21 +141,16 @@ class Errors(object):
|
|||
E007 = ("'{name}' already exists in pipeline. Existing names: {opts}")
|
||||
E008 = ("Some current components would be lost when restoring previous "
|
||||
"pipeline state. If you added components after calling "
|
||||
"`nlp.disable_pipes()`, you should remove them explicitly with "
|
||||
"`nlp.select_pipes()`, you should remove them explicitly with "
|
||||
"`nlp.remove_pipe()` before the pipeline is restored. Names of "
|
||||
"the new components: {names}")
|
||||
E009 = ("The `update` method expects same number of docs and golds, but "
|
||||
"got: {n_docs} docs, {n_golds} golds.")
|
||||
E010 = ("Word vectors set to length 0. This may be because you don't have "
|
||||
"a model installed or loaded, or because your model doesn't "
|
||||
"include word vectors. For more info, see the docs:\n"
|
||||
"https://spacy.io/usage/models")
|
||||
E011 = ("Unknown operator: '{op}'. Options: {opts}")
|
||||
E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
|
||||
E013 = ("Error selecting action in matcher")
|
||||
E014 = ("Unknown tag ID: {tag}")
|
||||
E015 = ("Conflicting morphology exception for ({tag}, {orth}). Use "
|
||||
"`force=True` to overwrite.")
|
||||
E016 = ("MultitaskObjective target should be function or one of: dep, "
|
||||
"tag, ent, dep_tag_offset, ent_tag.")
|
||||
E017 = ("Can only add unicode or bytes. Got type: {value_type}")
|
||||
|
@ -178,21 +158,8 @@ class Errors(object):
|
|||
"refers to an issue with the `Vocab` or `StringStore`.")
|
||||
E019 = ("Can't create transition with unknown action ID: {action}. Action "
|
||||
"IDs are enumerated in spacy/syntax/{src}.pyx.")
|
||||
E020 = ("Could not find a gold-standard action to supervise the "
|
||||
"dependency parser. The tree is non-projective (i.e. it has "
|
||||
"crossing arcs - see spacy/syntax/nonproj.pyx for definitions). "
|
||||
"The ArcEager transition system only supports projective trees. "
|
||||
"To learn non-projective representations, transform the data "
|
||||
"before training and after parsing. Either pass "
|
||||
"`make_projective=True` to the GoldParse class, or use "
|
||||
"spacy.syntax.nonproj.preprocess_training_data.")
|
||||
E021 = ("Could not find a gold-standard action to supervise the "
|
||||
"dependency parser. The GoldParse was projective. The transition "
|
||||
"system has {n_actions} actions. State at failure: {state}")
|
||||
E022 = ("Could not find a transition with the name '{name}' in the NER "
|
||||
"model.")
|
||||
E023 = ("Error cleaning up beam: The same state occurred twice at "
|
||||
"memory address {addr} and position {i}.")
|
||||
E024 = ("Could not find an optimal move to supervise the parser. Usually, "
|
||||
"this means that the model can't be updated in a way that's valid "
|
||||
"and satisfies the correct annotations specified in the GoldParse. "
|
||||
|
@ -217,7 +184,7 @@ class Errors(object):
|
|||
"the documentation:\nhttps://spacy.io/usage/models")
|
||||
E030 = ("Sentence boundaries unset. You can add the 'sentencizer' "
|
||||
"component to the pipeline with: "
|
||||
"nlp.add_pipe(nlp.create_pipe('sentencizer')) "
|
||||
"nlp.add_pipe(nlp.create_pipe('sentencizer')). "
|
||||
"Alternatively, add the dependency parser, or set sentence "
|
||||
"boundaries by setting doc[i].is_sent_start.")
|
||||
E031 = ("Invalid token: empty string ('') at position {i}.")
|
||||
|
@ -227,16 +194,12 @@ class Errors(object):
|
|||
"the HEAD attribute would potentially override the sentence "
|
||||
"boundaries set by SENT_START.")
|
||||
E033 = ("Cannot load into non-empty Doc of length {length}.")
|
||||
E034 = ("Doc.merge received {n_args} non-keyword arguments. Expected "
|
||||
"either 3 arguments (deprecated), or 0 (use keyword arguments).\n"
|
||||
"Arguments supplied:\n{args}\nKeyword arguments:{kwargs}")
|
||||
E035 = ("Error creating span with start {start} and end {end} for Doc of "
|
||||
"length {length}.")
|
||||
E036 = ("Error calculating span: Can't find a token starting at character "
|
||||
"offset {start}.")
|
||||
E037 = ("Error calculating span: Can't find a token ending at character "
|
||||
"offset {end}.")
|
||||
E038 = ("Error finding sentence for span. Infinite loop detected.")
|
||||
E039 = ("Array bounds exceeded while searching for root word. This likely "
|
||||
"means the parse tree is in an invalid state. Please report this "
|
||||
"issue here: http://github.com/explosion/spaCy/issues")
|
||||
|
@ -253,15 +216,10 @@ class Errors(object):
|
|||
E047 = ("Can't assign a value to unregistered extension attribute "
|
||||
"'{name}'. Did you forget to call the `set_extension` method?")
|
||||
E048 = ("Can't import language {lang} from spacy.lang: {err}")
|
||||
E049 = ("Can't find spaCy data directory: '{path}'. Check your "
|
||||
"installation and permissions, or use spacy.util.set_data_path "
|
||||
"to customise the location if necessary.")
|
||||
E050 = ("Can't find model '{name}'. It doesn't seem to be a shortcut "
|
||||
"link, a Python package or a valid path to a data directory.")
|
||||
E051 = ("Cant' load '{name}'. If you're using a shortcut link, make sure "
|
||||
"it points to a valid package (not just a data directory).")
|
||||
E050 = ("Can't find model '{name}'. It doesn't seem to be a Python "
|
||||
"package or a valid path to a data directory.")
|
||||
E052 = ("Can't find model directory: {path}")
|
||||
E053 = ("Could not read meta.json from {path}")
|
||||
E053 = ("Could not read {name} from {path}")
|
||||
E054 = ("No valid '{setting}' setting found in model meta.json.")
|
||||
E055 = ("Invalid ORTH value in exception:\nKey: {key}\nOrths: {orths}")
|
||||
E056 = ("Invalid tokenizer exception: ORTH values combined don't match "
|
||||
|
@ -272,8 +230,6 @@ class Errors(object):
|
|||
E059 = ("One (and only one) keyword arg must be set. Got: {kwargs}")
|
||||
E060 = ("Cannot add new key to vectors: the table is full. Current shape: "
|
||||
"({rows}, {cols}).")
|
||||
E061 = ("Bad file name: {filename}. Example of a valid file name: "
|
||||
"'vectors.128.f.bin'")
|
||||
E062 = ("Cannot find empty bit for new lexical flag. All bits between 0 "
|
||||
"and 63 are occupied. You can replace one by specifying the "
|
||||
"`flag_id` explicitly, e.g. "
|
||||
|
@ -287,39 +243,17 @@ class Errors(object):
|
|||
"Query string: {string}\nOrth cached: {orth}\nOrth ID: {orth_id}")
|
||||
E065 = ("Only one of the vector table's width and shape can be specified. "
|
||||
"Got width {width} and shape {shape}.")
|
||||
E066 = ("Error creating model helper for extracting columns. Can only "
|
||||
"extract columns by positive integer. Got: {value}.")
|
||||
E067 = ("Invalid BILUO tag sequence: Got a tag starting with 'I' (inside "
|
||||
"an entity) without a preceding 'B' (beginning of an entity). "
|
||||
"Tag sequence:\n{tags}")
|
||||
E068 = ("Invalid BILUO tag: '{tag}'.")
|
||||
E069 = ("Invalid gold-standard parse tree. Found cycle between word "
|
||||
"IDs: {cycle} (tokens: {cycle_tokens}) in the document starting "
|
||||
"with tokens: {doc_tokens}.")
|
||||
E070 = ("Invalid gold-standard data. Number of documents ({n_docs}) "
|
||||
"does not align with number of annotations ({n_annots}).")
|
||||
E071 = ("Error creating lexeme: specified orth ID ({orth}) does not "
|
||||
"match the one in the vocab ({vocab_orth}).")
|
||||
E072 = ("Error serializing lexeme: expected data length {length}, "
|
||||
"got {bad_length}.")
|
||||
E073 = ("Cannot assign vector of length {new_length}. Existing vectors "
|
||||
"are of length {length}. You can use `vocab.reset_vectors` to "
|
||||
"clear the existing vectors and resize the table.")
|
||||
E074 = ("Error interpreting compiled match pattern: patterns are expected "
|
||||
"to end with the attribute {attr}. Got: {bad_attr}.")
|
||||
E075 = ("Error accepting match: length ({length}) > maximum length "
|
||||
"({max_len}).")
|
||||
E076 = ("Error setting tensor on Doc: tensor has {rows} rows, while Doc "
|
||||
"has {words} words.")
|
||||
E077 = ("Error computing {value}: number of Docs ({n_docs}) does not "
|
||||
"equal number of GoldParse objects ({n_golds}) in batch.")
|
||||
E078 = ("Error computing score: number of words in Doc ({words_doc}) does "
|
||||
"not equal number of words in GoldParse ({words_gold}).")
|
||||
E079 = ("Error computing states in beam: number of predicted beams "
|
||||
"({pbeams}) does not equal number of gold beams ({gbeams}).")
|
||||
E080 = ("Duplicate state found in beam: {key}.")
|
||||
E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
|
||||
"does not equal number of losses ({losses}).")
|
||||
E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
|
||||
"projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
|
||||
"match.")
|
||||
|
@ -327,8 +261,6 @@ class Errors(object):
|
|||
"`getter` (plus optional `setter`) is allowed. Got: {nr_defined}")
|
||||
E084 = ("Error assigning label ID {label} to span: not in StringStore.")
|
||||
E085 = ("Can't create lexeme for string '{string}'.")
|
||||
E086 = ("Error deserializing lexeme '{string}': orth ID {orth_id} does "
|
||||
"not match hash {hash_id} in StringStore.")
|
||||
E087 = ("Unknown displaCy style: {style}.")
|
||||
E088 = ("Text of length {length} exceeds maximum of {max_length}. The "
|
||||
"v2.x parser and NER models require roughly 1GB of temporary "
|
||||
|
@ -370,17 +302,11 @@ class Errors(object):
|
|||
E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A "
|
||||
"token can only be part of one entity, so make sure the entities "
|
||||
"you're setting don't overlap.")
|
||||
E104 = ("Can't find JSON schema for '{name}'.")
|
||||
E105 = ("The Doc.print_tree() method is now deprecated. Please use "
|
||||
"Doc.to_json() instead or write your own function.")
|
||||
E106 = ("Can't find doc._.{attr} attribute specified in the underscore "
|
||||
"settings: {opts}")
|
||||
E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}")
|
||||
E108 = ("As of spaCy v2.1, the pipe name `sbd` has been deprecated "
|
||||
"in favor of the pipe name `sentencizer`, which does the same "
|
||||
"thing. For example, use `nlp.create_pipeline('sentencizer')`")
|
||||
E109 = ("Model for component '{name}' not initialized. Did you forget to "
|
||||
"load a model, or forget to call begin_training()?")
|
||||
E109 = ("Component '{name}' could not be run. Did you forget to "
|
||||
"call begin_training()?")
|
||||
E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
|
||||
E111 = ("Pickling a token is not supported, because tokens are only views "
|
||||
"of the parent Doc and can't exist on their own. A pickled token "
|
||||
|
@ -393,8 +319,6 @@ class Errors(object):
|
|||
"practically no advantage over pickling the parent Doc directly. "
|
||||
"So instead of pickling the span, pickle the Doc it belongs to or "
|
||||
"use Span.as_doc to convert the span to a standalone Doc object.")
|
||||
E113 = ("The newly split token can only have one root (head = 0).")
|
||||
E114 = ("The newly split token needs to have a root (head = 0).")
|
||||
E115 = ("All subtokens must have associated heads.")
|
||||
E116 = ("Cannot currently add labels to pretrained text classifier. Add "
|
||||
"labels before training begins. This functionality was available "
|
||||
|
@ -417,16 +341,9 @@ class Errors(object):
|
|||
"equal to span length ({span_len}).")
|
||||
E122 = ("Cannot find token to be split. Did it get merged?")
|
||||
E123 = ("Cannot find head of token to be split. Did it get merged?")
|
||||
E124 = ("Cannot read from file: {path}. Supported formats: {formats}")
|
||||
E125 = ("Unexpected value: {value}")
|
||||
E126 = ("Unexpected matcher predicate: '{bad}'. Expected one of: {good}. "
|
||||
"This is likely a bug in spaCy, so feel free to open an issue.")
|
||||
E127 = ("Cannot create phrase pattern representation for length 0. This "
|
||||
"is likely a bug in spaCy.")
|
||||
E128 = ("Unsupported serialization argument: '{arg}'. The use of keyword "
|
||||
"arguments to exclude fields from being serialized or deserialized "
|
||||
"is now deprecated. Please use the `exclude` argument instead. "
|
||||
"For example: exclude=['{arg}'].")
|
||||
E129 = ("Cannot write the label of an existing Span object because a Span "
|
||||
"is a read-only view of the underlying Token objects stored in the "
|
||||
"Doc. Instead, create a new Span object and specify the `label` "
|
||||
|
@ -450,8 +367,6 @@ class Errors(object):
|
|||
E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
|
||||
E135 = ("If you meant to replace a built-in component, use `create_pipe`: "
|
||||
"`nlp.replace_pipe('{name}', nlp.create_pipe('{name}'))`")
|
||||
E136 = ("This additional feature requires the jsonschema library to be "
|
||||
"installed:\npip install jsonschema")
|
||||
E137 = ("Expected 'dict' type, but got '{type}' from '{line}'. Make sure "
|
||||
"to provide a valid JSON object as input with either the `text` "
|
||||
"or `tokens` key. For more info, see the docs:\n"
|
||||
|
@ -459,18 +374,13 @@ class Errors(object):
|
|||
E138 = ("Invalid JSONL format for raw text '{text}'. Make sure the input "
|
||||
"includes either the `text` or `tokens` key. For more info, see "
|
||||
"the docs:\nhttps://spacy.io/api/cli#pretrain-jsonl")
|
||||
E139 = ("Knowledge Base for component '{name}' not initialized. Did you "
|
||||
"forget to call set_kb()?")
|
||||
E139 = ("Knowledge Base for component '{name}' is empty.")
|
||||
E140 = ("The list of entities, prior probabilities and entity vectors "
|
||||
"should be of equal length.")
|
||||
E141 = ("Entity vectors should be of length {required} instead of the "
|
||||
"provided {found}.")
|
||||
E142 = ("Unsupported loss_function '{loss_func}'. Use either 'L2' or "
|
||||
"'cosine'.")
|
||||
E143 = ("Labels for component '{name}' not initialized. Did you forget to "
|
||||
"call add_label()?")
|
||||
E144 = ("Could not find parameter `{param}` when building the entity "
|
||||
"linker model.")
|
||||
E145 = ("Error reading `{param}` from input file.")
|
||||
E146 = ("Could not access `{path}`.")
|
||||
E147 = ("Unexpected error in the {method} functionality of the "
|
||||
|
@ -482,8 +392,6 @@ class Errors(object):
|
|||
"the component matches the model being loaded.")
|
||||
E150 = ("The language of the `nlp` object and the `vocab` should be the "
|
||||
"same, but found '{nlp}' and '{vocab}' respectively.")
|
||||
E151 = ("Trying to call nlp.update without required annotation types. "
|
||||
"Expected top-level keys: {exp}. Got: {unexp}.")
|
||||
E152 = ("The attribute {attr} is not supported for token patterns. "
|
||||
"Please use the option validate=True with Matcher, PhraseMatcher, "
|
||||
"or EntityRuler for more details.")
|
||||
|
@ -520,11 +428,6 @@ class Errors(object):
|
|||
"that case.")
|
||||
E166 = ("Can only merge DocBins with the same pre-defined attributes.\n"
|
||||
"Current DocBin: {current}\nOther DocBin: {other}")
|
||||
E167 = ("Unknown morphological feature: '{feat}' ({feat_id}). This can "
|
||||
"happen if the tagger was trained with a different set of "
|
||||
"morphological features. If you're using a pretrained model, make "
|
||||
"sure that your models are up to date:\npython -m spacy validate")
|
||||
E168 = ("Unknown field: {field}")
|
||||
E169 = ("Can't find module: {module}")
|
||||
E170 = ("Cannot apply transition {name}: invalid for the current state.")
|
||||
E171 = ("Matcher.add received invalid on_match callback argument: expected "
|
||||
|
@ -532,11 +435,6 @@ class Errors(object):
|
|||
E172 = ("The Lemmatizer.load classmethod is deprecated. To create a "
|
||||
"Lemmatizer, initialize the class directly. See the docs for "
|
||||
"details: https://spacy.io/api/lemmatizer")
|
||||
E173 = ("As of v2.2, the Lemmatizer is initialized with an instance of "
|
||||
"Lookups containing the lemmatization tables. See the docs for "
|
||||
"details: https://spacy.io/api/lemmatizer#init")
|
||||
E174 = ("Architecture '{name}' not found in registry. Available "
|
||||
"names: {names}")
|
||||
E175 = ("Can't remove rule for unknown match pattern ID: {key}")
|
||||
E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
|
||||
E177 = ("Ill-formed IOB input detected: {tag}")
|
||||
|
@ -564,9 +462,6 @@ class Errors(object):
|
|||
"{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.")
|
||||
E186 = ("'{tok_a}' and '{tok_b}' are different texts.")
|
||||
E187 = ("Only unicode strings are supported as labels.")
|
||||
E188 = ("Could not match the gold entity links to entities in the doc - "
|
||||
"make sure the gold EL data refers to valid results of the "
|
||||
"named entity recognizer in the `nlp` pipeline.")
|
||||
E189 = ("Each argument to `get_doc` should be of equal length.")
|
||||
E190 = ("Token head out of range in `Doc.from_array()` for token index "
|
||||
"'{index}' with value '{value}' (equivalent to relative head "
|
||||
|
@ -587,20 +482,74 @@ class Errors(object):
|
|||
E198 = ("Unable to return {n} most similar vectors for the current vectors "
|
||||
"table, which contains {n_rows} vectors.")
|
||||
E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
|
||||
E200 = ("Specifying a base model with a pretrained component '{component}' "
|
||||
"can not be combined with adding a pretrained Tok2Vec layer.")
|
||||
|
||||
# TODO: fix numbering after merging develop into master
|
||||
E969 = ("Expected string values for field '{field}', but received {types} instead. ")
|
||||
E970 = ("Can not execute command '{str_command}'. Do you have '{tool}' installed?")
|
||||
E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
|
||||
"array and {doc_length} for the Doc itself.")
|
||||
E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.")
|
||||
E973 = ("Unexpected type for NER data")
|
||||
E974 = ("Unknown {obj} attribute: {key}")
|
||||
E976 = ("The method 'Example.from_dict' expects a {type} as {n} argument, "
|
||||
"but received None.")
|
||||
E977 = ("Can not compare a MorphAnalysis with a string object. "
|
||||
"This is likely a bug in spaCy, so feel free to open an issue.")
|
||||
E978 = ("The '{method}' method of {name} takes a list of Example objects, "
|
||||
"but found {types} instead.")
|
||||
E979 = ("Cannot convert {type} to an Example object.")
|
||||
E980 = ("Each link annotation should refer to a dictionary with at most one "
|
||||
"identifier mapping to 1.0, and all others to 0.0.")
|
||||
E981 = ("The offsets of the annotations for 'links' could not be aligned "
|
||||
"to token boundaries.")
|
||||
E982 = ("The 'ent_iob' attribute of a Token should be an integer indexing "
|
||||
"into {values}, but found {value}.")
|
||||
E983 = ("Invalid key for '{dict}': {key}. Available keys: "
|
||||
"{keys}")
|
||||
E985 = ("The pipeline component '{component}' is already available in the base "
|
||||
"model. The settings in the component block in the config file are "
|
||||
"being ignored. If you want to replace this component instead, set "
|
||||
"'replace' to True in the training configuration.")
|
||||
E986 = ("Could not create any training batches: check your input. "
|
||||
"Perhaps discard_oversize should be set to False ?")
|
||||
E987 = ("The text of an example training instance is either a Doc or "
|
||||
"a string, but found {type} instead.")
|
||||
E988 = ("Could not parse any training examples. Ensure the data is "
|
||||
"formatted correctly.")
|
||||
E989 = ("'nlp.update()' was called with two positional arguments. This "
|
||||
"may be due to a backwards-incompatible change to the format "
|
||||
"of the training data in spaCy 3.0 onwards. The 'update' "
|
||||
"function should now be called with a batch of 'Example' "
|
||||
"objects, instead of (text, annotation) tuples. ")
|
||||
E990 = ("An entity linking component needs to be initialized with a "
|
||||
"KnowledgeBase object, but found {type} instead.")
|
||||
E991 = ("The function 'select_pipes' should be called with either a "
|
||||
"'disable' argument to list the names of the pipe components "
|
||||
"that should be disabled, or with an 'enable' argument that "
|
||||
"specifies which pipes should not be disabled.")
|
||||
E992 = ("The function `select_pipes` was called with `enable`={enable} "
|
||||
"and `disable`={disable} but that information is conflicting "
|
||||
"for the `nlp` pipeline with components {names}.")
|
||||
E993 = ("The config for 'nlp' should include either a key 'name' to "
|
||||
"refer to an existing model by name or path, or a key 'lang' "
|
||||
"to create a new blank model.")
|
||||
E996 = ("Could not parse {file}: {msg}")
|
||||
E997 = ("Tokenizer special cases are not allowed to modify the text. "
|
||||
"This would map '{chunk}' to '{orth}' given token attributes "
|
||||
"'{token_attrs}'.")
|
||||
E999 = ("Unable to merge the `Doc` objects because they do not all share "
|
||||
"the same `Vocab`.")
|
||||
E1000 = ("No pkuseg model available. Provide a pkuseg model when "
|
||||
"initializing the pipeline: "
|
||||
'`cfg = {"segmenter": "pkuseg", "pkuseg_model": name_or_path}; '
|
||||
'nlp = Chinese(meta={"tokenizer": {"config": cfg}})`')
|
||||
|
||||
|
||||
@add_codes
|
||||
class TempErrors(object):
|
||||
class TempErrors:
|
||||
T003 = ("Resizing pretrained Tagger models is not currently supported.")
|
||||
T004 = ("Currently parser depth is hard-coded to 1. Received: {value}.")
|
||||
T007 = ("Can't yet set {attr} from Span. Vote for this feature on the "
|
||||
"issue tracker: http://github.com/explosion/spaCy/issues")
|
||||
T008 = ("Bad configuration of Tagger. This is probably a bug within "
|
||||
"spaCy. We changed the name of an internal attribute for loading "
|
||||
"pretrained vectors, and the class has been passed the old name "
|
||||
"(pretrained_dims) but not the new name (pretrained_vectors).")
|
||||
|
||||
|
||||
# fmt: on
|
||||
|
@ -610,14 +559,14 @@ class MatchPatternError(ValueError):
|
|||
def __init__(self, key, errors):
|
||||
"""Custom error for validating match patterns.
|
||||
|
||||
key (unicode): The name of the matcher rule.
|
||||
key (str): The name of the matcher rule.
|
||||
errors (dict): Validation errors (sequence of strings) mapped to pattern
|
||||
ID, i.e. the index of the added pattern.
|
||||
"""
|
||||
msg = "Invalid token patterns for matcher rule '{}'\n".format(key)
|
||||
msg = f"Invalid token patterns for matcher rule '{key}'\n"
|
||||
for pattern_idx, error_msgs in errors.items():
|
||||
pattern_errors = "\n".join(["- {}".format(e) for e in error_msgs])
|
||||
msg += "\nPattern {}:\n{}\n".format(pattern_idx, pattern_errors)
|
||||
pattern_errors = "\n".join([f"- {e}" for e in error_msgs])
|
||||
msg += f"\nPattern {pattern_idx}:\n{pattern_errors}\n"
|
||||
ValueError.__init__(self, msg)
|
||||
|
||||
|
||||
|
|
|
@ -1,12 +1,8 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
def explain(term):
|
||||
"""Get a description for a given POS tag, dependency label or entity type.
|
||||
|
||||
term (unicode): The term to explain.
|
||||
RETURNS (unicode): The explanation, or `None` if not found in the glossary.
|
||||
term (str): The term to explain.
|
||||
RETURNS (str): The explanation, or `None` if not found in the glossary.
|
||||
|
||||
EXAMPLE:
|
||||
>>> spacy.explain(u'NORP')
|
||||
|
|
|
@ -1,41 +0,0 @@
|
|||
from cymem.cymem cimport Pool
|
||||
|
||||
from .structs cimport TokenC
|
||||
from .typedefs cimport attr_t
|
||||
from .syntax.transition_system cimport Transition
|
||||
|
||||
|
||||
cdef struct GoldParseC:
|
||||
int* tags
|
||||
int* heads
|
||||
int* has_dep
|
||||
int* sent_start
|
||||
attr_t* labels
|
||||
int** brackets
|
||||
Transition* ner
|
||||
|
||||
|
||||
cdef class GoldParse:
|
||||
cdef Pool mem
|
||||
|
||||
cdef GoldParseC c
|
||||
|
||||
cdef int length
|
||||
cdef public int loss
|
||||
cdef public list words
|
||||
cdef public list tags
|
||||
cdef public list morphology
|
||||
cdef public list heads
|
||||
cdef public list labels
|
||||
cdef public dict orths
|
||||
cdef public list ner
|
||||
cdef public list ents
|
||||
cdef public dict brackets
|
||||
cdef public object cats
|
||||
cdef public dict links
|
||||
|
||||
cdef readonly list cand_to_gold
|
||||
cdef readonly list gold_to_cand
|
||||
cdef readonly list orig_annot
|
||||
|
||||
|
1004
spacy/gold.pyx
1004
spacy/gold.pyx
File diff suppressed because it is too large
Load Diff
0
spacy/gold/__init__.pxd
Normal file
0
spacy/gold/__init__.pxd
Normal file
11
spacy/gold/__init__.py
Normal file
11
spacy/gold/__init__.py
Normal file
|
@ -0,0 +1,11 @@
|
|||
from .corpus import Corpus
|
||||
from .example import Example
|
||||
from .align import Alignment
|
||||
|
||||
from .iob_utils import iob_to_biluo, biluo_to_iob
|
||||
from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags
|
||||
from .iob_utils import spans_from_biluo_tags
|
||||
from .iob_utils import tags_to_entities
|
||||
|
||||
from .gold_io import docs_to_json
|
||||
from .gold_io import read_json_file
|
30
spacy/gold/align.py
Normal file
30
spacy/gold/align.py
Normal file
|
@ -0,0 +1,30 @@
|
|||
from typing import List
|
||||
import numpy
|
||||
from thinc.types import Ragged
|
||||
from dataclasses import dataclass
|
||||
import tokenizations
|
||||
|
||||
|
||||
@dataclass
|
||||
class Alignment:
|
||||
x2y: Ragged
|
||||
y2x: Ragged
|
||||
|
||||
@classmethod
|
||||
def from_indices(cls, x2y: List[List[int]], y2x: List[List[int]]) -> "Alignment":
|
||||
x2y = _make_ragged(x2y)
|
||||
y2x = _make_ragged(y2x)
|
||||
return Alignment(x2y=x2y, y2x=y2x)
|
||||
|
||||
@classmethod
|
||||
def from_strings(cls, A: List[str], B: List[str]) -> "Alignment":
|
||||
x2y, y2x = tokenizations.get_alignments(A, B)
|
||||
return Alignment.from_indices(x2y=x2y, y2x=y2x)
|
||||
|
||||
|
||||
def _make_ragged(indices):
|
||||
lengths = numpy.array([len(x) for x in indices], dtype="i")
|
||||
flat = []
|
||||
for x in indices:
|
||||
flat.extend(x)
|
||||
return Ragged(numpy.array(flat, dtype="i"), lengths)
|
111
spacy/gold/augment.py
Normal file
111
spacy/gold/augment.py
Normal file
|
@ -0,0 +1,111 @@
|
|||
import random
|
||||
import itertools
|
||||
|
||||
|
||||
def make_orth_variants_example(nlp, example, orth_variant_level=0.0): # TODO: naming
|
||||
raw_text = example.text
|
||||
orig_dict = example.to_dict()
|
||||
variant_text, variant_token_annot = make_orth_variants(
|
||||
nlp, raw_text, orig_dict["token_annotation"], orth_variant_level
|
||||
)
|
||||
doc = nlp.make_doc(variant_text)
|
||||
orig_dict["token_annotation"] = variant_token_annot
|
||||
return example.from_dict(doc, orig_dict)
|
||||
|
||||
|
||||
def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0):
|
||||
if random.random() >= orth_variant_level:
|
||||
return raw_text, orig_token_dict
|
||||
if not orig_token_dict:
|
||||
return raw_text, orig_token_dict
|
||||
raw = raw_text
|
||||
token_dict = orig_token_dict
|
||||
lower = False
|
||||
if random.random() >= 0.5:
|
||||
lower = True
|
||||
if raw is not None:
|
||||
raw = raw.lower()
|
||||
ndsv = nlp.Defaults.single_orth_variants
|
||||
ndpv = nlp.Defaults.paired_orth_variants
|
||||
words = token_dict.get("words", [])
|
||||
tags = token_dict.get("tags", [])
|
||||
# keep unmodified if words or tags are not defined
|
||||
if words and tags:
|
||||
if lower:
|
||||
words = [w.lower() for w in words]
|
||||
# single variants
|
||||
punct_choices = [random.choice(x["variants"]) for x in ndsv]
|
||||
for word_idx in range(len(words)):
|
||||
for punct_idx in range(len(ndsv)):
|
||||
if (
|
||||
tags[word_idx] in ndsv[punct_idx]["tags"]
|
||||
and words[word_idx] in ndsv[punct_idx]["variants"]
|
||||
):
|
||||
words[word_idx] = punct_choices[punct_idx]
|
||||
# paired variants
|
||||
punct_choices = [random.choice(x["variants"]) for x in ndpv]
|
||||
for word_idx in range(len(words)):
|
||||
for punct_idx in range(len(ndpv)):
|
||||
if tags[word_idx] in ndpv[punct_idx]["tags"] and words[
|
||||
word_idx
|
||||
] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]):
|
||||
# backup option: random left vs. right from pair
|
||||
pair_idx = random.choice([0, 1])
|
||||
# best option: rely on paired POS tags like `` / ''
|
||||
if len(ndpv[punct_idx]["tags"]) == 2:
|
||||
pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx])
|
||||
# next best option: rely on position in variants
|
||||
# (may not be unambiguous, so order of variants matters)
|
||||
else:
|
||||
for pair in ndpv[punct_idx]["variants"]:
|
||||
if words[word_idx] in pair:
|
||||
pair_idx = pair.index(words[word_idx])
|
||||
words[word_idx] = punct_choices[punct_idx][pair_idx]
|
||||
token_dict["words"] = words
|
||||
token_dict["tags"] = tags
|
||||
# modify raw
|
||||
if raw is not None:
|
||||
variants = []
|
||||
for single_variants in ndsv:
|
||||
variants.extend(single_variants["variants"])
|
||||
for paired_variants in ndpv:
|
||||
variants.extend(
|
||||
list(itertools.chain.from_iterable(paired_variants["variants"]))
|
||||
)
|
||||
# store variants in reverse length order to be able to prioritize
|
||||
# longer matches (e.g., "---" before "--")
|
||||
variants = sorted(variants, key=lambda x: len(x))
|
||||
variants.reverse()
|
||||
variant_raw = ""
|
||||
raw_idx = 0
|
||||
# add initial whitespace
|
||||
while raw_idx < len(raw) and raw[raw_idx].isspace():
|
||||
variant_raw += raw[raw_idx]
|
||||
raw_idx += 1
|
||||
for word in words:
|
||||
match_found = False
|
||||
# skip whitespace words
|
||||
if word.isspace():
|
||||
match_found = True
|
||||
# add identical word
|
||||
elif word not in variants and raw[raw_idx:].startswith(word):
|
||||
variant_raw += word
|
||||
raw_idx += len(word)
|
||||
match_found = True
|
||||
# add variant word
|
||||
else:
|
||||
for variant in variants:
|
||||
if not match_found and raw[raw_idx:].startswith(variant):
|
||||
raw_idx += len(variant)
|
||||
variant_raw += word
|
||||
match_found = True
|
||||
# something went wrong, abort
|
||||
# (add a warning message?)
|
||||
if not match_found:
|
||||
return raw_text, orig_token_dict
|
||||
# add following whitespace
|
||||
while raw_idx < len(raw) and raw[raw_idx].isspace():
|
||||
variant_raw += raw[raw_idx]
|
||||
raw_idx += 1
|
||||
raw = variant_raw
|
||||
return raw, token_dict
|
4
spacy/gold/converters/__init__.py
Normal file
4
spacy/gold/converters/__init__.py
Normal file
|
@ -0,0 +1,4 @@
|
|||
from .iob2docs import iob2docs # noqa: F401
|
||||
from .conll_ner2docs import conll_ner2docs # noqa: F401
|
||||
from .json2docs import json2docs
|
||||
from .conllu2docs import conllu2docs # noqa: F401
|
|
@ -1,20 +1,18 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from wasabi import Printer
|
||||
|
||||
from .. import tags_to_entities
|
||||
from ...gold import iob_to_biluo
|
||||
from ...lang.xx import MultiLanguage
|
||||
from ...tokens.doc import Doc
|
||||
from ...tokens import Doc, Span
|
||||
from ...util import load_model
|
||||
|
||||
|
||||
def conll_ner2json(
|
||||
def conll_ner2docs(
|
||||
input_data, n_sents=10, seg_sents=False, model=None, no_print=False, **kwargs
|
||||
):
|
||||
"""
|
||||
Convert files in the CoNLL-2003 NER format and similar
|
||||
whitespace-separated columns into JSON format for use with train cli.
|
||||
whitespace-separated columns into Doc objects.
|
||||
|
||||
The first column is the tokens, the final column is the IOB tags. If an
|
||||
additional second column is present, the second column is the tags.
|
||||
|
@ -64,9 +62,9 @@ def conll_ner2json(
|
|||
# sentence segmentation required for document segmentation
|
||||
if n_sents > 0 and not seg_sents:
|
||||
msg.warn(
|
||||
"No sentence boundaries found to use with option `-n {}`. "
|
||||
"Use `-s` to automatically segment sentences or `-n 0` "
|
||||
"to disable.".format(n_sents)
|
||||
f"No sentence boundaries found to use with option `-n {n_sents}`. "
|
||||
f"Use `-s` to automatically segment sentences or `-n 0` "
|
||||
f"to disable."
|
||||
)
|
||||
else:
|
||||
n_sents_info(msg, n_sents)
|
||||
|
@ -84,17 +82,25 @@ def conll_ner2json(
|
|||
"No document delimiters found. Use `-n` to automatically group "
|
||||
"sentences into documents."
|
||||
)
|
||||
|
||||
if model:
|
||||
nlp = load_model(model)
|
||||
else:
|
||||
nlp = MultiLanguage()
|
||||
output_docs = []
|
||||
for doc in input_data.strip().split(doc_delimiter):
|
||||
doc = doc.strip()
|
||||
if not doc:
|
||||
for conll_doc in input_data.strip().split(doc_delimiter):
|
||||
conll_doc = conll_doc.strip()
|
||||
if not conll_doc:
|
||||
continue
|
||||
output_doc = []
|
||||
for sent in doc.split("\n\n"):
|
||||
sent = sent.strip()
|
||||
if not sent:
|
||||
words = []
|
||||
sent_starts = []
|
||||
pos_tags = []
|
||||
biluo_tags = []
|
||||
for conll_sent in conll_doc.split("\n\n"):
|
||||
conll_sent = conll_sent.strip()
|
||||
if not conll_sent:
|
||||
continue
|
||||
lines = [line.strip() for line in sent.split("\n") if line.strip()]
|
||||
lines = [line.strip() for line in conll_sent.split("\n") if line.strip()]
|
||||
cols = list(zip(*[line.split() for line in lines]))
|
||||
if len(cols) < 2:
|
||||
raise ValueError(
|
||||
|
@ -102,25 +108,19 @@ def conll_ner2json(
|
|||
"Try checking whitespace and delimiters. See "
|
||||
"https://spacy.io/api/cli#convert"
|
||||
)
|
||||
words = cols[0]
|
||||
iob_ents = cols[-1]
|
||||
if len(cols) > 2:
|
||||
tags = cols[1]
|
||||
else:
|
||||
tags = ["-"] * len(words)
|
||||
biluo_ents = iob_to_biluo(iob_ents)
|
||||
output_doc.append(
|
||||
{
|
||||
"tokens": [
|
||||
{"orth": w, "tag": tag, "ner": ent}
|
||||
for (w, tag, ent) in zip(words, tags, biluo_ents)
|
||||
]
|
||||
}
|
||||
)
|
||||
output_docs.append(
|
||||
{"id": len(output_docs), "paragraphs": [{"sentences": output_doc}]}
|
||||
)
|
||||
output_doc = []
|
||||
length = len(cols[0])
|
||||
words.extend(cols[0])
|
||||
sent_starts.extend([True] + [False] * (length - 1))
|
||||
biluo_tags.extend(iob_to_biluo(cols[-1]))
|
||||
pos_tags.extend(cols[1] if len(cols) > 2 else ["-"] * length)
|
||||
|
||||
doc = Doc(nlp.vocab, words=words)
|
||||
for i, token in enumerate(doc):
|
||||
token.tag_ = pos_tags[i]
|
||||
token.is_sent_start = sent_starts[i]
|
||||
entities = tags_to_entities(biluo_tags)
|
||||
doc.ents = [Span(doc, start=s, end=e + 1, label=L) for L, s, e in entities]
|
||||
output_docs.append(doc)
|
||||
return output_docs
|
||||
|
||||
|
||||
|
@ -129,7 +129,7 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
|
|||
if model:
|
||||
nlp = load_model(model)
|
||||
if "parser" in nlp.pipe_names:
|
||||
msg.info("Segmenting sentences with parser from model '{}'.".format(model))
|
||||
msg.info(f"Segmenting sentences with parser from model '{model}'.")
|
||||
sentencizer = nlp.get_pipe("parser")
|
||||
if not sentencizer:
|
||||
msg.info(
|
||||
|
@ -166,7 +166,7 @@ def segment_docs(input_data, n_sents, doc_delimiter):
|
|||
|
||||
|
||||
def n_sents_info(msg, n_sents):
|
||||
msg.info("Grouping every {} sentences into a document.".format(n_sents))
|
||||
msg.info(f"Grouping every {n_sents} sentences into a document.")
|
||||
if n_sents == 1:
|
||||
msg.warn(
|
||||
"To generate better training data, you may want to group "
|
295
spacy/gold/converters/conllu2docs.py
Normal file
295
spacy/gold/converters/conllu2docs.py
Normal file
|
@ -0,0 +1,295 @@
|
|||
import re
|
||||
|
||||
from .conll_ner2docs import n_sents_info
|
||||
from ...gold import Example
|
||||
from ...gold import iob_to_biluo, spans_from_biluo_tags
|
||||
from ...language import Language
|
||||
from ...tokens import Doc, Token, Span
|
||||
from wasabi import Printer
|
||||
|
||||
|
||||
def conllu2docs(
|
||||
input_data,
|
||||
n_sents=10,
|
||||
append_morphology=False,
|
||||
ner_map=None,
|
||||
merge_subtokens=False,
|
||||
no_print=False,
|
||||
**_
|
||||
):
|
||||
"""
|
||||
Convert conllu files into JSON format for use with train cli.
|
||||
append_morphology parameter enables appending morphology to tags, which is
|
||||
useful for languages such as Spanish, where UD tags are not so rich.
|
||||
|
||||
Extract NER tags if available and convert them so that they follow
|
||||
BILUO and the Wikipedia scheme
|
||||
"""
|
||||
MISC_NER_PATTERN = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$"
|
||||
msg = Printer(no_print=no_print)
|
||||
n_sents_info(msg, n_sents)
|
||||
sent_docs = read_conllx(
|
||||
input_data,
|
||||
append_morphology=append_morphology,
|
||||
ner_tag_pattern=MISC_NER_PATTERN,
|
||||
ner_map=ner_map,
|
||||
merge_subtokens=merge_subtokens,
|
||||
)
|
||||
docs = []
|
||||
sent_docs_to_merge = []
|
||||
for sent_doc in sent_docs:
|
||||
sent_docs_to_merge.append(sent_doc)
|
||||
if len(sent_docs_to_merge) % n_sents == 0:
|
||||
docs.append(Doc.from_docs(sent_docs_to_merge))
|
||||
sent_docs_to_merge = []
|
||||
if sent_docs_to_merge:
|
||||
docs.append(Doc.from_docs(sent_docs_to_merge))
|
||||
return docs
|
||||
|
||||
|
||||
def has_ner(input_data, ner_tag_pattern):
|
||||
"""
|
||||
Check the MISC column for NER tags.
|
||||
"""
|
||||
for sent in input_data.strip().split("\n\n"):
|
||||
lines = sent.strip().split("\n")
|
||||
if lines:
|
||||
while lines[0].startswith("#"):
|
||||
lines.pop(0)
|
||||
for line in lines:
|
||||
parts = line.split("\t")
|
||||
id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
|
||||
for misc_part in misc.split("|"):
|
||||
if re.match(ner_tag_pattern, misc_part):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def read_conllx(
|
||||
input_data,
|
||||
append_morphology=False,
|
||||
merge_subtokens=False,
|
||||
ner_tag_pattern="",
|
||||
ner_map=None,
|
||||
):
|
||||
""" Yield docs, one for each sentence """
|
||||
vocab = Language.Defaults.create_vocab() # need vocab to make a minimal Doc
|
||||
for sent in input_data.strip().split("\n\n"):
|
||||
lines = sent.strip().split("\n")
|
||||
if lines:
|
||||
while lines[0].startswith("#"):
|
||||
lines.pop(0)
|
||||
doc = doc_from_conllu_sentence(
|
||||
vocab,
|
||||
lines,
|
||||
ner_tag_pattern,
|
||||
merge_subtokens=merge_subtokens,
|
||||
append_morphology=append_morphology,
|
||||
ner_map=ner_map,
|
||||
)
|
||||
yield doc
|
||||
|
||||
|
||||
def get_entities(lines, tag_pattern, ner_map=None):
|
||||
"""Find entities in the MISC column according to the pattern and map to
|
||||
final entity type with `ner_map` if mapping present. Entity tag is 'O' if
|
||||
the pattern is not matched.
|
||||
|
||||
lines (str): CONLL-U lines for one sentences
|
||||
tag_pattern (str): Regex pattern for entity tag
|
||||
ner_map (dict): Map old NER tag names to new ones, '' maps to O.
|
||||
RETURNS (list): List of BILUO entity tags
|
||||
"""
|
||||
miscs = []
|
||||
for line in lines:
|
||||
parts = line.split("\t")
|
||||
id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
|
||||
if "-" in id_ or "." in id_:
|
||||
continue
|
||||
miscs.append(misc)
|
||||
|
||||
iob = []
|
||||
for misc in miscs:
|
||||
iob_tag = "O"
|
||||
for misc_part in misc.split("|"):
|
||||
tag_match = re.match(tag_pattern, misc_part)
|
||||
if tag_match:
|
||||
prefix = tag_match.group(2)
|
||||
suffix = tag_match.group(3)
|
||||
if prefix and suffix:
|
||||
iob_tag = prefix + "-" + suffix
|
||||
if ner_map:
|
||||
suffix = ner_map.get(suffix, suffix)
|
||||
if suffix == "":
|
||||
iob_tag = "O"
|
||||
else:
|
||||
iob_tag = prefix + "-" + suffix
|
||||
break
|
||||
iob.append(iob_tag)
|
||||
return iob_to_biluo(iob)
|
||||
|
||||
|
||||
def doc_from_conllu_sentence(
|
||||
vocab,
|
||||
lines,
|
||||
ner_tag_pattern,
|
||||
merge_subtokens=False,
|
||||
append_morphology=False,
|
||||
ner_map=None,
|
||||
):
|
||||
"""Create an Example from the lines for one CoNLL-U sentence, merging
|
||||
subtokens and appending morphology to tags if required.
|
||||
|
||||
lines (str): The non-comment lines for a CoNLL-U sentence
|
||||
ner_tag_pattern (str): The regex pattern for matching NER in MISC col
|
||||
RETURNS (Example): An example containing the annotation
|
||||
"""
|
||||
# create a Doc with each subtoken as its own token
|
||||
# if merging subtokens, each subtoken orth is the merged subtoken form
|
||||
if not Token.has_extension("merged_orth"):
|
||||
Token.set_extension("merged_orth", default="")
|
||||
if not Token.has_extension("merged_lemma"):
|
||||
Token.set_extension("merged_lemma", default="")
|
||||
if not Token.has_extension("merged_morph"):
|
||||
Token.set_extension("merged_morph", default="")
|
||||
if not Token.has_extension("merged_spaceafter"):
|
||||
Token.set_extension("merged_spaceafter", default="")
|
||||
words, spaces, tags, poses, morphs, lemmas = [], [], [], [], [], []
|
||||
heads, deps = [], []
|
||||
subtok_word = ""
|
||||
in_subtok = False
|
||||
for i in range(len(lines)):
|
||||
line = lines[i]
|
||||
parts = line.split("\t")
|
||||
id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
|
||||
if "." in id_:
|
||||
continue
|
||||
if "-" in id_:
|
||||
in_subtok = True
|
||||
if "-" in id_:
|
||||
in_subtok = True
|
||||
subtok_word = word
|
||||
subtok_start, subtok_end = id_.split("-")
|
||||
subtok_spaceafter = "SpaceAfter=No" not in misc
|
||||
continue
|
||||
if merge_subtokens and in_subtok:
|
||||
words.append(subtok_word)
|
||||
else:
|
||||
words.append(word)
|
||||
if in_subtok:
|
||||
if id_ == subtok_end:
|
||||
spaces.append(subtok_spaceafter)
|
||||
else:
|
||||
spaces.append(False)
|
||||
elif "SpaceAfter=No" in misc:
|
||||
spaces.append(False)
|
||||
else:
|
||||
spaces.append(True)
|
||||
if in_subtok and id_ == subtok_end:
|
||||
subtok_word = ""
|
||||
in_subtok = False
|
||||
id_ = int(id_) - 1
|
||||
head = (int(head) - 1) if head not in ("0", "_") else id_
|
||||
tag = pos if tag == "_" else tag
|
||||
morph = morph if morph != "_" else ""
|
||||
dep = "ROOT" if dep == "root" else dep
|
||||
lemmas.append(lemma)
|
||||
poses.append(pos)
|
||||
tags.append(tag)
|
||||
morphs.append(morph)
|
||||
heads.append(head)
|
||||
deps.append(dep)
|
||||
|
||||
doc = Doc(vocab, words=words, spaces=spaces)
|
||||
for i in range(len(doc)):
|
||||
doc[i].tag_ = tags[i]
|
||||
doc[i].pos_ = poses[i]
|
||||
doc[i].dep_ = deps[i]
|
||||
doc[i].lemma_ = lemmas[i]
|
||||
doc[i].head = doc[heads[i]]
|
||||
doc[i]._.merged_orth = words[i]
|
||||
doc[i]._.merged_morph = morphs[i]
|
||||
doc[i]._.merged_lemma = lemmas[i]
|
||||
doc[i]._.merged_spaceafter = spaces[i]
|
||||
ents = get_entities(lines, ner_tag_pattern, ner_map)
|
||||
doc.ents = spans_from_biluo_tags(doc, ents)
|
||||
doc.is_parsed = True
|
||||
doc.is_tagged = True
|
||||
|
||||
if merge_subtokens:
|
||||
doc = merge_conllu_subtokens(lines, doc)
|
||||
|
||||
# create final Doc from custom Doc annotation
|
||||
words, spaces, tags, morphs, lemmas, poses = [], [], [], [], [], []
|
||||
heads, deps = [], []
|
||||
for i, t in enumerate(doc):
|
||||
words.append(t._.merged_orth)
|
||||
lemmas.append(t._.merged_lemma)
|
||||
spaces.append(t._.merged_spaceafter)
|
||||
morphs.append(t._.merged_morph)
|
||||
if append_morphology and t._.merged_morph:
|
||||
tags.append(t.tag_ + "__" + t._.merged_morph)
|
||||
else:
|
||||
tags.append(t.tag_)
|
||||
poses.append(t.pos_)
|
||||
heads.append(t.head.i)
|
||||
deps.append(t.dep_)
|
||||
|
||||
doc_x = Doc(vocab, words=words, spaces=spaces)
|
||||
for i in range(len(doc)):
|
||||
doc_x[i].tag_ = tags[i]
|
||||
doc_x[i].morph_ = morphs[i]
|
||||
doc_x[i].lemma_ = lemmas[i]
|
||||
doc_x[i].pos_ = poses[i]
|
||||
doc_x[i].dep_ = deps[i]
|
||||
doc_x[i].head = doc_x[heads[i]]
|
||||
doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents]
|
||||
doc_x.is_parsed = True
|
||||
doc_x.is_tagged = True
|
||||
|
||||
return doc_x
|
||||
|
||||
|
||||
def merge_conllu_subtokens(lines, doc):
|
||||
# identify and process all subtoken spans to prepare attrs for merging
|
||||
subtok_spans = []
|
||||
for line in lines:
|
||||
parts = line.split("\t")
|
||||
id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
|
||||
if "-" in id_:
|
||||
subtok_start, subtok_end = id_.split("-")
|
||||
subtok_span = doc[int(subtok_start) - 1 : int(subtok_end)]
|
||||
subtok_spans.append(subtok_span)
|
||||
# create merged tag, morph, and lemma values
|
||||
tags = []
|
||||
morphs = {}
|
||||
lemmas = []
|
||||
for token in subtok_span:
|
||||
tags.append(token.tag_)
|
||||
lemmas.append(token.lemma_)
|
||||
if token._.merged_morph:
|
||||
for feature in token._.merged_morph.split("|"):
|
||||
field, values = feature.split("=", 1)
|
||||
if field not in morphs:
|
||||
morphs[field] = set()
|
||||
for value in values.split(","):
|
||||
morphs[field].add(value)
|
||||
# create merged features for each morph field
|
||||
for field, values in morphs.items():
|
||||
morphs[field] = field + "=" + ",".join(sorted(values))
|
||||
# set the same attrs on all subtok tokens so that whatever head the
|
||||
# retokenizer chooses, the final attrs are available on that token
|
||||
for token in subtok_span:
|
||||
token._.merged_orth = token.orth_
|
||||
token._.merged_lemma = " ".join(lemmas)
|
||||
token.tag_ = "_".join(tags)
|
||||
token._.merged_morph = "|".join(sorted(morphs.values()))
|
||||
token._.merged_spaceafter = (
|
||||
True if subtok_span[-1].whitespace_ else False
|
||||
)
|
||||
|
||||
with doc.retokenize() as retokenizer:
|
||||
for span in subtok_spans:
|
||||
retokenizer.merge(span)
|
||||
|
||||
return doc
|
64
spacy/gold/converters/iob2docs.py
Normal file
64
spacy/gold/converters/iob2docs.py
Normal file
|
@ -0,0 +1,64 @@
|
|||
from wasabi import Printer
|
||||
|
||||
from .conll_ner2docs import n_sents_info
|
||||
from ...gold import iob_to_biluo, tags_to_entities
|
||||
from ...tokens import Doc, Span
|
||||
from ...util import minibatch
|
||||
|
||||
|
||||
def iob2docs(input_data, vocab, n_sents=10, no_print=False, *args, **kwargs):
|
||||
"""
|
||||
Convert IOB files with one sentence per line and tags separated with '|'
|
||||
into Doc objects so they can be saved. IOB and IOB2 are accepted.
|
||||
|
||||
Sample formats:
|
||||
|
||||
I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O
|
||||
I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O
|
||||
I|PRP|O like|VBP|O London|NNP|I-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O
|
||||
I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O
|
||||
"""
|
||||
msg = Printer(no_print=no_print)
|
||||
if n_sents > 0:
|
||||
n_sents_info(msg, n_sents)
|
||||
docs = read_iob(input_data.split("\n"), vocab, n_sents)
|
||||
return docs
|
||||
|
||||
|
||||
def read_iob(raw_sents, vocab, n_sents):
|
||||
docs = []
|
||||
for group in minibatch(raw_sents, size=n_sents):
|
||||
tokens = []
|
||||
words = []
|
||||
tags = []
|
||||
iob = []
|
||||
sent_starts = []
|
||||
for line in group:
|
||||
if not line.strip():
|
||||
continue
|
||||
sent_tokens = [t.split("|") for t in line.split()]
|
||||
if len(sent_tokens[0]) == 3:
|
||||
sent_words, sent_tags, sent_iob = zip(*sent_tokens)
|
||||
elif len(sent_tokens[0]) == 2:
|
||||
sent_words, sent_iob = zip(*sent_tokens)
|
||||
sent_tags = ["-"] * len(sent_words)
|
||||
else:
|
||||
raise ValueError(
|
||||
"The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. See https://spacy.io/api/cli#convert"
|
||||
)
|
||||
words.extend(sent_words)
|
||||
tags.extend(sent_tags)
|
||||
iob.extend(sent_iob)
|
||||
tokens.extend(sent_tokens)
|
||||
sent_starts.append(True)
|
||||
sent_starts.extend([False for _ in sent_words[1:]])
|
||||
doc = Doc(vocab, words=words)
|
||||
for i, tag in enumerate(tags):
|
||||
doc[i].tag_ = tag
|
||||
for i, sent_start in enumerate(sent_starts):
|
||||
doc[i].is_sent_start = sent_start
|
||||
biluo = iob_to_biluo(iob)
|
||||
entities = tags_to_entities(biluo)
|
||||
doc.ents = [Span(doc, start=s, end=e + 1, label=L) for (L, s, e) in entities]
|
||||
docs.append(doc)
|
||||
return docs
|
22
spacy/gold/converters/json2docs.py
Normal file
22
spacy/gold/converters/json2docs.py
Normal file
|
@ -0,0 +1,22 @@
|
|||
import srsly
|
||||
from ..gold_io import json_iterate, json_to_annotations
|
||||
from ..example import annotations2doc
|
||||
from ..example import _fix_legacy_dict_data, _parse_example_dict_data
|
||||
from ...util import load_model
|
||||
from ...lang.xx import MultiLanguage
|
||||
|
||||
|
||||
def json2docs(input_data, model=None, **kwargs):
|
||||
nlp = load_model(model) if model is not None else MultiLanguage()
|
||||
if not isinstance(input_data, bytes):
|
||||
if not isinstance(input_data, str):
|
||||
input_data = srsly.json_dumps(input_data)
|
||||
input_data = input_data.encode("utf8")
|
||||
docs = []
|
||||
for json_doc in json_iterate(input_data):
|
||||
for json_para in json_to_annotations(json_doc):
|
||||
example_dict = _fix_legacy_dict_data(json_para)
|
||||
tok_dict, doc_dict = _parse_example_dict_data(example_dict)
|
||||
doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
|
||||
docs.append(doc)
|
||||
return docs
|
129
spacy/gold/corpus.py
Normal file
129
spacy/gold/corpus.py
Normal file
|
@ -0,0 +1,129 @@
|
|||
import random
|
||||
from .. import util
|
||||
from .example import Example
|
||||
from ..tokens import DocBin, Doc
|
||||
|
||||
|
||||
class Corpus:
|
||||
"""An annotated corpus, reading train and dev datasets from
|
||||
the DocBin (.spacy) format.
|
||||
|
||||
DOCS: https://spacy.io/api/corpus
|
||||
"""
|
||||
|
||||
def __init__(self, train_loc, dev_loc, limit=0):
|
||||
"""Create a Corpus.
|
||||
|
||||
train (str / Path): File or directory of training data.
|
||||
dev (str / Path): File or directory of development data.
|
||||
limit (int): Max. number of examples returned
|
||||
RETURNS (Corpus): The newly created object.
|
||||
"""
|
||||
self.train_loc = train_loc
|
||||
self.dev_loc = dev_loc
|
||||
self.limit = limit
|
||||
|
||||
@staticmethod
|
||||
def walk_corpus(path):
|
||||
path = util.ensure_path(path)
|
||||
if not path.is_dir():
|
||||
return [path]
|
||||
paths = [path]
|
||||
locs = []
|
||||
seen = set()
|
||||
for path in paths:
|
||||
if str(path) in seen:
|
||||
continue
|
||||
seen.add(str(path))
|
||||
if path.parts[-1].startswith("."):
|
||||
continue
|
||||
elif path.is_dir():
|
||||
paths.extend(path.iterdir())
|
||||
elif path.parts[-1].endswith(".spacy"):
|
||||
locs.append(path)
|
||||
return locs
|
||||
|
||||
def _make_example(self, nlp, reference, gold_preproc):
|
||||
if gold_preproc or reference.has_unknown_spaces:
|
||||
return Example(
|
||||
Doc(
|
||||
nlp.vocab,
|
||||
words=[word.text for word in reference],
|
||||
spaces=[bool(word.whitespace_) for word in reference],
|
||||
),
|
||||
reference,
|
||||
)
|
||||
else:
|
||||
return Example(nlp.make_doc(reference.text), reference)
|
||||
|
||||
def make_examples(self, nlp, reference_docs, max_length=0):
|
||||
for reference in reference_docs:
|
||||
if len(reference) == 0:
|
||||
continue
|
||||
elif max_length == 0 or len(reference) < max_length:
|
||||
yield self._make_example(nlp, reference, False)
|
||||
elif reference.is_sentenced:
|
||||
for ref_sent in reference.sents:
|
||||
if len(ref_sent) == 0:
|
||||
continue
|
||||
elif max_length == 0 or len(ref_sent) < max_length:
|
||||
yield self._make_example(nlp, ref_sent.as_doc(), False)
|
||||
|
||||
def make_examples_gold_preproc(self, nlp, reference_docs):
|
||||
for reference in reference_docs:
|
||||
if reference.is_sentenced:
|
||||
ref_sents = [sent.as_doc() for sent in reference.sents]
|
||||
else:
|
||||
ref_sents = [reference]
|
||||
for ref_sent in ref_sents:
|
||||
eg = self._make_example(nlp, ref_sent, True)
|
||||
if len(eg.x):
|
||||
yield eg
|
||||
|
||||
def read_docbin(self, vocab, locs):
|
||||
""" Yield training examples as example dicts """
|
||||
i = 0
|
||||
for loc in locs:
|
||||
loc = util.ensure_path(loc)
|
||||
if loc.parts[-1].endswith(".spacy"):
|
||||
with loc.open("rb") as file_:
|
||||
doc_bin = DocBin().from_bytes(file_.read())
|
||||
docs = doc_bin.get_docs(vocab)
|
||||
for doc in docs:
|
||||
if len(doc):
|
||||
yield doc
|
||||
i += 1
|
||||
if self.limit >= 1 and i >= self.limit:
|
||||
break
|
||||
|
||||
def count_train(self, nlp):
|
||||
"""Returns count of words in train examples"""
|
||||
n = 0
|
||||
i = 0
|
||||
for example in self.train_dataset(nlp):
|
||||
n += len(example.predicted)
|
||||
if self.limit >= 0 and i >= self.limit:
|
||||
break
|
||||
i += 1
|
||||
return n
|
||||
|
||||
def train_dataset(
|
||||
self, nlp, *, shuffle=True, gold_preproc=False, max_length=0, **kwargs
|
||||
):
|
||||
ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc))
|
||||
if gold_preproc:
|
||||
examples = self.make_examples_gold_preproc(nlp, ref_docs)
|
||||
else:
|
||||
examples = self.make_examples(nlp, ref_docs, max_length)
|
||||
if shuffle:
|
||||
examples = list(examples)
|
||||
random.shuffle(examples)
|
||||
yield from examples
|
||||
|
||||
def dev_dataset(self, nlp, *, gold_preproc=False, **kwargs):
|
||||
ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.dev_loc))
|
||||
if gold_preproc:
|
||||
examples = self.make_examples_gold_preproc(nlp, ref_docs)
|
||||
else:
|
||||
examples = self.make_examples(nlp, ref_docs, max_length=0)
|
||||
yield from examples
|
7
spacy/gold/example.pxd
Normal file
7
spacy/gold/example.pxd
Normal file
|
@ -0,0 +1,7 @@
|
|||
from ..tokens.doc cimport Doc
|
||||
|
||||
|
||||
cdef class Example:
|
||||
cdef readonly Doc x
|
||||
cdef readonly Doc y
|
||||
cdef readonly object _alignment
|
427
spacy/gold/example.pyx
Normal file
427
spacy/gold/example.pyx
Normal file
|
@ -0,0 +1,427 @@
|
|||
import warnings
|
||||
|
||||
import numpy
|
||||
|
||||
from ..tokens.doc cimport Doc
|
||||
from ..tokens.span cimport Span
|
||||
from ..tokens.span import Span
|
||||
from ..attrs import IDS
|
||||
from .align import Alignment
|
||||
from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
|
||||
from .iob_utils import spans_from_biluo_tags
|
||||
from ..errors import Errors, Warnings
|
||||
from ..syntax import nonproj
|
||||
|
||||
|
||||
cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
|
||||
""" Create a Doc from dictionaries with token and doc annotations. """
|
||||
attrs, array = _annot2array(vocab, tok_annot, doc_annot)
|
||||
output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"])
|
||||
if "entities" in doc_annot:
|
||||
_add_entities_to_doc(output, doc_annot["entities"])
|
||||
if array.size:
|
||||
output = output.from_array(attrs, array)
|
||||
# links are currently added with ENT_KB_ID on the token level
|
||||
output.cats.update(doc_annot.get("cats", {}))
|
||||
return output
|
||||
|
||||
|
||||
cdef class Example:
|
||||
def __init__(self, Doc predicted, Doc reference, *, alignment=None):
|
||||
if predicted is None:
|
||||
raise TypeError(Errors.E972.format(arg="predicted"))
|
||||
if reference is None:
|
||||
raise TypeError(Errors.E972.format(arg="reference"))
|
||||
self.x = predicted
|
||||
self.y = reference
|
||||
self._alignment = alignment
|
||||
|
||||
def __len__(self):
|
||||
return len(self.predicted)
|
||||
|
||||
property predicted:
|
||||
def __get__(self):
|
||||
return self.x
|
||||
|
||||
def __set__(self, doc):
|
||||
self.x = doc
|
||||
|
||||
property reference:
|
||||
def __get__(self):
|
||||
return self.y
|
||||
|
||||
def __set__(self, doc):
|
||||
self.y = doc
|
||||
|
||||
def copy(self):
|
||||
return Example(
|
||||
self.x.copy(),
|
||||
self.y.copy()
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, Doc predicted, dict example_dict):
|
||||
if predicted is None:
|
||||
raise ValueError(Errors.E976.format(n="first", type="Doc"))
|
||||
if example_dict is None:
|
||||
raise ValueError(Errors.E976.format(n="second", type="dict"))
|
||||
example_dict = _fix_legacy_dict_data(example_dict)
|
||||
tok_dict, doc_dict = _parse_example_dict_data(example_dict)
|
||||
if "ORTH" not in tok_dict:
|
||||
tok_dict["ORTH"] = [tok.text for tok in predicted]
|
||||
tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
|
||||
return Example(
|
||||
predicted,
|
||||
annotations2doc(predicted.vocab, tok_dict, doc_dict)
|
||||
)
|
||||
|
||||
@property
|
||||
def alignment(self):
|
||||
if self._alignment is None:
|
||||
spacy_words = [token.orth_ for token in self.predicted]
|
||||
gold_words = [token.orth_ for token in self.reference]
|
||||
if gold_words == []:
|
||||
gold_words = spacy_words
|
||||
self._alignment = Alignment.from_strings(spacy_words, gold_words)
|
||||
return self._alignment
|
||||
|
||||
def get_aligned(self, field, as_string=False):
|
||||
"""Return an aligned array for a token attribute."""
|
||||
align = self.alignment.x2y
|
||||
|
||||
vocab = self.reference.vocab
|
||||
gold_values = self.reference.to_array([field])
|
||||
output = [None] * len(self.predicted)
|
||||
for token in self.predicted:
|
||||
if token.is_space:
|
||||
output[token.i] = None
|
||||
else:
|
||||
values = gold_values[align[token.i].dataXd]
|
||||
values = values.ravel()
|
||||
if len(values) == 0:
|
||||
output[token.i] = None
|
||||
elif len(values) == 1:
|
||||
output[token.i] = values[0]
|
||||
elif len(set(list(values))) == 1:
|
||||
# If all aligned tokens have the same value, use it.
|
||||
output[token.i] = values[0]
|
||||
else:
|
||||
output[token.i] = None
|
||||
if as_string and field not in ["ENT_IOB", "SENT_START"]:
|
||||
output = [vocab.strings[o] if o is not None else o for o in output]
|
||||
return output
|
||||
|
||||
def get_aligned_parse(self, projectivize=True):
|
||||
cand_to_gold = self.alignment.x2y
|
||||
gold_to_cand = self.alignment.y2x
|
||||
aligned_heads = [None] * self.x.length
|
||||
aligned_deps = [None] * self.x.length
|
||||
heads = [token.head.i for token in self.y]
|
||||
deps = [token.dep_ for token in self.y]
|
||||
if projectivize:
|
||||
heads, deps = nonproj.projectivize(heads, deps)
|
||||
for cand_i in range(self.x.length):
|
||||
if cand_to_gold.lengths[cand_i] == 1:
|
||||
gold_i = cand_to_gold[cand_i].dataXd[0, 0]
|
||||
if gold_to_cand.lengths[heads[gold_i]] == 1:
|
||||
aligned_heads[cand_i] = int(gold_to_cand[heads[gold_i]].dataXd[0, 0])
|
||||
aligned_deps[cand_i] = deps[gold_i]
|
||||
return aligned_heads, aligned_deps
|
||||
|
||||
def get_aligned_spans_x2y(self, x_spans):
|
||||
return self._get_aligned_spans(self.y, x_spans, self.alignment.x2y)
|
||||
|
||||
def get_aligned_spans_y2x(self, y_spans):
|
||||
return self._get_aligned_spans(self.x, y_spans, self.alignment.y2x)
|
||||
|
||||
def _get_aligned_spans(self, doc, spans, align):
|
||||
seen = set()
|
||||
output = []
|
||||
for span in spans:
|
||||
indices = align[span.start : span.end].data.ravel()
|
||||
indices = [idx for idx in indices if idx not in seen]
|
||||
if len(indices) >= 1:
|
||||
aligned_span = Span(doc, indices[0], indices[-1] + 1, label=span.label)
|
||||
target_text = span.text.lower().strip().replace(" ", "")
|
||||
our_text = aligned_span.text.lower().strip().replace(" ", "")
|
||||
if our_text == target_text:
|
||||
output.append(aligned_span)
|
||||
seen.update(indices)
|
||||
return output
|
||||
|
||||
def get_aligned_ner(self):
|
||||
if not self.y.is_nered:
|
||||
return [None] * len(self.x) # should this be 'missing' instead of 'None' ?
|
||||
x_ents = self.get_aligned_spans_y2x(self.y.ents)
|
||||
# Default to 'None' for missing values
|
||||
x_tags = biluo_tags_from_offsets(
|
||||
self.x,
|
||||
[(e.start_char, e.end_char, e.label_) for e in x_ents],
|
||||
missing=None
|
||||
)
|
||||
# Now fill the tokens we can align to O.
|
||||
O = 2 # I=1, O=2, B=3
|
||||
for i, ent_iob in enumerate(self.get_aligned("ENT_IOB")):
|
||||
if x_tags[i] is None:
|
||||
if ent_iob == O:
|
||||
x_tags[i] = "O"
|
||||
elif self.x[i].is_space:
|
||||
x_tags[i] = "O"
|
||||
return x_tags
|
||||
|
||||
def to_dict(self):
|
||||
return {
|
||||
"doc_annotation": {
|
||||
"cats": dict(self.reference.cats),
|
||||
"entities": biluo_tags_from_doc(self.reference),
|
||||
"links": self._links_to_dict()
|
||||
},
|
||||
"token_annotation": {
|
||||
"ids": [t.i+1 for t in self.reference],
|
||||
"words": [t.text for t in self.reference],
|
||||
"tags": [t.tag_ for t in self.reference],
|
||||
"lemmas": [t.lemma_ for t in self.reference],
|
||||
"pos": [t.pos_ for t in self.reference],
|
||||
"morphs": [t.morph_ for t in self.reference],
|
||||
"heads": [t.head.i for t in self.reference],
|
||||
"deps": [t.dep_ for t in self.reference],
|
||||
"sent_starts": [int(bool(t.is_sent_start)) for t in self.reference]
|
||||
}
|
||||
}
|
||||
|
||||
def _links_to_dict(self):
|
||||
links = {}
|
||||
for ent in self.reference.ents:
|
||||
if ent.kb_id_:
|
||||
links[(ent.start_char, ent.end_char)] = {ent.kb_id_: 1.0}
|
||||
return links
|
||||
|
||||
def split_sents(self):
|
||||
""" Split the token annotations into multiple Examples based on
|
||||
sent_starts and return a list of the new Examples"""
|
||||
if not self.reference.is_sentenced:
|
||||
return [self]
|
||||
|
||||
align = self.alignment.y2x
|
||||
seen_indices = set()
|
||||
output = []
|
||||
for y_sent in self.reference.sents:
|
||||
indices = align[y_sent.start : y_sent.end].data.ravel()
|
||||
indices = [idx for idx in indices if idx not in seen_indices]
|
||||
if indices:
|
||||
x_sent = self.predicted[indices[0] : indices[-1] + 1]
|
||||
output.append(Example(x_sent.as_doc(), y_sent.as_doc()))
|
||||
seen_indices.update(indices)
|
||||
return output
|
||||
|
||||
property text:
|
||||
def __get__(self):
|
||||
return self.x.text
|
||||
|
||||
def __str__(self):
|
||||
return str(self.to_dict())
|
||||
|
||||
def __repr__(self):
|
||||
return str(self.to_dict())
|
||||
|
||||
|
||||
def _annot2array(vocab, tok_annot, doc_annot):
|
||||
attrs = []
|
||||
values = []
|
||||
|
||||
for key, value in doc_annot.items():
|
||||
if value:
|
||||
if key == "entities":
|
||||
pass
|
||||
elif key == "links":
|
||||
ent_kb_ids = _parse_links(vocab, tok_annot["ORTH"], tok_annot["SPACY"], value)
|
||||
tok_annot["ENT_KB_ID"] = ent_kb_ids
|
||||
elif key == "cats":
|
||||
pass
|
||||
else:
|
||||
raise ValueError(Errors.E974.format(obj="doc", key=key))
|
||||
|
||||
for key, value in tok_annot.items():
|
||||
if key not in IDS:
|
||||
raise ValueError(Errors.E974.format(obj="token", key=key))
|
||||
elif key in ["ORTH", "SPACY"]:
|
||||
pass
|
||||
elif key == "HEAD":
|
||||
attrs.append(key)
|
||||
values.append([h-i for i, h in enumerate(value)])
|
||||
elif key == "SENT_START":
|
||||
attrs.append(key)
|
||||
values.append(value)
|
||||
elif key == "MORPH":
|
||||
attrs.append(key)
|
||||
values.append([vocab.morphology.add(v) for v in value])
|
||||
else:
|
||||
attrs.append(key)
|
||||
try:
|
||||
values.append([vocab.strings.add(v) for v in value])
|
||||
except TypeError:
|
||||
types= set([type(v) for v in value])
|
||||
raise TypeError(Errors.E969.format(field=key, types=types))
|
||||
|
||||
array = numpy.asarray(values, dtype="uint64")
|
||||
return attrs, array.T
|
||||
|
||||
|
||||
def _add_entities_to_doc(doc, ner_data):
|
||||
if ner_data is None:
|
||||
return
|
||||
elif ner_data == []:
|
||||
doc.ents = []
|
||||
elif isinstance(ner_data[0], tuple):
|
||||
return _add_entities_to_doc(
|
||||
doc,
|
||||
biluo_tags_from_offsets(doc, ner_data)
|
||||
)
|
||||
elif isinstance(ner_data[0], str) or ner_data[0] is None:
|
||||
return _add_entities_to_doc(
|
||||
doc,
|
||||
spans_from_biluo_tags(doc, ner_data)
|
||||
)
|
||||
elif isinstance(ner_data[0], Span):
|
||||
# Ugh, this is super messy. Really hard to set O entities
|
||||
doc.ents = ner_data
|
||||
doc.ents = [span for span in ner_data if span.label_]
|
||||
else:
|
||||
raise ValueError(Errors.E973)
|
||||
|
||||
|
||||
def _parse_example_dict_data(example_dict):
|
||||
return (
|
||||
example_dict["token_annotation"],
|
||||
example_dict["doc_annotation"]
|
||||
)
|
||||
|
||||
|
||||
def _fix_legacy_dict_data(example_dict):
|
||||
token_dict = example_dict.get("token_annotation", {})
|
||||
doc_dict = example_dict.get("doc_annotation", {})
|
||||
for key, value in example_dict.items():
|
||||
if value:
|
||||
if key in ("token_annotation", "doc_annotation"):
|
||||
pass
|
||||
elif key == "ids":
|
||||
pass
|
||||
elif key in ("cats", "links"):
|
||||
doc_dict[key] = value
|
||||
elif key in ("ner", "entities"):
|
||||
doc_dict["entities"] = value
|
||||
else:
|
||||
token_dict[key] = value
|
||||
# Remap keys
|
||||
remapping = {
|
||||
"words": "ORTH",
|
||||
"tags": "TAG",
|
||||
"pos": "POS",
|
||||
"lemmas": "LEMMA",
|
||||
"deps": "DEP",
|
||||
"heads": "HEAD",
|
||||
"sent_starts": "SENT_START",
|
||||
"morphs": "MORPH",
|
||||
"spaces": "SPACY",
|
||||
}
|
||||
old_token_dict = token_dict
|
||||
token_dict = {}
|
||||
for key, value in old_token_dict.items():
|
||||
if key in ("text", "ids", "brackets"):
|
||||
pass
|
||||
elif key.lower() in remapping:
|
||||
token_dict[remapping[key.lower()]] = value
|
||||
else:
|
||||
raise KeyError(Errors.E983.format(key=key, dict="token_annotation", keys=remapping.keys()))
|
||||
text = example_dict.get("text", example_dict.get("raw"))
|
||||
if _has_field(token_dict, "ORTH") and not _has_field(token_dict, "SPACY"):
|
||||
token_dict["SPACY"] = _guess_spaces(text, token_dict["ORTH"])
|
||||
if "HEAD" in token_dict and "SENT_START" in token_dict:
|
||||
# If heads are set, we don't also redundantly specify SENT_START.
|
||||
token_dict.pop("SENT_START")
|
||||
warnings.warn(Warnings.W092)
|
||||
return {
|
||||
"token_annotation": token_dict,
|
||||
"doc_annotation": doc_dict
|
||||
}
|
||||
|
||||
def _has_field(annot, field):
|
||||
if field not in annot:
|
||||
return False
|
||||
elif annot[field] is None:
|
||||
return False
|
||||
elif len(annot[field]) == 0:
|
||||
return False
|
||||
elif all([value is None for value in annot[field]]):
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
|
||||
def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces):
|
||||
if isinstance(biluo_or_offsets[0], (list, tuple)):
|
||||
# Convert to biluo if necessary
|
||||
# This is annoying but to convert the offsets we need a Doc
|
||||
# that has the target tokenization.
|
||||
reference = Doc(vocab, words=words, spaces=spaces)
|
||||
biluo = biluo_tags_from_offsets(reference, biluo_or_offsets)
|
||||
else:
|
||||
biluo = biluo_or_offsets
|
||||
ent_iobs = []
|
||||
ent_types = []
|
||||
for iob_tag in biluo_to_iob(biluo):
|
||||
if iob_tag in (None, "-"):
|
||||
ent_iobs.append("")
|
||||
ent_types.append("")
|
||||
else:
|
||||
ent_iobs.append(iob_tag.split("-")[0])
|
||||
if iob_tag.startswith("I") or iob_tag.startswith("B"):
|
||||
ent_types.append(iob_tag.split("-", 1)[1])
|
||||
else:
|
||||
ent_types.append("")
|
||||
return ent_iobs, ent_types
|
||||
|
||||
def _parse_links(vocab, words, spaces, links):
|
||||
reference = Doc(vocab, words=words, spaces=spaces)
|
||||
starts = {token.idx: token.i for token in reference}
|
||||
ends = {token.idx + len(token): token.i for token in reference}
|
||||
ent_kb_ids = ["" for _ in reference]
|
||||
|
||||
for index, annot_dict in links.items():
|
||||
true_kb_ids = []
|
||||
for key, value in annot_dict.items():
|
||||
if value == 1.0:
|
||||
true_kb_ids.append(key)
|
||||
if len(true_kb_ids) > 1:
|
||||
raise ValueError(Errors.E980)
|
||||
|
||||
if len(true_kb_ids) == 1:
|
||||
start_char, end_char = index
|
||||
start_token = starts.get(start_char)
|
||||
end_token = ends.get(end_char)
|
||||
if start_token is None or end_token is None:
|
||||
raise ValueError(Errors.E981)
|
||||
for i in range(start_token, end_token+1):
|
||||
ent_kb_ids[i] = true_kb_ids[0]
|
||||
|
||||
return ent_kb_ids
|
||||
|
||||
|
||||
def _guess_spaces(text, words):
|
||||
if text is None:
|
||||
return None
|
||||
spaces = []
|
||||
text_pos = 0
|
||||
# align words with text
|
||||
for word in words:
|
||||
try:
|
||||
word_start = text[text_pos:].index(word)
|
||||
except ValueError:
|
||||
spaces.append(True)
|
||||
continue
|
||||
text_pos += word_start + len(word)
|
||||
if text_pos < len(text) and text[text_pos] == " ":
|
||||
spaces.append(True)
|
||||
else:
|
||||
spaces.append(False)
|
||||
return spaces
|
201
spacy/gold/gold_io.pyx
Normal file
201
spacy/gold/gold_io.pyx
Normal file
|
@ -0,0 +1,201 @@
|
|||
import warnings
|
||||
import srsly
|
||||
from .. import util
|
||||
from ..errors import Warnings
|
||||
from ..tokens import Doc
|
||||
from .iob_utils import biluo_tags_from_offsets, tags_to_entities
|
||||
import json
|
||||
|
||||
|
||||
def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
|
||||
"""Convert a list of Doc objects into the JSON-serializable format used by
|
||||
the spacy train command.
|
||||
|
||||
docs (iterable / Doc): The Doc object(s) to convert.
|
||||
doc_id (int): Id for the JSON.
|
||||
RETURNS (dict): The data in spaCy's JSON format
|
||||
- each input doc will be treated as a paragraph in the output doc
|
||||
"""
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
json_doc = {"id": doc_id, "paragraphs": []}
|
||||
for i, doc in enumerate(docs):
|
||||
json_para = {'raw': doc.text, "sentences": [], "cats": [], "entities": [], "links": []}
|
||||
for cat, val in doc.cats.items():
|
||||
json_cat = {"label": cat, "value": val}
|
||||
json_para["cats"].append(json_cat)
|
||||
# warning: entities information is currently duplicated as
|
||||
# doc-level "entities" and token-level "ner"
|
||||
for ent in doc.ents:
|
||||
ent_tuple = (ent.start_char, ent.end_char, ent.label_)
|
||||
json_para["entities"].append(ent_tuple)
|
||||
if ent.kb_id_:
|
||||
link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}}
|
||||
json_para["links"].append(link_dict)
|
||||
biluo_tags = biluo_tags_from_offsets(doc, json_para["entities"], missing=ner_missing_tag)
|
||||
for j, sent in enumerate(doc.sents):
|
||||
json_sent = {"tokens": [], "brackets": []}
|
||||
for token in sent:
|
||||
json_token = {"id": token.i, "orth": token.text, "space": token.whitespace_}
|
||||
if doc.is_tagged:
|
||||
json_token["tag"] = token.tag_
|
||||
json_token["pos"] = token.pos_
|
||||
json_token["morph"] = token.morph_
|
||||
json_token["lemma"] = token.lemma_
|
||||
if doc.is_parsed:
|
||||
json_token["head"] = token.head.i-token.i
|
||||
json_token["dep"] = token.dep_
|
||||
json_token["ner"] = biluo_tags[token.i]
|
||||
json_sent["tokens"].append(json_token)
|
||||
json_para["sentences"].append(json_sent)
|
||||
json_doc["paragraphs"].append(json_para)
|
||||
return json_doc
|
||||
|
||||
|
||||
def read_json_file(loc, docs_filter=None, limit=None):
|
||||
"""Read Example dictionaries from a json file or directory."""
|
||||
loc = util.ensure_path(loc)
|
||||
if loc.is_dir():
|
||||
for filename in loc.iterdir():
|
||||
yield from read_json_file(loc / filename, limit=limit)
|
||||
else:
|
||||
with loc.open("rb") as file_:
|
||||
utf8_str = file_.read()
|
||||
for json_doc in json_iterate(utf8_str):
|
||||
if docs_filter is not None and not docs_filter(json_doc):
|
||||
continue
|
||||
for json_paragraph in json_to_annotations(json_doc):
|
||||
yield json_paragraph
|
||||
|
||||
|
||||
def json_to_annotations(doc):
|
||||
"""Convert an item in the JSON-formatted training data to the format
|
||||
used by Example.
|
||||
|
||||
doc (dict): One entry in the training data.
|
||||
YIELDS (tuple): The reformatted data - one training example per paragraph
|
||||
"""
|
||||
for paragraph in doc["paragraphs"]:
|
||||
example = {"text": paragraph.get("raw", None)}
|
||||
words = []
|
||||
spaces = []
|
||||
ids = []
|
||||
tags = []
|
||||
ner_tags = []
|
||||
pos = []
|
||||
morphs = []
|
||||
lemmas = []
|
||||
heads = []
|
||||
labels = []
|
||||
sent_starts = []
|
||||
brackets = []
|
||||
for sent in paragraph["sentences"]:
|
||||
sent_start_i = len(words)
|
||||
for i, token in enumerate(sent["tokens"]):
|
||||
words.append(token["orth"])
|
||||
spaces.append(token.get("space", None))
|
||||
ids.append(token.get('id', sent_start_i + i))
|
||||
tags.append(token.get("tag", None))
|
||||
pos.append(token.get("pos", None))
|
||||
morphs.append(token.get("morph", None))
|
||||
lemmas.append(token.get("lemma", None))
|
||||
if "head" in token:
|
||||
heads.append(token["head"] + sent_start_i + i)
|
||||
else:
|
||||
heads.append(None)
|
||||
if "dep" in token:
|
||||
labels.append(token["dep"])
|
||||
# Ensure ROOT label is case-insensitive
|
||||
if labels[-1].lower() == "root":
|
||||
labels[-1] = "ROOT"
|
||||
else:
|
||||
labels.append(None)
|
||||
ner_tags.append(token.get("ner", None))
|
||||
if i == 0:
|
||||
sent_starts.append(1)
|
||||
else:
|
||||
sent_starts.append(0)
|
||||
if "brackets" in sent:
|
||||
brackets.extend((b["first"] + sent_start_i,
|
||||
b["last"] + sent_start_i, b["label"])
|
||||
for b in sent["brackets"])
|
||||
|
||||
example["token_annotation"] = dict(
|
||||
ids=ids,
|
||||
words=words,
|
||||
spaces=spaces,
|
||||
sent_starts=sent_starts,
|
||||
brackets=brackets
|
||||
)
|
||||
# avoid including dummy values that looks like gold info was present
|
||||
if any(tags):
|
||||
example["token_annotation"]["tags"] = tags
|
||||
if any(pos):
|
||||
example["token_annotation"]["pos"] = pos
|
||||
if any(morphs):
|
||||
example["token_annotation"]["morphs"] = morphs
|
||||
if any(lemmas):
|
||||
example["token_annotation"]["lemmas"] = lemmas
|
||||
if any(head is not None for head in heads):
|
||||
example["token_annotation"]["heads"] = heads
|
||||
if any(labels):
|
||||
example["token_annotation"]["deps"] = labels
|
||||
|
||||
cats = {}
|
||||
for cat in paragraph.get("cats", {}):
|
||||
cats[cat["label"]] = cat["value"]
|
||||
example["doc_annotation"] = dict(
|
||||
cats=cats,
|
||||
entities=ner_tags,
|
||||
links=paragraph.get("links", [])
|
||||
)
|
||||
yield example
|
||||
|
||||
def json_iterate(bytes utf8_str):
|
||||
# We should've made these files jsonl...But since we didn't, parse out
|
||||
# the docs one-by-one to reduce memory usage.
|
||||
# It's okay to read in the whole file -- just don't parse it into JSON.
|
||||
cdef long file_length = len(utf8_str)
|
||||
if file_length > 2 ** 30:
|
||||
warnings.warn(Warnings.W027.format(size=file_length))
|
||||
|
||||
raw = <char*>utf8_str
|
||||
cdef int square_depth = 0
|
||||
cdef int curly_depth = 0
|
||||
cdef int inside_string = 0
|
||||
cdef int escape = 0
|
||||
cdef long start = -1
|
||||
cdef char c
|
||||
cdef char quote = ord('"')
|
||||
cdef char backslash = ord("\\")
|
||||
cdef char open_square = ord("[")
|
||||
cdef char close_square = ord("]")
|
||||
cdef char open_curly = ord("{")
|
||||
cdef char close_curly = ord("}")
|
||||
for i in range(file_length):
|
||||
c = raw[i]
|
||||
if escape:
|
||||
escape = False
|
||||
continue
|
||||
if c == backslash:
|
||||
escape = True
|
||||
continue
|
||||
if c == quote:
|
||||
inside_string = not inside_string
|
||||
continue
|
||||
if inside_string:
|
||||
continue
|
||||
if c == open_square:
|
||||
square_depth += 1
|
||||
elif c == close_square:
|
||||
square_depth -= 1
|
||||
elif c == open_curly:
|
||||
if square_depth == 1 and curly_depth == 0:
|
||||
start = i
|
||||
curly_depth += 1
|
||||
elif c == close_curly:
|
||||
curly_depth -= 1
|
||||
if square_depth == 1 and curly_depth == 0:
|
||||
substr = utf8_str[start : i + 1].decode("utf8")
|
||||
yield srsly.json_loads(substr)
|
||||
start = -1
|
209
spacy/gold/iob_utils.py
Normal file
209
spacy/gold/iob_utils.py
Normal file
|
@ -0,0 +1,209 @@
|
|||
import warnings
|
||||
from ..errors import Errors, Warnings
|
||||
from ..tokens import Span
|
||||
|
||||
|
||||
def iob_to_biluo(tags):
|
||||
out = []
|
||||
tags = list(tags)
|
||||
while tags:
|
||||
out.extend(_consume_os(tags))
|
||||
out.extend(_consume_ent(tags))
|
||||
return out
|
||||
|
||||
|
||||
def biluo_to_iob(tags):
|
||||
out = []
|
||||
for tag in tags:
|
||||
if tag is None:
|
||||
out.append(tag)
|
||||
else:
|
||||
tag = tag.replace("U-", "B-", 1).replace("L-", "I-", 1)
|
||||
out.append(tag)
|
||||
return out
|
||||
|
||||
|
||||
def _consume_os(tags):
|
||||
while tags and tags[0] == "O":
|
||||
yield tags.pop(0)
|
||||
|
||||
|
||||
def _consume_ent(tags):
|
||||
if not tags:
|
||||
return []
|
||||
tag = tags.pop(0)
|
||||
target_in = "I" + tag[1:]
|
||||
target_last = "L" + tag[1:]
|
||||
length = 1
|
||||
while tags and tags[0] in {target_in, target_last}:
|
||||
length += 1
|
||||
tags.pop(0)
|
||||
label = tag[2:]
|
||||
if length == 1:
|
||||
if len(label) == 0:
|
||||
raise ValueError(Errors.E177.format(tag=tag))
|
||||
return ["U-" + label]
|
||||
else:
|
||||
start = "B-" + label
|
||||
end = "L-" + label
|
||||
middle = [f"I-{label}" for _ in range(1, length - 1)]
|
||||
return [start] + middle + [end]
|
||||
|
||||
|
||||
def biluo_tags_from_doc(doc, missing="O"):
|
||||
return biluo_tags_from_offsets(
|
||||
doc,
|
||||
[(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents],
|
||||
missing=missing,
|
||||
)
|
||||
|
||||
|
||||
def biluo_tags_from_offsets(doc, entities, missing="O"):
|
||||
"""Encode labelled spans into per-token tags, using the
|
||||
Begin/In/Last/Unit/Out scheme (BILUO).
|
||||
|
||||
doc (Doc): The document that the entity offsets refer to. The output tags
|
||||
will refer to the token boundaries within the document.
|
||||
entities (iterable): A sequence of `(start, end, label)` triples. `start`
|
||||
and `end` should be character-offset integers denoting the slice into
|
||||
the original string.
|
||||
RETURNS (list): A list of unicode strings, describing the tags. Each tag
|
||||
string will be of the form either "", "O" or "{action}-{label}", where
|
||||
action is one of "B", "I", "L", "U". The string "-" is used where the
|
||||
entity offsets don't align with the tokenization in the `Doc` object.
|
||||
The training algorithm will view these as missing values. "O" denotes a
|
||||
non-entity token. "B" denotes the beginning of a multi-token entity,
|
||||
"I" the inside of an entity of three or more tokens, and "L" the end
|
||||
of an entity of two or more tokens. "U" denotes a single-token entity.
|
||||
|
||||
EXAMPLE:
|
||||
>>> text = 'I like London.'
|
||||
>>> entities = [(len('I like '), len('I like London'), 'LOC')]
|
||||
>>> doc = nlp.tokenizer(text)
|
||||
>>> tags = biluo_tags_from_offsets(doc, entities)
|
||||
>>> assert tags == ["O", "O", 'U-LOC', "O"]
|
||||
"""
|
||||
# Ensure no overlapping entity labels exist
|
||||
tokens_in_ents = {}
|
||||
|
||||
starts = {token.idx: token.i for token in doc}
|
||||
ends = {token.idx + len(token): token.i for token in doc}
|
||||
biluo = ["-" for _ in doc]
|
||||
# Handle entity cases
|
||||
for start_char, end_char, label in entities:
|
||||
if not label:
|
||||
for s in starts: # account for many-to-one
|
||||
if s >= start_char and s < end_char:
|
||||
biluo[starts[s]] = "O"
|
||||
else:
|
||||
for token_index in range(start_char, end_char):
|
||||
if token_index in tokens_in_ents.keys():
|
||||
raise ValueError(
|
||||
Errors.E103.format(
|
||||
span1=(
|
||||
tokens_in_ents[token_index][0],
|
||||
tokens_in_ents[token_index][1],
|
||||
tokens_in_ents[token_index][2],
|
||||
),
|
||||
span2=(start_char, end_char, label),
|
||||
)
|
||||
)
|
||||
tokens_in_ents[token_index] = (start_char, end_char, label)
|
||||
|
||||
start_token = starts.get(start_char)
|
||||
end_token = ends.get(end_char)
|
||||
# Only interested if the tokenization is correct
|
||||
if start_token is not None and end_token is not None:
|
||||
if start_token == end_token:
|
||||
biluo[start_token] = f"U-{label}"
|
||||
else:
|
||||
biluo[start_token] = f"B-{label}"
|
||||
for i in range(start_token + 1, end_token):
|
||||
biluo[i] = f"I-{label}"
|
||||
biluo[end_token] = f"L-{label}"
|
||||
# Now distinguish the O cases from ones where we miss the tokenization
|
||||
entity_chars = set()
|
||||
for start_char, end_char, label in entities:
|
||||
for i in range(start_char, end_char):
|
||||
entity_chars.add(i)
|
||||
for token in doc:
|
||||
for i in range(token.idx, token.idx + len(token)):
|
||||
if i in entity_chars:
|
||||
break
|
||||
else:
|
||||
biluo[token.i] = missing
|
||||
if "-" in biluo and missing != "-":
|
||||
ent_str = str(entities)
|
||||
warnings.warn(
|
||||
Warnings.W030.format(
|
||||
text=doc.text[:50] + "..." if len(doc.text) > 50 else doc.text,
|
||||
entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
|
||||
)
|
||||
)
|
||||
return biluo
|
||||
|
||||
|
||||
def spans_from_biluo_tags(doc, tags):
|
||||
"""Encode per-token tags following the BILUO scheme into Span object, e.g.
|
||||
to overwrite the doc.ents.
|
||||
|
||||
doc (Doc): The document that the BILUO tags refer to.
|
||||
entities (iterable): A sequence of BILUO tags with each tag describing one
|
||||
token. Each tags string will be of the form of either "", "O" or
|
||||
"{action}-{label}", where action is one of "B", "I", "L", "U".
|
||||
RETURNS (list): A sequence of Span objects.
|
||||
"""
|
||||
token_offsets = tags_to_entities(tags)
|
||||
spans = []
|
||||
for label, start_idx, end_idx in token_offsets:
|
||||
span = Span(doc, start_idx, end_idx + 1, label=label)
|
||||
spans.append(span)
|
||||
return spans
|
||||
|
||||
|
||||
def offsets_from_biluo_tags(doc, tags):
|
||||
"""Encode per-token tags following the BILUO scheme into entity offsets.
|
||||
|
||||
doc (Doc): The document that the BILUO tags refer to.
|
||||
entities (iterable): A sequence of BILUO tags with each tag describing one
|
||||
token. Each tags string will be of the form of either "", "O" or
|
||||
"{action}-{label}", where action is one of "B", "I", "L", "U".
|
||||
RETURNS (list): A sequence of `(start, end, label)` triples. `start` and
|
||||
`end` will be character-offset integers denoting the slice into the
|
||||
original string.
|
||||
"""
|
||||
spans = spans_from_biluo_tags(doc, tags)
|
||||
return [(span.start_char, span.end_char, span.label_) for span in spans]
|
||||
|
||||
|
||||
def tags_to_entities(tags):
|
||||
""" Note that the end index returned by this function is inclusive.
|
||||
To use it for Span creation, increment the end by 1."""
|
||||
entities = []
|
||||
start = None
|
||||
for i, tag in enumerate(tags):
|
||||
if tag is None:
|
||||
continue
|
||||
if tag.startswith("O"):
|
||||
# TODO: We shouldn't be getting these malformed inputs. Fix this.
|
||||
if start is not None:
|
||||
start = None
|
||||
else:
|
||||
entities.append(("", i, i))
|
||||
continue
|
||||
elif tag == "-":
|
||||
continue
|
||||
elif tag.startswith("I"):
|
||||
if start is None:
|
||||
raise ValueError(Errors.E067.format(tags=tags[: i + 1]))
|
||||
continue
|
||||
if tag.startswith("U"):
|
||||
entities.append((tag[2:], i, i))
|
||||
elif tag.startswith("B"):
|
||||
start = i
|
||||
elif tag.startswith("L"):
|
||||
entities.append((tag[2:], start, i))
|
||||
start = None
|
||||
else:
|
||||
raise ValueError(Errors.E068.format(tag=tag))
|
||||
return entities
|
|
@ -1,15 +1,15 @@
|
|||
"""Knowledge-base for entity or concept linking."""
|
||||
from cymem.cymem cimport Pool
|
||||
from preshed.maps cimport PreshMap
|
||||
|
||||
from libcpp.vector cimport vector
|
||||
from libc.stdint cimport int32_t, int64_t
|
||||
from libc.stdio cimport FILE
|
||||
|
||||
from .vocab cimport Vocab
|
||||
from .typedefs cimport hash_t
|
||||
|
||||
from .structs cimport KBEntryC, AliasC
|
||||
|
||||
|
||||
ctypedef vector[KBEntryC] entry_vec
|
||||
ctypedef vector[AliasC] alias_vec
|
||||
ctypedef vector[float] float_vec
|
||||
|
|
13
spacy/kb.pyx
13
spacy/kb.pyx
|
@ -1,6 +1,4 @@
|
|||
# cython: infer_types=True
|
||||
# cython: profile=True
|
||||
# coding: utf8
|
||||
# cython: infer_types=True, profile=True
|
||||
from cymem.cymem cimport Pool
|
||||
from preshed.maps cimport PreshMap
|
||||
from cpython.exc cimport PyErr_SetFromErrno
|
||||
|
@ -8,12 +6,11 @@ from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
|
|||
from libc.stdint cimport int32_t, int64_t
|
||||
from libcpp.vector cimport vector
|
||||
|
||||
from pathlib import Path
|
||||
import warnings
|
||||
from os import path
|
||||
from pathlib import Path
|
||||
|
||||
from .typedefs cimport hash_t
|
||||
|
||||
from .errors import Errors, Warnings
|
||||
|
||||
|
||||
|
@ -41,7 +38,7 @@ cdef class Candidate:
|
|||
|
||||
@property
|
||||
def entity_(self):
|
||||
"""RETURNS (unicode): ID/name of this entity in the KB"""
|
||||
"""RETURNS (str): ID/name of this entity in the KB"""
|
||||
return self.kb.vocab.strings[self.entity_hash]
|
||||
|
||||
@property
|
||||
|
@ -51,7 +48,7 @@ cdef class Candidate:
|
|||
|
||||
@property
|
||||
def alias_(self):
|
||||
"""RETURNS (unicode): ID of the original alias"""
|
||||
"""RETURNS (str): ID of the original alias"""
|
||||
return self.kb.vocab.strings[self.alias_hash]
|
||||
|
||||
@property
|
||||
|
@ -445,6 +442,8 @@ cdef class KnowledgeBase:
|
|||
|
||||
cdef class Writer:
|
||||
def __init__(self, object loc):
|
||||
if path.exists(loc):
|
||||
assert not path.isdir(loc), f"{loc} is directory"
|
||||
if isinstance(loc, Path):
|
||||
loc = bytes(loc)
|
||||
if path.exists(loc):
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user