mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 01:34:30 +03:00
Merge branch 'develop' into master-tmp
This commit is contained in:
commit
52728d8fa3
4
.gitignore
vendored
4
.gitignore
vendored
|
@ -44,6 +44,7 @@ __pycache__/
|
|||
.env*
|
||||
.~env/
|
||||
.venv
|
||||
env3.6/
|
||||
venv/
|
||||
env3.*/
|
||||
.dev
|
||||
|
@ -118,3 +119,6 @@ Desktop.ini
|
|||
|
||||
# Pycharm project files
|
||||
*.idea
|
||||
|
||||
# IPython
|
||||
.ipynb_checkpoints/
|
||||
|
|
23
.travis.yml
23
.travis.yml
|
@ -1,23 +0,0 @@
|
|||
language: python
|
||||
sudo: false
|
||||
cache: pip
|
||||
dist: trusty
|
||||
group: edge
|
||||
python:
|
||||
- "2.7"
|
||||
os:
|
||||
- linux
|
||||
install:
|
||||
- "pip install -r requirements.txt"
|
||||
- "python setup.py build_ext --inplace"
|
||||
- "pip install -e ."
|
||||
script:
|
||||
- "cat /proc/cpuinfo | grep flags | head -n 1"
|
||||
- "python -m pytest --tb=native spacy"
|
||||
branches:
|
||||
except:
|
||||
- spacy.io
|
||||
notifications:
|
||||
slack:
|
||||
secure: F8GvqnweSdzImuLL64TpfG0i5rYl89liyr9tmFVsHl4c0DNiDuGhZivUz0M1broS8svE3OPOllLfQbACG/4KxD890qfF9MoHzvRDlp7U+RtwMV/YAkYn8MGWjPIbRbX0HpGdY7O2Rc9Qy4Kk0T8ZgiqXYIqAz2Eva9/9BlSmsJQ=
|
||||
email: false
|
|
@ -280,23 +280,7 @@ except: # noqa: E722
|
|||
|
||||
### Python conventions
|
||||
|
||||
All Python code must be written in an **intersection of Python 2 and Python 3**.
|
||||
This is easy in Cython, but somewhat ugly in Python. Logic that deals with
|
||||
Python or platform compatibility should only live in
|
||||
[`spacy.compat`](spacy/compat.py). To distinguish them from the builtin
|
||||
functions, replacement functions are suffixed with an underscore, for example
|
||||
`unicode_`. If you need to access the user's version or platform information,
|
||||
for example to show more specific error messages, you can use the `is_config()`
|
||||
helper function.
|
||||
|
||||
```python
|
||||
from .compat import unicode_, is_config
|
||||
|
||||
compatible_unicode = unicode_('hello world')
|
||||
if is_config(windows=True, python2=True):
|
||||
print("You are using Python 2 on Windows.")
|
||||
```
|
||||
|
||||
All Python code must be **compatible with Python 3.6+**.
|
||||
Code that interacts with the file-system should accept objects that follow the
|
||||
`pathlib.Path` API, without assuming that the object inherits from `pathlib.Path`.
|
||||
If the function is user-facing and takes a path as an argument, it should check
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
recursive-include include *.h
|
||||
recursive-include spacy *.txt *.pyx *.pxd
|
||||
recursive-include spacy *.pyx *.pxd *.txt *.cfg
|
||||
include LICENSE
|
||||
include README.md
|
||||
include bin/spacy
|
||||
|
|
4
Makefile
4
Makefile
|
@ -5,7 +5,7 @@ VENV := ./env$(PYVER)
|
|||
version := $(shell "bin/get-version.sh")
|
||||
|
||||
dist/spacy-$(version).pex : wheelhouse/spacy-$(version).stamp
|
||||
$(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema spacy-lookups-data jieba pkuseg==0.0.22 sudachipy sudachidict_core
|
||||
$(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) spacy-lookups-data jieba pkuseg==0.0.22 sudachipy sudachidict_core
|
||||
chmod a+rx $@
|
||||
cp $@ dist/spacy.pex
|
||||
|
||||
|
@ -15,7 +15,7 @@ dist/pytest.pex : wheelhouse/pytest-*.whl
|
|||
|
||||
wheelhouse/spacy-$(version).stamp : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py*
|
||||
$(VENV)/bin/pip wheel . -w ./wheelhouse
|
||||
$(VENV)/bin/pip wheel jsonschema spacy-lookups-data jieba pkuseg==0.0.22 sudachipy sudachidict_core -w ./wheelhouse
|
||||
$(VENV)/bin/pip wheel spacy-lookups-data jieba pkuseg==0.0.22 sudachipy sudachidict_core -w ./wheelhouse
|
||||
touch $@
|
||||
|
||||
wheelhouse/pytest-%.whl : $(VENV)/bin/pex
|
||||
|
|
14
README.md
14
README.md
|
@ -15,7 +15,6 @@ It's commercial open-source software, released under the MIT license.
|
|||
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
|
||||
|
||||
[![Azure Pipelines](<https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build+(3.x)>)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
|
||||
[![Travis Build Status](<https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square&logo=travis-ci&logoColor=white&label=build+(2.7)>)](https://travis-ci.org/explosion/spaCy)
|
||||
[![Current Release Version](https://img.shields.io/github/release/explosion/spacy.svg?style=flat-square&logo=github)](https://github.com/explosion/spaCy/releases)
|
||||
[![pypi Version](https://img.shields.io/pypi/v/spacy.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/spacy/)
|
||||
[![conda Version](https://img.shields.io/conda/vn/conda-forge/spacy.svg?style=flat-square&logo=conda-forge&logoColor=white)](https://anaconda.org/conda-forge/spacy)
|
||||
|
@ -98,12 +97,19 @@ For detailed installation instructions, see the
|
|||
|
||||
- **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual
|
||||
Studio)
|
||||
- **Python version**: Python 2.7, 3.5+ (only 64 bit)
|
||||
- **Python version**: Python 3.6+ (only 64 bit)
|
||||
- **Package managers**: [pip] · [conda] (via `conda-forge`)
|
||||
|
||||
[pip]: https://pypi.org/project/spacy/
|
||||
[conda]: https://anaconda.org/conda-forge/spacy
|
||||
|
||||
> ⚠️ **Important note for Python 3.8:** We can't yet ship pre-compiled binary
|
||||
> wheels for spaCy that work on Python 3.8, as we're still waiting for our CI
|
||||
> providers and other tooling to support it. This means that in order to run
|
||||
> spaCy on Python 3.8, you'll need [a compiler installed](#source) and compile
|
||||
> the library and its Cython dependencies locally. If this is causing problems
|
||||
> for you, the easiest solution is to **use Python 3.7** in the meantime.
|
||||
|
||||
### pip
|
||||
|
||||
Using pip, spaCy releases are available as source packages and binary wheels (as
|
||||
|
@ -263,9 +269,7 @@ and git preinstalled.
|
|||
Install a version of the
|
||||
[Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/)
|
||||
or [Visual Studio Express](https://visualstudio.microsoft.com/vs/express/) that
|
||||
matches the version that was used to compile your Python interpreter. For
|
||||
official distributions these are VS 2008 (Python 2.7), VS 2010 (Python 3.4) and
|
||||
VS 2015 (Python 3.5).
|
||||
matches the version that was used to compile your Python interpreter.
|
||||
|
||||
## Run tests
|
||||
|
||||
|
|
|
@ -27,7 +27,7 @@ jobs:
|
|||
inputs:
|
||||
versionSpec: '3.7'
|
||||
- script: |
|
||||
pip install flake8
|
||||
pip install flake8==3.5.0
|
||||
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
|
||||
displayName: 'flake8'
|
||||
|
||||
|
@ -35,12 +35,6 @@ jobs:
|
|||
dependsOn: 'Validate'
|
||||
strategy:
|
||||
matrix:
|
||||
Python35Linux:
|
||||
imageName: 'ubuntu-16.04'
|
||||
python.version: '3.5'
|
||||
Python35Windows:
|
||||
imageName: 'vs2017-win2016'
|
||||
python.version: '3.5'
|
||||
Python36Linux:
|
||||
imageName: 'ubuntu-16.04'
|
||||
python.version: '3.6'
|
||||
|
@ -58,7 +52,7 @@ jobs:
|
|||
# imageName: 'vs2017-win2016'
|
||||
# python.version: '3.7'
|
||||
# Python37Mac:
|
||||
# imageName: 'macos-10.13'
|
||||
# imageName: 'macos-10.14'
|
||||
# python.version: '3.7'
|
||||
Python38Linux:
|
||||
imageName: 'ubuntu-16.04'
|
||||
|
|
169
bin/cythonize.py
169
bin/cythonize.py
|
@ -1,169 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
""" cythonize.py
|
||||
|
||||
Cythonize pyx files into C++ files as needed.
|
||||
|
||||
Usage: cythonize.py [root]
|
||||
|
||||
Checks pyx files to see if they have been changed relative to their
|
||||
corresponding C++ files. If they have, then runs cython on these files to
|
||||
recreate the C++ files.
|
||||
|
||||
Additionally, checks pxd files and setup.py if they have been changed. If
|
||||
they have, rebuilds everything.
|
||||
|
||||
Change detection based on file hashes stored in JSON format.
|
||||
|
||||
For now, this script should be run by developers when changing Cython files
|
||||
and the resulting C++ files checked in, so that end-users (and Python-only
|
||||
developers) do not get the Cython dependencies.
|
||||
|
||||
Based upon:
|
||||
|
||||
https://raw.github.com/dagss/private-scipy-refactor/cythonize/cythonize.py
|
||||
https://raw.githubusercontent.com/numpy/numpy/master/tools/cythonize.py
|
||||
|
||||
Note: this script does not check any of the dependent C++ libraries.
|
||||
"""
|
||||
from __future__ import print_function
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import hashlib
|
||||
import subprocess
|
||||
import argparse
|
||||
|
||||
|
||||
HASH_FILE = "cythonize.json"
|
||||
|
||||
|
||||
def process_pyx(fromfile, tofile, language_level="-2"):
    """Run Cython on ``fromfile`` and write the generated source to ``tofile``.

    fromfile: path of the .pyx input file.
    tofile: path of the output file; a name ending in ".cpp" adds ``--cplus``.
    language_level: flag handed to Cython verbatim (default "-2").

    Raises Exception when an importable Cython is older than 0.19 or when the
    Cython run exits with a non-zero status. Raises OSError when the fallback
    invocation through the current interpreter cannot be started either.
    """
    print("Processing %s" % fromfile)

    # Version gate; silently skipped when the required modules are missing.
    try:
        from Cython.Compiler.Version import version as cython_version
        from distutils.version import LooseVersion
    except ImportError:
        pass
    else:
        if LooseVersion(cython_version) < LooseVersion("0.19"):
            raise Exception("Require Cython >= 0.19")

    cython_args = ["--fast-fail", language_level]
    if tofile.endswith(".cpp"):
        cython_args.append("--cplus")
    io_args = ["-o", tofile, fromfile]

    try:
        try:
            # First choice: the ``cython`` executable found on the PATH.
            status = subprocess.call(
                ["cython"] + cython_args + io_args, env=os.environ
            )  # See Issue #791
        except OSError:
            # There are ways of installing Cython that don't result in a cython
            # executable on the path, see gh-2397.
            via_interpreter = [
                sys.executable,
                "-c",
                "import sys; from Cython.Compiler.Main import "
                "setuptools_main as main; sys.exit(main())",
            ]
            status = subprocess.call(via_interpreter + cython_args + io_args)
        if status != 0:
            raise Exception("Cython failed")
    except OSError:
        raise OSError("Cython needs to be installed")
|
||||
|
||||
|
||||
def preserve_cwd(path, func, *args):
    """Call ``func(*args)`` with ``path`` as the current working directory.

    The working directory that was active before the call is restored
    afterwards, including when ``os.chdir`` or ``func`` raises.

    path: directory to switch into for the duration of the call.
    func: callable to invoke.
    *args: positional arguments forwarded to ``func``.
    RETURNS: whatever ``func`` returns (previously the result was dropped).
    """
    orig_cwd = os.getcwd()
    try:
        os.chdir(path)
        return func(*args)
    finally:
        # Always go back, so a failing call cannot leave the process stranded
        # in ``path``.
        os.chdir(orig_cwd)
|
||||
|
||||
|
||||
def load_hashes(filename):
    """Load the hash database stored as JSON in ``filename``.

    filename: path of the JSON file to read.
    RETURNS: the decoded JSON content, or an empty dict when the file cannot
        be opened or does not contain valid JSON.
    """
    try:
        # Use a context manager so the handle is closed even if parsing fails.
        with open(filename) as hash_file:
            return json.load(hash_file)
    except (ValueError, IOError):
        return {}
|
||||
|
||||
|
||||
def save_hashes(hash_db, filename):
    """Write ``hash_db`` to ``filename`` as JSON, replacing previous content.

    hash_db: JSON-serializable mapping to store.
    filename: path of the file to (over)write.
    """
    with open(filename, "w") as out_file:
        serialized = json.dumps(hash_db)
        out_file.write(serialized)
|
||||
|
||||
|
||||
def get_hash(path):
    """Return the hexadecimal MD5 digest of the content of the file at ``path``.

    The file is read in binary mode and closed before returning. The digest is
    used for change detection only.
    """
    with open(path, "rb") as source:
        return hashlib.md5(source.read()).hexdigest()
|
||||
|
||||
|
||||
def hash_changed(base, path, db):
    """Tell whether a file's current hash differs from the one stored in ``db``.

    base, path: joined and normalized to form both the file location and the
        lookup key in ``db``.
    db: mapping of normalized path -> hash string.
    RETURNS: True when the hashes differ or the path has no entry in ``db``.
    """
    key = os.path.normpath(os.path.join(base, path))
    current = get_hash(key)
    # A missing entry yields None, which never equals a digest string.
    return current != db.get(key)
|
||||
|
||||
|
||||
def hash_add(base, path, db):
    """Store the current hash of a file in ``db``.

    base, path: joined and normalized to form both the file location and the
        key written to ``db``.
    db: mapping of normalized path -> hash string; updated in place.
    """
    key = os.path.normpath(os.path.join(base, path))
    digest = get_hash(key)
    db[key] = digest
|
||||
|
||||
|
||||
def process(base, filename, db):
    """Re-run Cython for one file when it changed or its .cpp output is missing.

    Only names ending in ".pyx" or ".cpp" are considered. After a rebuild the
    hashes of both the .cpp and the .pyx file are recorded in ``db``.

    base: directory containing ``filename``.
    filename: file name relative to ``base``.
    db: mapping of normalized path -> hash string; updated in place.
    """
    stem, suffix = os.path.splitext(filename)
    if suffix not in (".pyx", ".cpp"):
        return

    pyx_name = stem + ".pyx"
    cpp_name = stem + ".cpp"
    # The hash is checked first; the existence test only runs when the hash
    # is unchanged.
    if not hash_changed(base, filename, db):
        if os.path.isfile(os.path.join(base, cpp_name)):
            return

    # Cython is invoked from inside ``base`` with bare file names.
    preserve_cwd(base, process_pyx, pyx_name, cpp_name)
    hash_add(base, cpp_name, db)
    hash_add(base, pyx_name, db)
|
||||
|
||||
|
||||
def check_changes(root, db):
    """Detect changes to setup.py or any .pxd file under ``root``.

    When at least one of them changed, ``db`` is emptied and refilled with the
    fresh hashes of setup.py and the .pxd files only.

    root: directory tree to scan for .pxd files.
    db: mapping of normalized path -> hash string; possibly reset in place.
    RETURNS: True when a change was detected, otherwise False.
    """
    fresh = {}

    setup_filename = "setup.py"
    hash_add(".", setup_filename, fresh)
    changed = bool(hash_changed(".", setup_filename, db))

    for dirpath, _, names in os.walk(root):
        for name in names:
            if not name.endswith(".pxd"):
                continue
            hash_add(dirpath, name, fresh)
            if hash_changed(dirpath, name, db):
                changed = True

    if not changed:
        return False

    db.clear()
    db.update(fresh)
    return True
|
||||
|
||||
|
||||
def run(root):
    """Process every file under ``root`` and persist the hash database.

    The database is loaded from HASH_FILE, updated while walking the tree,
    and written back even when processing fails part-way.

    root: directory tree to scan.
    """
    hash_db = load_hashes(HASH_FILE)

    try:
        check_changes(root, hash_db)
        for dirpath, _, names in os.walk(root):
            for name in names:
                process(dirpath, name, hash_db)
    finally:
        # Keep whatever was recorded so far, including after an error.
        save_hashes(hash_db, HASH_FILE)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: one positional argument, the directory to scan.
    cli = argparse.ArgumentParser(
        description="Cythonize pyx files into C++ files as needed"
    )
    cli.add_argument("root", help="root directory")
    run(cli.parse_args().root)
|
|
@ -13,23 +13,12 @@ import srsly
|
|||
import spacy
|
||||
import spacy.util
|
||||
from spacy.tokens import Token, Doc
|
||||
from spacy.gold import GoldParse
|
||||
from spacy.util import compounding, minibatch_by_words
|
||||
from spacy.syntax.nonproj import projectivize
|
||||
from spacy.matcher import Matcher
|
||||
|
||||
# from spacy.morphology import Fused_begin, Fused_inside
|
||||
from spacy import displacy
|
||||
from collections import defaultdict, Counter
|
||||
from timeit import default_timer as timer
|
||||
|
||||
Fused_begin = None
|
||||
Fused_inside = None
|
||||
|
||||
import itertools
|
||||
import random
|
||||
import numpy.random
|
||||
|
||||
from . import conll17_ud_eval
|
||||
|
||||
from spacy import lang
|
||||
|
@ -268,7 +257,7 @@ def load_nlp(experiments_dir, corpus):
|
|||
return nlp
|
||||
|
||||
|
||||
def initialize_pipeline(nlp, docs, golds, config, device):
|
||||
def initialize_pipeline(nlp, examples, config, device):
|
||||
nlp.add_pipe(nlp.create_pipe("parser"))
|
||||
return nlp
|
||||
|
||||
|
|
|
@ -14,7 +14,7 @@ import spacy
|
|||
import spacy.util
|
||||
from bin.ud import conll17_ud_eval
|
||||
from spacy.tokens import Token, Doc
|
||||
from spacy.gold import GoldParse
|
||||
from spacy.gold import GoldParse, Example
|
||||
from spacy.util import compounding, minibatch, minibatch_by_words
|
||||
from spacy.syntax.nonproj import projectivize
|
||||
from spacy.matcher import Matcher
|
||||
|
@ -53,7 +53,7 @@ def read_data(
|
|||
max_doc_length=None,
|
||||
limit=None,
|
||||
):
|
||||
"""Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
|
||||
"""Read the CONLLU format into Example objects. If raw_text=True,
|
||||
include Doc objects created using nlp.make_doc and then aligned against
|
||||
the gold-standard sequences. If oracle_segments=True, include Doc objects
|
||||
created from the gold-standard segments. At least one must be True."""
|
||||
|
@ -98,15 +98,16 @@ def read_data(
|
|||
docs.append(doc)
|
||||
golds.append(gold)
|
||||
if limit and len(docs) >= limit:
|
||||
return docs, golds
|
||||
return golds_to_gold_data(docs, golds)
|
||||
|
||||
if raw_text and sent_annots:
|
||||
doc, gold = _make_gold(nlp, None, sent_annots)
|
||||
docs.append(doc)
|
||||
golds.append(gold)
|
||||
if limit and len(docs) >= limit:
|
||||
return docs, golds
|
||||
return docs, golds
|
||||
return golds_to_gold_data(docs, golds)
|
||||
return golds_to_gold_data(docs, golds)
|
||||
|
||||
|
||||
def _parse_morph_string(morph_string):
|
||||
if morph_string == '_':
|
||||
|
@ -120,6 +121,7 @@ def _parse_morph_string(morph_string):
|
|||
output.append('%s_%s' % (key, value.lower()))
|
||||
return set(output)
|
||||
|
||||
|
||||
def read_conllu(file_):
|
||||
docs = []
|
||||
sent = []
|
||||
|
@ -180,16 +182,18 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
|
|||
#############################
|
||||
|
||||
|
||||
def golds_to_gold_tuples(docs, golds):
|
||||
"""Get out the annoying 'tuples' format used by begin_training, given the
|
||||
def golds_to_gold_data(docs, golds):
|
||||
"""Get out the training data format used by begin_training, given the
|
||||
GoldParse objects."""
|
||||
tuples = []
|
||||
data = []
|
||||
for doc, gold in zip(docs, golds):
|
||||
text = doc.text
|
||||
ids, words, tags, heads, labels, iob = zip(*gold.orig_annot)
|
||||
sents = [((ids, words, tags, heads, labels, iob), [])]
|
||||
tuples.append((text, sents))
|
||||
return tuples
|
||||
example = Example(doc=doc)
|
||||
example.add_doc_annotation(cats=gold.cats)
|
||||
token_annotation_dict = gold.orig.to_dict()
|
||||
example.add_token_annotation(**token_annotation_dict)
|
||||
example.goldparse = gold
|
||||
data.append(example)
|
||||
return data
|
||||
|
||||
|
||||
##############
|
||||
|
@ -327,7 +331,6 @@ def get_token_conllu(token, i):
|
|||
return "\n".join(lines)
|
||||
|
||||
|
||||
|
||||
##################
|
||||
# Initialization #
|
||||
##################
|
||||
|
@ -348,7 +351,7 @@ def load_nlp(corpus, config, vectors=None):
|
|||
return nlp
|
||||
|
||||
|
||||
def initialize_pipeline(nlp, docs, golds, config, device):
|
||||
def initialize_pipeline(nlp, examples, config, device):
|
||||
nlp.add_pipe(nlp.create_pipe("tagger", config={"set_morphology": False}))
|
||||
nlp.add_pipe(nlp.create_pipe("morphologizer"))
|
||||
nlp.add_pipe(nlp.create_pipe("parser"))
|
||||
|
@ -356,14 +359,15 @@ def initialize_pipeline(nlp, docs, golds, config, device):
|
|||
nlp.parser.add_multitask_objective("tag")
|
||||
if config.multitask_sent:
|
||||
nlp.parser.add_multitask_objective("sent_start")
|
||||
for gold in golds:
|
||||
for ex in examples:
|
||||
gold = ex.gold
|
||||
for tag in gold.tags:
|
||||
if tag is not None:
|
||||
nlp.tagger.add_label(tag)
|
||||
if torch is not None and device != -1:
|
||||
torch.set_default_tensor_type("torch.cuda.FloatTensor")
|
||||
optimizer = nlp.begin_training(
|
||||
lambda: golds_to_gold_tuples(docs, golds),
|
||||
lambda: examples,
|
||||
device=device,
|
||||
subword_features=config.subword_features,
|
||||
conv_depth=config.conv_depth,
|
||||
|
@ -382,8 +386,8 @@ def _load_pretrained_tok2vec(nlp, loc):
|
|||
weights_data = file_.read()
|
||||
loaded = []
|
||||
for name, component in nlp.pipeline:
|
||||
if hasattr(component, "model") and hasattr(component.model, "tok2vec"):
|
||||
component.tok2vec.from_bytes(weights_data)
|
||||
if hasattr(component, "model") and component.model.has_ref("tok2vec"):
|
||||
component.get_ref("tok2vec").from_bytes(weights_data)
|
||||
loaded.append(name)
|
||||
return loaded
|
||||
|
||||
|
@ -491,6 +495,10 @@ def main(
|
|||
Token.set_extension("begins_fused", default=False)
|
||||
Token.set_extension("inside_fused", default=False)
|
||||
|
||||
Token.set_extension("get_conllu_lines", method=get_token_conllu)
|
||||
Token.set_extension("begins_fused", default=False)
|
||||
Token.set_extension("inside_fused", default=False)
|
||||
|
||||
spacy.util.fix_random_seed()
|
||||
lang.zh.Chinese.Defaults.use_jieba = False
|
||||
lang.ja.Japanese.Defaults.use_janome = False
|
||||
|
@ -505,7 +513,7 @@ def main(
|
|||
print("Train and evaluate", corpus, "using lang", paths.lang)
|
||||
nlp = load_nlp(paths.lang, config, vectors=vectors_dir)
|
||||
|
||||
docs, golds = read_data(
|
||||
examples = read_data(
|
||||
nlp,
|
||||
paths.train.conllu.open(encoding="utf8"),
|
||||
paths.train.text.open(encoding="utf8"),
|
||||
|
@ -513,12 +521,12 @@ def main(
|
|||
limit=limit,
|
||||
)
|
||||
|
||||
optimizer = initialize_pipeline(nlp, docs, golds, config, gpu_device)
|
||||
optimizer = initialize_pipeline(nlp, examples, config, gpu_device)
|
||||
|
||||
batch_sizes = compounding(config.min_batch_size, config.max_batch_size, 1.001)
|
||||
beam_prob = compounding(0.2, 0.8, 1.001)
|
||||
for i in range(config.nr_epoch):
|
||||
docs, golds = read_data(
|
||||
examples = read_data(
|
||||
nlp,
|
||||
paths.train.conllu.open(encoding="utf8"),
|
||||
paths.train.text.open(encoding="utf8"),
|
||||
|
@ -527,22 +535,19 @@ def main(
|
|||
oracle_segments=use_oracle_segments,
|
||||
raw_text=not use_oracle_segments,
|
||||
)
|
||||
Xs = list(zip(docs, golds))
|
||||
random.shuffle(Xs)
|
||||
random.shuffle(examples)
|
||||
if config.batch_by_words:
|
||||
batches = minibatch_by_words(Xs, size=batch_sizes)
|
||||
batches = minibatch_by_words(examples, size=batch_sizes)
|
||||
else:
|
||||
batches = minibatch(Xs, size=batch_sizes)
|
||||
batches = minibatch(examples, size=batch_sizes)
|
||||
losses = {}
|
||||
n_train_words = sum(len(doc) for doc in docs)
|
||||
n_train_words = sum(len(ex.doc) for ex in examples)
|
||||
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
|
||||
for batch in batches:
|
||||
batch_docs, batch_gold = zip(*batch)
|
||||
pbar.update(sum(len(doc) for doc in batch_docs))
|
||||
pbar.update(sum(len(ex.doc) for ex in batch))
|
||||
nlp.parser.cfg["beam_update_prob"] = next(beam_prob)
|
||||
nlp.update(
|
||||
batch_docs,
|
||||
batch_gold,
|
||||
batch,
|
||||
sgd=optimizer,
|
||||
drop=config.dropout,
|
||||
losses=losses,
|
||||
|
|
|
@ -14,7 +14,7 @@ pip install keras==2.0.9
|
|||
|
||||
Compatible with: spaCy v2.0.0+
|
||||
"""
|
||||
|
||||
import ml_datasets
|
||||
import plac
|
||||
import random
|
||||
import pathlib
|
||||
|
@ -24,7 +24,6 @@ from keras.models import Sequential, model_from_json
|
|||
from keras.layers import LSTM, Dense, Embedding, Bidirectional
|
||||
from keras.layers import TimeDistributed
|
||||
from keras.optimizers import Adam
|
||||
import thinc.extra.datasets
|
||||
from spacy.compat import pickle
|
||||
import spacy
|
||||
|
||||
|
@ -224,7 +223,7 @@ def main(
|
|||
if model_dir is not None:
|
||||
model_dir = pathlib.Path(model_dir)
|
||||
if train_dir is None or dev_dir is None:
|
||||
imdb_data = thinc.extra.datasets.imdb()
|
||||
imdb_data = ml_datasets.imdb()
|
||||
if is_runtime:
|
||||
if dev_dir is None:
|
||||
dev_texts, dev_labels = zip(*imdb_data[1])
|
||||
|
|
126
examples/experiments/onto-joint/defaults.cfg
Normal file
126
examples/experiments/onto-joint/defaults.cfg
Normal file
|
@ -0,0 +1,126 @@
|
|||
# Training hyper-parameters and additional features.
|
||||
[training]
|
||||
# Whether to train on sequences with 'gold standard' sentence boundaries
|
||||
# and tokens. If you set this to true, take care to ensure your run-time
|
||||
# data is passed in sentence-by-sentence via some prior preprocessing.
|
||||
gold_preproc = false
|
||||
# Limitations on training document length or number of examples.
|
||||
max_length = 0
|
||||
limit = 0
|
||||
# Data augmentation
|
||||
orth_variant_level = 0.0
|
||||
noise_level = 0.0
|
||||
dropout = 0.1
|
||||
# Controls early-stopping. 0 or -1 mean unlimited.
|
||||
patience = 1600
|
||||
max_epochs = 0
|
||||
max_steps = 20000
|
||||
eval_frequency = 400
|
||||
# Other settings
|
||||
seed = 0
|
||||
accumulate_gradient = 1
|
||||
use_pytorch_for_gpu_memory = false
|
||||
# Control how scores are printed and checkpoints are evaluated.
|
||||
scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
|
||||
score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
|
||||
# These settings are invalid for the transformer models.
|
||||
init_tok2vec = null
|
||||
discard_oversize = false
|
||||
omit_extra_lookups = false
|
||||
|
||||
[training.batch_size]
|
||||
@schedules = "compounding.v1"
|
||||
start = 1000
|
||||
stop = 1000
|
||||
compound = 1.001
|
||||
|
||||
[training.optimizer]
|
||||
@optimizers = "Adam.v1"
|
||||
beta1 = 0.9
|
||||
beta2 = 0.999
|
||||
L2_is_weight_decay = true
|
||||
L2 = 0.01
|
||||
grad_clip = 1.0
|
||||
use_averages = true
|
||||
eps = 1e-8
|
||||
learn_rate = 0.001
|
||||
|
||||
#[optimizer.learn_rate]
|
||||
#@schedules = "warmup_linear.v1"
|
||||
#warmup_steps = 250
|
||||
#total_steps = 20000
|
||||
#initial_rate = 0.001
|
||||
|
||||
[nlp]
|
||||
lang = "en"
|
||||
vectors = null
|
||||
|
||||
[nlp.pipeline.tok2vec]
|
||||
factory = "tok2vec"
|
||||
|
||||
[nlp.pipeline.senter]
|
||||
factory = "senter"
|
||||
|
||||
[nlp.pipeline.ner]
|
||||
factory = "ner"
|
||||
learn_tokens = false
|
||||
min_action_freq = 1
|
||||
beam_width = 1
|
||||
beam_update_prob = 1.0
|
||||
|
||||
[nlp.pipeline.tagger]
|
||||
factory = "tagger"
|
||||
|
||||
[nlp.pipeline.parser]
|
||||
factory = "parser"
|
||||
learn_tokens = false
|
||||
min_action_freq = 1
|
||||
beam_width = 1
|
||||
beam_update_prob = 1.0
|
||||
|
||||
[nlp.pipeline.senter.model]
|
||||
@architectures = "spacy.Tagger.v1"
|
||||
|
||||
[nlp.pipeline.senter.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.tagger.model]
|
||||
@architectures = "spacy.Tagger.v1"
|
||||
|
||||
[nlp.pipeline.tagger.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.parser.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 8
|
||||
hidden_width = 128
|
||||
maxout_pieces = 3
|
||||
use_upper = false
|
||||
|
||||
[nlp.pipeline.parser.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.ner.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 3
|
||||
hidden_width = 128
|
||||
maxout_pieces = 3
|
||||
use_upper = false
|
||||
|
||||
[nlp.pipeline.ner.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.tok2vec.model]
|
||||
@architectures = "spacy.HashEmbedCNN.v1"
|
||||
pretrained_vectors = ${nlp:vectors}
|
||||
width = 256
|
||||
depth = 6
|
||||
window_size = 1
|
||||
embed_size = 10000
|
||||
maxout_pieces = 3
|
||||
subword_features = true
|
||||
dropout = null
|
145
examples/experiments/onto-joint/pretrain.cfg
Normal file
145
examples/experiments/onto-joint/pretrain.cfg
Normal file
|
@ -0,0 +1,145 @@
|
|||
# Training hyper-parameters and additional features.
|
||||
[training]
|
||||
# Whether to train on sequences with 'gold standard' sentence boundaries
|
||||
# and tokens. If you set this to true, take care to ensure your run-time
|
||||
# data is passed in sentence-by-sentence via some prior preprocessing.
|
||||
gold_preproc = false
|
||||
# Limitations on training document length or number of examples.
|
||||
max_length = 0
|
||||
limit = 0
|
||||
# Data augmentation
|
||||
orth_variant_level = 0.0
|
||||
noise_level = 0.0
|
||||
dropout = 0.1
|
||||
# Controls early-stopping. 0 or -1 mean unlimited.
|
||||
patience = 1600
|
||||
max_epochs = 0
|
||||
max_steps = 20000
|
||||
eval_frequency = 400
|
||||
# Other settings
|
||||
seed = 0
|
||||
accumulate_gradient = 1
|
||||
use_pytorch_for_gpu_memory = false
|
||||
# Control how scores are printed and checkpoints are evaluated.
|
||||
scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
|
||||
score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
|
||||
# These settings are invalid for the transformer models.
|
||||
init_tok2vec = null
|
||||
discard_oversize = false
|
||||
|
||||
[training.batch_size]
|
||||
@schedules = "compounding.v1"
|
||||
start = 1000
|
||||
stop = 1000
|
||||
compound = 1.001
|
||||
|
||||
[training.optimizer]
|
||||
@optimizers = "Adam.v1"
|
||||
beta1 = 0.9
|
||||
beta2 = 0.999
|
||||
L2_is_weight_decay = true
|
||||
L2 = 0.01
|
||||
grad_clip = 1.0
|
||||
use_averages = true
|
||||
eps = 1e-8
|
||||
learn_rate = 0.001
|
||||
|
||||
[pretraining]
|
||||
max_epochs = 1000
|
||||
min_length = 5
|
||||
max_length = 500
|
||||
dropout = 0.2
|
||||
n_save_every = null
|
||||
batch_size = 3000
|
||||
seed = ${training:seed}
|
||||
use_pytorch_for_gpu_memory = ${training:use_pytorch_for_gpu_memory}
|
||||
tok2vec_model = "nlp.pipeline.tok2vec.model"
|
||||
|
||||
[pretraining.optimizer]
|
||||
@optimizers = "Adam.v1"
|
||||
beta1 = 0.9
|
||||
beta2 = 0.999
|
||||
L2_is_weight_decay = true
|
||||
L2 = 0.01
|
||||
grad_clip = 1.0
|
||||
use_averages = true
|
||||
eps = 1e-8
|
||||
learn_rate = 0.001
|
||||
|
||||
[pretraining.loss_func]
|
||||
@losses = "CosineDistance.v1"
|
||||
normalize = true
|
||||
|
||||
[nlp]
|
||||
lang = "en"
|
||||
vectors = null
|
||||
|
||||
[nlp.pipeline.tok2vec]
|
||||
factory = "tok2vec"
|
||||
|
||||
[nlp.pipeline.senter]
|
||||
factory = "senter"
|
||||
|
||||
[nlp.pipeline.ner]
|
||||
factory = "ner"
|
||||
learn_tokens = false
|
||||
min_action_freq = 1
|
||||
beam_width = 1
|
||||
beam_update_prob = 1.0
|
||||
|
||||
[nlp.pipeline.tagger]
|
||||
factory = "tagger"
|
||||
|
||||
[nlp.pipeline.parser]
|
||||
factory = "parser"
|
||||
learn_tokens = false
|
||||
min_action_freq = 1
|
||||
beam_width = 1
|
||||
beam_update_prob = 1.0
|
||||
|
||||
[nlp.pipeline.senter.model]
|
||||
@architectures = "spacy.Tagger.v1"
|
||||
|
||||
[nlp.pipeline.senter.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.tagger.model]
|
||||
@architectures = "spacy.Tagger.v1"
|
||||
|
||||
[nlp.pipeline.tagger.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.parser.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 8
|
||||
hidden_width = 128
|
||||
maxout_pieces = 3
|
||||
use_upper = false
|
||||
|
||||
[nlp.pipeline.parser.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.ner.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 3
|
||||
hidden_width = 128
|
||||
maxout_pieces = 3
|
||||
use_upper = false
|
||||
|
||||
[nlp.pipeline.ner.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.tok2vec.model]
|
||||
@architectures = "spacy.HashEmbedCNN.v1"
|
||||
pretrained_vectors = ${nlp:vectors}
|
||||
width = 256
|
||||
depth = 6
|
||||
window_size = 1
|
||||
embed_size = 10000
|
||||
maxout_pieces = 3
|
||||
subword_features = true
|
||||
dropout = null
|
74
examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
Normal file
74
examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
Normal file
|
@ -0,0 +1,74 @@
|
|||
[training]
|
||||
patience = 10000
|
||||
eval_frequency = 200
|
||||
dropout = 0.2
|
||||
init_tok2vec = null
|
||||
vectors = null
|
||||
max_epochs = 100
|
||||
orth_variant_level = 0.0
|
||||
noise_level = 0.0
|
||||
gold_preproc = true
|
||||
max_length = 0
|
||||
use_gpu = 0
|
||||
scores = ["tags_acc", "uas", "las"]
|
||||
score_weights = {"las": 0.8, "tags_acc": 0.2}
|
||||
limit = 0
|
||||
seed = 0
|
||||
accumulate_gradient = 2
|
||||
discard_oversize = false
|
||||
|
||||
[training.batch_size]
|
||||
@schedules = "compounding.v1"
|
||||
start = 100
|
||||
stop = 1000
|
||||
compound = 1.001
|
||||
|
||||
[training.optimizer]
|
||||
@optimizers = "Adam.v1"
|
||||
learn_rate = 0.001
|
||||
beta1 = 0.9
|
||||
beta2 = 0.999
|
||||
|
||||
[nlp]
|
||||
lang = "en"
|
||||
vectors = ${training:vectors}
|
||||
|
||||
[nlp.pipeline.tok2vec]
|
||||
factory = "tok2vec"
|
||||
|
||||
[nlp.pipeline.tagger]
|
||||
factory = "tagger"
|
||||
|
||||
[nlp.pipeline.parser]
|
||||
factory = "parser"
|
||||
learn_tokens = false
|
||||
min_action_freq = 1
|
||||
beam_width = 1
|
||||
beam_update_prob = 1.0
|
||||
|
||||
[nlp.pipeline.tagger.model]
|
||||
@architectures = "spacy.Tagger.v1"
|
||||
|
||||
[nlp.pipeline.tagger.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.parser.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 8
|
||||
hidden_width = 64
|
||||
maxout_pieces = 3
|
||||
|
||||
[nlp.pipeline.parser.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.tok2vec.model]
|
||||
@architectures = "spacy.HashEmbedBiLSTM.v1"
|
||||
pretrained_vectors = ${nlp:vectors}
|
||||
width = 96
|
||||
depth = 4
|
||||
embed_size = 2000
|
||||
subword_features = true
|
||||
maxout_pieces = 3
|
||||
dropout = null
|
75
examples/experiments/ptb-joint-pos-dep/defaults.cfg
Normal file
75
examples/experiments/ptb-joint-pos-dep/defaults.cfg
Normal file
|
@ -0,0 +1,75 @@
|
|||
[training]
|
||||
patience = 10000
|
||||
eval_frequency = 200
|
||||
dropout = 0.2
|
||||
init_tok2vec = null
|
||||
vectors = null
|
||||
max_epochs = 100
|
||||
orth_variant_level = 0.0
|
||||
noise_level = 0.0
|
||||
gold_preproc = true
|
||||
max_length = 0
|
||||
use_gpu = -1
|
||||
scores = ["tags_acc", "uas", "las"]
|
||||
score_weights = {"las": 0.8, "tags_acc": 0.2}
|
||||
limit = 0
|
||||
seed = 0
|
||||
accumulate_gradient = 2
|
||||
discard_oversize = false
|
||||
|
||||
[training.batch_size]
|
||||
@schedules = "compounding.v1"
|
||||
start = 100
|
||||
stop = 1000
|
||||
compound = 1.001
|
||||
|
||||
[training.optimizer]
|
||||
@optimizers = "Adam.v1"
|
||||
learn_rate = 0.001
|
||||
beta1 = 0.9
|
||||
beta2 = 0.999
|
||||
|
||||
[nlp]
|
||||
lang = "en"
|
||||
vectors = ${training:vectors}
|
||||
|
||||
[nlp.pipeline.tok2vec]
|
||||
factory = "tok2vec"
|
||||
|
||||
[nlp.pipeline.tagger]
|
||||
factory = "tagger"
|
||||
|
||||
[nlp.pipeline.parser]
|
||||
factory = "parser"
|
||||
learn_tokens = false
|
||||
min_action_freq = 1
|
||||
beam_width = 1
|
||||
beam_update_prob = 1.0
|
||||
|
||||
[nlp.pipeline.tagger.model]
|
||||
@architectures = "spacy.Tagger.v1"
|
||||
|
||||
[nlp.pipeline.tagger.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.parser.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 8
|
||||
hidden_width = 64
|
||||
maxout_pieces = 3
|
||||
|
||||
[nlp.pipeline.parser.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.tok2vec.model]
|
||||
@architectures = "spacy.HashEmbedCNN.v1"
|
||||
pretrained_vectors = ${nlp:vectors}
|
||||
width = 96
|
||||
depth = 4
|
||||
window_size = 1
|
||||
embed_size = 2000
|
||||
maxout_pieces = 3
|
||||
subword_features = true
|
||||
dropout = null
|
69
examples/experiments/tok2vec-ner/charembed_tok2vec.cfg
Normal file
69
examples/experiments/tok2vec-ner/charembed_tok2vec.cfg
Normal file
|
@ -0,0 +1,69 @@
|
|||
[training]
|
||||
use_gpu = -1
|
||||
limit = 0
|
||||
dropout = 0.2
|
||||
patience = 10000
|
||||
eval_frequency = 200
|
||||
scores = ["ents_f"]
|
||||
score_weights = {"ents_f": 1}
|
||||
orth_variant_level = 0.0
|
||||
gold_preproc = true
|
||||
max_length = 0
|
||||
batch_size = 25
|
||||
seed = 0
|
||||
accumulate_gradient = 2
|
||||
discard_oversize = false
|
||||
|
||||
[training.optimizer]
|
||||
@optimizers = "Adam.v1"
|
||||
learn_rate = 0.001
|
||||
beta1 = 0.9
|
||||
beta2 = 0.999
|
||||
|
||||
[nlp]
|
||||
lang = "en"
|
||||
vectors = null
|
||||
|
||||
[nlp.pipeline.tok2vec]
|
||||
factory = "tok2vec"
|
||||
|
||||
[nlp.pipeline.tok2vec.model]
|
||||
@architectures = "spacy.Tok2Vec.v1"
|
||||
|
||||
[nlp.pipeline.tok2vec.model.extract]
|
||||
@architectures = "spacy.CharacterEmbed.v1"
|
||||
width = 96
|
||||
nM = 64
|
||||
nC = 8
|
||||
rows = 2000
|
||||
columns = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
|
||||
dropout = null
|
||||
|
||||
[nlp.pipeline.tok2vec.model.extract.features]
|
||||
@architectures = "spacy.Doc2Feats.v1"
|
||||
columns = ${nlp.pipeline.tok2vec.model.extract:columns}
|
||||
|
||||
[nlp.pipeline.tok2vec.model.embed]
|
||||
@architectures = "spacy.LayerNormalizedMaxout.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model.extract:width}
|
||||
maxout_pieces = 4
|
||||
|
||||
[nlp.pipeline.tok2vec.model.encode]
|
||||
@architectures = "spacy.MaxoutWindowEncoder.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model.extract:width}
|
||||
window_size = 1
|
||||
maxout_pieces = 2
|
||||
depth = 2
|
||||
|
||||
[nlp.pipeline.ner]
|
||||
factory = "ner"
|
||||
|
||||
[nlp.pipeline.ner.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 6
|
||||
hidden_width = 64
|
||||
maxout_pieces = 2
|
||||
|
||||
[nlp.pipeline.ner.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model.extract:width}
|
48
examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg
Normal file
48
examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg
Normal file
|
@ -0,0 +1,48 @@
|
|||
[training]
|
||||
use_gpu = -1
|
||||
limit = 0
|
||||
dropout = 0.2
|
||||
patience = 10000
|
||||
eval_frequency = 200
|
||||
scores = ["ents_p", "ents_r", "ents_f"]
|
||||
score_weights = {"ents_f": 1}
|
||||
orth_variant_level = 0.0
|
||||
gold_preproc = true
|
||||
max_length = 0
|
||||
seed = 0
|
||||
accumulate_gradient = 2
|
||||
discard_oversize = false
|
||||
|
||||
[training.batch_size]
|
||||
@schedules = "compounding.v1"
|
||||
start = 3000
|
||||
stop = 3000
|
||||
compound = 1.001
|
||||
|
||||
|
||||
[training.optimizer]
|
||||
@optimizers = "Adam.v1"
|
||||
learn_rate = 0.001
|
||||
beta1 = 0.9
|
||||
beta2 = 0.999
|
||||
|
||||
[nlp]
|
||||
lang = "en"
|
||||
vectors = null
|
||||
|
||||
[nlp.pipeline.ner]
|
||||
factory = "simple_ner"
|
||||
|
||||
[nlp.pipeline.ner.model]
|
||||
@architectures = "spacy.BiluoTagger.v1"
|
||||
|
||||
[nlp.pipeline.ner.model.tok2vec]
|
||||
@architectures = "spacy.HashEmbedCNN.v1"
|
||||
width = 128
|
||||
depth = 4
|
||||
embed_size = 7000
|
||||
maxout_pieces = 3
|
||||
window_size = 1
|
||||
subword_features = true
|
||||
pretrained_vectors = null
|
||||
dropout = null
|
|
@ -13,9 +13,10 @@ Prerequisites: pip install joblib
|
|||
from __future__ import print_function, unicode_literals
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import ml_datasets
|
||||
from joblib import Parallel, delayed
|
||||
from functools import partial
|
||||
import thinc.extra.datasets
|
||||
import plac
|
||||
import spacy
|
||||
from spacy.util import minibatch
|
||||
|
@ -35,7 +36,7 @@ def main(output_dir, model="en_core_web_sm", n_jobs=4, batch_size=1000, limit=10
|
|||
output_dir.mkdir()
|
||||
# load and pre-process the IMDB dataset
|
||||
print("Loading IMDB data...")
|
||||
data, _ = thinc.extra.datasets.imdb()
|
||||
data, _ = ml_datasets.imdb()
|
||||
texts, _ = zip(*data[-limit:])
|
||||
print("Processing texts...")
|
||||
partitions = minibatch(texts, size=batch_size)
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
# coding: utf-8
|
||||
"""
|
||||
Example of a Streamlit app for an interactive spaCy model visualizer. You can
|
||||
either download the script, or point streamlit run to the raw URL of this
|
||||
either download the script, or point `streamlit run` to the raw URL of this
|
||||
file. For more details, see https://streamlit.io.
|
||||
|
||||
Installation:
|
||||
|
@ -15,6 +15,8 @@ streamlit run streamlit_spacy.py
|
|||
"""
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import base64
|
||||
|
||||
import streamlit as st
|
||||
import spacy
|
||||
from spacy import displacy
|
||||
|
@ -54,6 +56,14 @@ model_load_state.empty()
|
|||
text = st.text_area("Text to analyze", DEFAULT_TEXT)
|
||||
doc = process_text(spacy_model, text)
|
||||
|
||||
|
||||
def render_svg(svg):
    """Show an SVG string in the Streamlit app as an embedded image.

    The markup is UTF-8 encoded, base64-encoded and placed in a data URI
    inside an <img> tag, which is written with unsafe_allow_html=True.

    svg (str): the SVG markup to display.
    """
    encoded = base64.b64encode(svg.encode("utf-8")).decode("utf-8")
    img_tag = r'<img src="data:image/svg+xml;base64,%s"/>' % encoded
    st.write(img_tag, unsafe_allow_html=True)
|
||||
|
||||
|
||||
if "parser" in nlp.pipe_names:
|
||||
st.header("Dependency Parse & Part-of-speech tags")
|
||||
st.sidebar.header("Dependency Parse")
|
||||
|
@ -68,12 +78,14 @@ if "parser" in nlp.pipe_names:
|
|||
}
|
||||
docs = [span.as_doc() for span in doc.sents] if split_sents else [doc]
|
||||
for sent in docs:
|
||||
html = displacy.render(sent, options=options)
|
||||
html = displacy.render(sent, options=options, style="dep")
|
||||
# Double newlines seem to mess with the rendering
|
||||
html = html.replace("\n\n", "\n")
|
||||
if split_sents and len(docs) > 1:
|
||||
st.markdown(f"> {sent.text}")
|
||||
st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
|
||||
render_svg(html)
|
||||
# this didn't show the dep arc labels properly, cf #5089
|
||||
# st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
|
||||
|
||||
if "ner" in nlp.pipe_names:
|
||||
st.header("Named Entities")
|
||||
|
|
|
@ -12,7 +12,7 @@ import tqdm
|
|||
import spacy
|
||||
import spacy.util
|
||||
from spacy.tokens import Token, Doc
|
||||
from spacy.gold import GoldParse
|
||||
from spacy.gold import GoldParse, Example
|
||||
from spacy.syntax.nonproj import projectivize
|
||||
from collections import defaultdict
|
||||
from spacy.matcher import Matcher
|
||||
|
@ -33,25 +33,25 @@ random.seed(0)
|
|||
numpy.random.seed(0)
|
||||
|
||||
|
||||
def minibatch_by_words(items, size=5000):
|
||||
random.shuffle(items)
|
||||
def minibatch_by_words(examples, size=5000):
|
||||
random.shuffle(examples)
|
||||
if isinstance(size, int):
|
||||
size_ = itertools.repeat(size)
|
||||
else:
|
||||
size_ = size
|
||||
items = iter(items)
|
||||
examples = iter(examples)
|
||||
while True:
|
||||
batch_size = next(size_)
|
||||
batch = []
|
||||
while batch_size >= 0:
|
||||
try:
|
||||
doc, gold = next(items)
|
||||
example = next(examples)
|
||||
except StopIteration:
|
||||
if batch:
|
||||
yield batch
|
||||
return
|
||||
batch_size -= len(doc)
|
||||
batch.append((doc, gold))
|
||||
batch_size -= len(example.doc)
|
||||
batch.append(example)
|
||||
if batch:
|
||||
yield batch
|
||||
else:
|
||||
|
@ -78,7 +78,7 @@ def read_data(
|
|||
max_doc_length=None,
|
||||
limit=None,
|
||||
):
|
||||
"""Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
|
||||
"""Read the CONLLU format into Example objects. If raw_text=True,
|
||||
include Doc objects created using nlp.make_doc and then aligned against
|
||||
the gold-standard sequences. If oracle_segments=True, include Doc objects
|
||||
created from the gold-standard segments. At least one must be True."""
|
||||
|
@ -119,15 +119,15 @@ def read_data(
|
|||
docs.append(doc)
|
||||
golds.append(gold)
|
||||
if limit and len(docs) >= limit:
|
||||
return docs, golds
|
||||
return golds_to_gold_data(docs, golds)
|
||||
|
||||
if raw_text and sent_annots:
|
||||
doc, gold = _make_gold(nlp, None, sent_annots)
|
||||
docs.append(doc)
|
||||
golds.append(gold)
|
||||
if limit and len(docs) >= limit:
|
||||
return docs, golds
|
||||
return docs, golds
|
||||
return golds_to_gold_data(docs, golds)
|
||||
return golds_to_gold_data(docs, golds)
|
||||
|
||||
|
||||
def read_conllu(file_):
|
||||
|
@ -181,16 +181,18 @@ def _make_gold(nlp, text, sent_annots):
|
|||
#############################
|
||||
|
||||
|
||||
def golds_to_gold_tuples(docs, golds):
|
||||
"""Get out the annoying 'tuples' format used by begin_training, given the
|
||||
def golds_to_gold_data(docs, golds):
|
||||
"""Get out the training data format used by begin_training, given the
|
||||
GoldParse objects."""
|
||||
tuples = []
|
||||
data = []
|
||||
for doc, gold in zip(docs, golds):
|
||||
text = doc.text
|
||||
ids, words, tags, heads, labels, iob = zip(*gold.orig_annot)
|
||||
sents = [((ids, words, tags, heads, labels, iob), [])]
|
||||
tuples.append((text, sents))
|
||||
return tuples
|
||||
example = Example(doc=doc)
|
||||
example.add_doc_annotation(cats=gold.cats)
|
||||
token_annotation_dict = gold.orig.to_dict()
|
||||
example.add_token_annotation(**token_annotation_dict)
|
||||
example.goldparse = gold
|
||||
data.append(example)
|
||||
return data
|
||||
|
||||
|
||||
##############
|
||||
|
@ -303,7 +305,7 @@ def load_nlp(corpus, config):
|
|||
return nlp
|
||||
|
||||
|
||||
def initialize_pipeline(nlp, docs, golds, config):
|
||||
def initialize_pipeline(nlp, examples, config):
|
||||
nlp.add_pipe(nlp.create_pipe("parser"))
|
||||
if config.multitask_tag:
|
||||
nlp.parser.add_multitask_objective("tag")
|
||||
|
@ -311,18 +313,19 @@ def initialize_pipeline(nlp, docs, golds, config):
|
|||
nlp.parser.add_multitask_objective("sent_start")
|
||||
nlp.parser.moves.add_action(2, "subtok")
|
||||
nlp.add_pipe(nlp.create_pipe("tagger"))
|
||||
for gold in golds:
|
||||
for tag in gold.tags:
|
||||
for ex in examples:
|
||||
for tag in ex.gold.tags:
|
||||
if tag is not None:
|
||||
nlp.tagger.add_label(tag)
|
||||
# Replace labels that didn't make the frequency cutoff
|
||||
actions = set(nlp.parser.labels)
|
||||
label_set = set([act.split("-")[1] for act in actions if "-" in act])
|
||||
for gold in golds:
|
||||
for ex in examples:
|
||||
gold = ex.gold
|
||||
for i, label in enumerate(gold.labels):
|
||||
if label is not None and label not in label_set:
|
||||
gold.labels[i] = label.split("||")[0]
|
||||
return nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds))
|
||||
return nlp.begin_training(lambda: examples)
|
||||
|
||||
|
||||
########################
|
||||
|
@ -391,13 +394,17 @@ def main(ud_dir, parses_dir, config, corpus, limit=0):
|
|||
Token.set_extension("begins_fused", default=False)
|
||||
Token.set_extension("inside_fused", default=False)
|
||||
|
||||
Token.set_extension("get_conllu_lines", method=get_token_conllu)
|
||||
Token.set_extension("begins_fused", default=False)
|
||||
Token.set_extension("inside_fused", default=False)
|
||||
|
||||
paths = TreebankPaths(ud_dir, corpus)
|
||||
if not (parses_dir / corpus).exists():
|
||||
(parses_dir / corpus).mkdir()
|
||||
print("Train and evaluate", corpus, "using lang", paths.lang)
|
||||
nlp = load_nlp(paths.lang, config)
|
||||
|
||||
docs, golds = read_data(
|
||||
examples = read_data(
|
||||
nlp,
|
||||
paths.train.conllu.open(encoding="utf8"),
|
||||
paths.train.text.open(encoding="utf8"),
|
||||
|
@ -405,23 +412,18 @@ def main(ud_dir, parses_dir, config, corpus, limit=0):
|
|||
limit=limit,
|
||||
)
|
||||
|
||||
optimizer = initialize_pipeline(nlp, docs, golds, config)
|
||||
optimizer = initialize_pipeline(nlp, examples, config)
|
||||
|
||||
for i in range(config.nr_epoch):
|
||||
docs = [nlp.make_doc(doc.text) for doc in docs]
|
||||
batches = minibatch_by_words(list(zip(docs, golds)), size=config.batch_size)
|
||||
docs = [nlp.make_doc(example.doc.text) for example in examples]
|
||||
batches = minibatch_by_words(examples, size=config.batch_size)
|
||||
losses = {}
|
||||
n_train_words = sum(len(doc) for doc in docs)
|
||||
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
|
||||
for batch in batches:
|
||||
batch_docs, batch_gold = zip(*batch)
|
||||
pbar.update(sum(len(doc) for doc in batch_docs))
|
||||
pbar.update(sum(len(ex.doc) for ex in batch))
|
||||
nlp.update(
|
||||
batch_docs,
|
||||
batch_gold,
|
||||
sgd=optimizer,
|
||||
drop=config.dropout,
|
||||
losses=losses,
|
||||
examples=batch, sgd=optimizer, drop=config.dropout, losses=losses,
|
||||
)
|
||||
|
||||
out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i)
|
||||
|
|
|
@ -31,14 +31,13 @@ random.seed(0)
|
|||
|
||||
PWD = os.path.dirname(__file__)
|
||||
|
||||
TRAIN_DATA = list(read_json_file(
|
||||
os.path.join(PWD, "ner_example_data", "ner-sent-per-line.json")))
|
||||
TRAIN_DATA = list(read_json_file(os.path.join(PWD, "training-data.json")))
|
||||
|
||||
|
||||
def get_position_label(i, words, tags, heads, labels, ents):
|
||||
def get_position_label(i, token_annotation):
|
||||
"""Return labels indicating the position of the word in the document.
|
||||
"""
|
||||
if len(words) < 20:
|
||||
if len(token_annotation.words) < 20:
|
||||
return "short-doc"
|
||||
elif i == 0:
|
||||
return "first-word"
|
||||
|
@ -46,7 +45,7 @@ def get_position_label(i, words, tags, heads, labels, ents):
|
|||
return "early-word"
|
||||
elif i < 20:
|
||||
return "mid-word"
|
||||
elif i == len(words) - 1:
|
||||
elif i == len(token_annotation.words) - 1:
|
||||
return "last-word"
|
||||
else:
|
||||
return "late-word"
|
||||
|
@ -60,17 +59,17 @@ def main(n_iter=10):
|
|||
print(nlp.pipeline)
|
||||
|
||||
print("Create data", len(TRAIN_DATA))
|
||||
optimizer = nlp.begin_training(get_gold_tuples=lambda: TRAIN_DATA)
|
||||
optimizer = nlp.begin_training(get_examples=lambda: TRAIN_DATA)
|
||||
for itn in range(n_iter):
|
||||
random.shuffle(TRAIN_DATA)
|
||||
losses = {}
|
||||
for text, annot_brackets in TRAIN_DATA:
|
||||
for annotations, _ in annot_brackets:
|
||||
doc = Doc(nlp.vocab, words=annotations[1])
|
||||
gold = GoldParse.from_annot_tuples(doc, annotations)
|
||||
for example in TRAIN_DATA:
|
||||
for token_annotation in example.token_annotations:
|
||||
doc = Doc(nlp.vocab, words=token_annotation.words)
|
||||
gold = GoldParse.from_annotation(doc, example.doc_annotation, token_annotation)
|
||||
|
||||
nlp.update(
|
||||
[doc], # batch of texts
|
||||
[gold], # batch of annotations
|
||||
examples=[(doc, gold)], # 1 example
|
||||
drop=0.2, # dropout - make it harder to memorise data
|
||||
sgd=optimizer, # callable to update weights
|
||||
losses=losses,
|
||||
|
@ -78,9 +77,9 @@ def main(n_iter=10):
|
|||
print(losses.get("nn_labeller", 0.0), losses["ner"])
|
||||
|
||||
# test the trained model
|
||||
for text, _ in TRAIN_DATA:
|
||||
if text is not None:
|
||||
doc = nlp(text)
|
||||
for example in TRAIN_DATA:
|
||||
if example.text is not None:
|
||||
doc = nlp(example.text)
|
||||
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
|
||||
print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
|
||||
|
||||
|
|
|
@ -1,217 +0,0 @@
|
|||
"""This script is experimental.
|
||||
|
||||
Try pre-training the CNN component of the text categorizer using a cheap
|
||||
language modelling-like objective. Specifically, we load pretrained vectors
|
||||
(from something like word2vec, GloVe, FastText etc), and use the CNN to
|
||||
predict the tokens' pretrained vectors. This isn't as easy as it sounds:
|
||||
we're not merely doing compression here, because heavy dropout is applied,
|
||||
including over the input words. This means the model must often (50% of the time)
|
||||
use the context in order to predict the word.
|
||||
|
||||
To evaluate the technique, we're pre-training with the 50k texts from the IMDB
|
||||
corpus, and then training with only 100 labels. Note that it's a bit dirty to
|
||||
pre-train with the development data, but also not *so* terrible: we're not using
|
||||
the development labels, after all --- only the unlabelled text.
|
||||
"""
|
||||
import plac
|
||||
import tqdm
|
||||
import random
|
||||
import spacy
|
||||
import thinc.extra.datasets
|
||||
from spacy.util import minibatch, use_gpu, compounding
|
||||
from spacy._ml import Tok2Vec
|
||||
from spacy.pipeline import TextCategorizer
|
||||
import numpy
|
||||
|
||||
|
||||
def load_texts(limit=0):
    """Load raw IMDB review texts (labels discarded) for pre-training.

    limit (int): if >= 1, return only the first `limit` shuffled training
        texts; otherwise return every training text followed by every
        development text.
    RETURNS (list): the selected texts.
    """
    train, dev = thinc.extra.datasets.imdb()
    train_texts, _ = zip(*train)
    # Unzip the development split itself. Unzipping `train` a second time
    # here would make the "dev" texts a duplicate of the training texts and
    # leave the real development texts unused.
    dev_texts, _ = zip(*dev)
    train_texts = list(train_texts)
    dev_texts = list(dev_texts)
    random.shuffle(train_texts)
    random.shuffle(dev_texts)
    if limit >= 1:
        return train_texts[:limit]
    return train_texts + dev_texts
|
||||
|
||||
|
||||
def load_textcat_data(limit=0):
    """Load labelled IMDB data for training and evaluating the textcat.

    The training split is shuffled in place and its last `limit` items are
    kept (with limit=0 the slice [-0:] keeps the whole list). The evaluation
    data is the second split returned by the dataset loader, used as-is.

    limit (int): number of trailing training items to keep after shuffling.
    RETURNS (tuple): ((texts, cats), (eval_texts, eval_cats)), where each
        cats entry is a dict with boolean "POSITIVE" and "NEGATIVE" values.
    """
    train_data, eval_data = thinc.extra.datasets.imdb()
    random.shuffle(train_data)
    texts, labels = zip(*train_data[-limit:])
    eval_texts, eval_labels = zip(*eval_data)

    def to_cats(label):
        # Expand one label into the two mutually exclusive categories.
        positive = bool(label)
        return {"POSITIVE": positive, "NEGATIVE": not positive}

    train_cats = [to_cats(label) for label in labels]
    dev_cats = [to_cats(label) for label in eval_labels]
    return (texts, train_cats), (eval_texts, dev_cats)
|
||||
|
||||
|
||||
def prefer_gpu():
    """Try to activate GPU 0 via spaCy and report whether that succeeded.

    When spacy.util.use_gpu(0) returns something other than None, cupy's
    random generator is seeded with 0.

    RETURNS (bool): False if use_gpu returned None, True otherwise.
    """
    if spacy.util.use_gpu(0) is None:
        return False
    # Imported lazily: only reached when use_gpu() did not return None.
    import cupy.random

    cupy.random.seed(0)
    return True
|
||||
|
||||
|
||||
def build_textcat_model(tok2vec, nr_class, width):
    """Chain a tok2vec layer, mean pooling and a softmax into one model.

    tok2vec: the token-to-vector layer placed first in the chain.
    nr_class (int): first argument given to the Softmax layer.
    width (int): second argument given to the Softmax layer.
    RETURNS: the chained model, with the tok2vec layer also exposed as
        `model.tok2vec`.
    """
    from thinc.v2v import Model, Softmax, Maxout
    from thinc.api import flatten_add_lengths, chain
    from thinc.t2v import Pooling, sum_pool, mean_pool, max_pool
    from thinc.misc import Residual, LayerNorm
    from spacy._ml import logistic, zero_init

    # Inside this context manager ">>" composes layers with `chain`.
    with Model.define_operators({">>": chain}):
        pooled = tok2vec >> flatten_add_lengths >> Pooling(mean_pool)
        classifier = pooled >> Softmax(nr_class, width)
    classifier.tok2vec = tok2vec
    return classifier
|
||||
|
||||
|
||||
def block_gradients(model):
    """Wrap `model` so its forward pass runs but no backprop callback is returned.

    model: a layer exposing begin_update(X, drop=...).
    RETURNS: the result of thinc's wrap() around the gradient-free forward
        function and `model`.
    """
    from thinc.api import wrap

    def forward(X, drop=0.0):
        # Keep the output; replace the backward callback with None.
        output, _backprop = model.begin_update(X, drop=drop)
        return output, None

    wrapped = wrap(forward, model)
    return wrapped
|
||||
|
||||
|
||||
def create_pipeline(width, embed_size, vectors_model):
    """Load a vectors model and append a two-label text categorizer to it.

    width (int): width given to Tok2Vec and to the textcat model builder.
    embed_size (int): embed_size given to Tok2Vec.
    vectors_model (str): name or path passed to spacy.load.
    RETURNS: the loaded pipeline with the TextCategorizer added.
    """
    print("Load vectors")
    nlp = spacy.load(vectors_model)
    print("Start training")
    tok2vec = Tok2Vec(width=width, embed_size=embed_size)
    classifier = build_textcat_model(tok2vec, 2, width)
    textcat = TextCategorizer(
        nlp.vocab, labels=["POSITIVE", "NEGATIVE"], model=classifier
    )
    nlp.add_pipe(textcat)
    return nlp
|
||||
|
||||
|
||||
def train_tensorizer(nlp, texts, dropout, n_iter):
    """Add a "tensorizer" pipe to `nlp` and update it on raw texts.

    nlp: the pipeline to extend; the tensorizer is created and added here.
    texts (iterable): raw texts; each batch is turned into Docs with
        nlp.make_doc before the update.
    dropout (float): passed as `drop` to tensorizer.update.
    n_iter (int): number of passes over `texts`.
    RETURNS: the optimizer returned by nlp.begin_training().
    """
    tensorizer = nlp.create_pipe("tensorizer")
    nlp.add_pipe(tensorizer)
    optimizer = nlp.begin_training()
    for _ in range(n_iter):
        losses = {}
        # Iterate batches directly: the batch index was never read, and
        # binding it to the same name as the epoch counter shadowed it.
        for batch in minibatch(tqdm.tqdm(texts)):
            docs = [nlp.make_doc(text) for text in batch]
            tensorizer.update(docs, None, losses=losses, sgd=optimizer, drop=dropout)
        print(losses)
    return optimizer
|
||||
|
||||
|
||||
def train_textcat(nlp, n_texts, n_iter=10):
    """Train the pipeline's "textcat" component and print per-epoch scores.

    nlp: pipeline that already contains a "textcat" component.
    n_texts (int): passed to load_textcat_data as its `limit`.
    n_iter (int): number of training epochs.
    """
    textcat = nlp.get_pipe("textcat")
    # Snapshot the tok2vec weights so they can be loaded back after
    # nlp.begin_training() has been called below.
    saved_tok2vec = textcat.model.tok2vec.to_bytes()
    train_split, dev_split = load_textcat_data(limit=n_texts)
    train_texts, train_cats = train_split
    dev_texts, dev_cats = dev_split
    summary = "Using {} examples ({} training, {} evaluation)".format(
        n_texts, len(train_texts), len(dev_texts)
    )
    print(summary)
    gold_annotations = [{"cats": cats} for cats in train_cats]
    train_data = list(zip(train_texts, gold_annotations))

    # Every pipe not listed here is disabled while the textcat is trained.
    keep_enabled = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
    disabled = [name for name in nlp.pipe_names if name not in keep_enabled]
    with nlp.disable_pipes(*disabled):
        optimizer = nlp.begin_training()
        textcat.model.tok2vec.from_bytes(saved_tok2vec)
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        for _ in range(n_iter):
            losses = {"textcat": 0.0}
            # Mini-batches of two examples, with a progress bar over the data.
            for batch in minibatch(tqdm.tqdm(train_data), size=2):
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            # Score on the evaluation split returned by load_textcat_data(),
            # using the optimizer's averaged parameters.
            with textcat.model.use_params(optimizer.averages):
                scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats)
            # One table row per epoch: loss, precision, recall, F-score.
            row = "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(
                losses["textcat"],
                scores["textcat_p"],
                scores["textcat_r"],
                scores["textcat_f"],
            )
            print(row)
|
||||
|
||||
|
||||
def evaluate_textcat(tokenizer, textcat, texts, cats):
    """Compute precision, recall and F-score of `textcat` at a 0.5 threshold.

    tokenizer (callable): turns each text into a doc.
    textcat: object whose pipe(docs) yields docs carrying a `cats` dict of
        label -> score.
    texts (iterable): raw texts to evaluate on.
    cats (sequence): gold label -> value dicts, indexed in step with `texts`;
        predicted labels missing from the gold dict are ignored.
    RETURNS (dict): "textcat_p", "textcat_r" and "textcat_f".
    """
    # Counters start at a tiny epsilon so the ratios never divide by zero.
    true_pos = false_pos = true_neg = false_neg = 1e-8
    tokenized = (tokenizer(text) for text in texts)
    for index, doc in enumerate(textcat.pipe(tokenized)):
        expected = cats[index]
        for label, score in doc.cats.items():
            if label not in expected:
                continue
            truth = expected[label]
            if score >= 0.5:
                if truth >= 0.5:
                    true_pos += 1.0
                elif truth < 0.5:
                    false_pos += 1.0
            elif score < 0.5:
                if truth < 0.5:
                    true_neg += 1
                elif truth >= 0.5:
                    false_neg += 1
    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}
|
||||
|
||||
|
||||
# Command-line entry point: seed the RNGs, build the pipeline, train the
# tensorizer on raw texts, then train the text categorizer on labelled data.
@plac.annotations(
    width=("Width of CNN layers", "positional", None, int),
    embed_size=("Embedding rows", "positional", None, int),
    pretrain_iters=("Number of iterations to pretrain", "option", "pn", int),
    train_iters=("Number of iterations to train", "option", "tn", int),
    train_examples=("Number of labelled examples", "option", "eg", int),
    vectors_model=("Name or path to vectors model to learn from"),
)
def main(
    width,
    embed_size,
    vectors_model,
    pretrain_iters=30,
    train_iters=30,
    train_examples=1000,
):
    # Fixed seeds for both Python's and numpy's random generators.
    random.seed(0)
    numpy.random.seed(0)
    print("Using GPU?", prefer_gpu())

    nlp = create_pipeline(width, embed_size, vectors_model)
    print("Load data")
    raw_texts = load_texts(limit=0)
    print("Train tensorizer")
    # The optimizer returned here is not used by the textcat training step.
    train_tensorizer(nlp, raw_texts, dropout=0.2, n_iter=pretrain_iters)
    print("Train textcat")
    train_textcat(nlp, train_examples, n_iter=train_iters)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
plac.call(main)
|
|
@ -59,17 +59,14 @@ def main(model_name, unlabelled_loc):
|
|||
# yet, but I'm getting weird results from Adam. Try commenting out the
|
||||
# nlp.update(), and using Adam -- you'll find the models drift apart.
|
||||
# I guess Adam is losing precision, introducing gradient noise?
|
||||
optimizer.alpha = 0.1
|
||||
optimizer.learn_rate = 0.1
|
||||
optimizer.b1 = 0.0
|
||||
optimizer.b2 = 0.0
|
||||
|
||||
# get names of other pipes to disable them during training
|
||||
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
|
||||
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
|
||||
sizes = compounding(1.0, 4.0, 1.001)
|
||||
with nlp.disable_pipes(*other_pipes) and warnings.catch_warnings():
|
||||
with nlp.select_pipes(enable="ner") and warnings.catch_warnings():
|
||||
# show warnings for misaligned entity spans once
|
||||
warnings.filterwarnings("once", category=UserWarning, module='spacy')
|
||||
warnings.filterwarnings("once", category=UserWarning, module="spacy")
|
||||
|
||||
for itn in range(n_iter):
|
||||
random.shuffle(TRAIN_DATA)
|
||||
|
@ -79,8 +76,7 @@ def main(model_name, unlabelled_loc):
|
|||
# batch up the examples using spaCy's minibatch
|
||||
raw_batches = minibatch(raw_docs, size=4)
|
||||
for batch in minibatch(TRAIN_DATA, size=sizes):
|
||||
docs, golds = zip(*batch)
|
||||
nlp.update(docs, golds, sgd=optimizer, drop=dropout, losses=losses)
|
||||
nlp.update(batch, sgd=optimizer, drop=dropout, losses=losses)
|
||||
raw_batch = list(next(raw_batches))
|
||||
nlp.rehearse(raw_batch, sgd=optimizer, losses=r_losses)
|
||||
print("Losses", losses)
|
||||
|
|
|
@ -5,16 +5,17 @@ from spacy.gold import docs_to_json
|
|||
import srsly
|
||||
import sys
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
model=("Model name. Defaults to 'en'.", "option", "m", str),
|
||||
input_file=("Input file (jsonl)", "positional", None, Path),
|
||||
output_dir=("Output directory", "positional", None, Path),
|
||||
n_texts=("Number of texts to convert", "option", "t", int),
|
||||
)
|
||||
def convert(model='en', input_file=None, output_dir=None, n_texts=0):
|
||||
def convert(model="en", input_file=None, output_dir=None, n_texts=0):
|
||||
# Load model with tokenizer + sentencizer only
|
||||
nlp = spacy.load(model)
|
||||
nlp.disable_pipes(*nlp.pipe_names)
|
||||
nlp.select_pipes(disable=nlp.pipe_names)
|
||||
sentencizer = nlp.create_pipe("sentencizer")
|
||||
nlp.add_pipe(sentencizer, first=True)
|
||||
|
||||
|
@ -49,5 +50,6 @@ def convert(model='en', input_file=None, output_dir=None, n_texts=0):
|
|||
|
||||
srsly.write_json(output_dir / input_file.with_suffix(".json"), [docs_to_json(docs)])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
plac.call(convert)
|
||||
|
|
|
@ -18,7 +18,6 @@ import random
|
|||
from pathlib import Path
|
||||
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
import spacy
|
||||
from spacy.kb import KnowledgeBase
|
||||
from spacy.pipeline import EntityRuler
|
||||
|
@ -66,36 +65,38 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
|
|||
vocab = Vocab().from_disk(vocab_path)
|
||||
# create blank English model with correct vocab
|
||||
nlp = spacy.blank("en", vocab=vocab)
|
||||
nlp.vocab.vectors.name = "spacy_pretrained_vectors"
|
||||
nlp.vocab.vectors.name = "nel_vectors"
|
||||
print("Created blank 'en' model with vocab from '%s'" % vocab_path)
|
||||
|
||||
# Add a sentencizer component. Alternatively, add a dependency parser for higher accuracy.
|
||||
nlp.add_pipe(nlp.create_pipe('sentencizer'))
|
||||
nlp.add_pipe(nlp.create_pipe("sentencizer"))
|
||||
|
||||
# Add a custom component to recognize "Russ Cochran" as an entity for the example training data.
|
||||
# Note that in a realistic application, an actual NER algorithm should be used instead.
|
||||
ruler = EntityRuler(nlp)
|
||||
patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}]
|
||||
patterns = [
|
||||
{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}
|
||||
]
|
||||
ruler.add_patterns(patterns)
|
||||
nlp.add_pipe(ruler)
|
||||
|
||||
# Create the Entity Linker component and add it to the pipeline.
|
||||
if "entity_linker" not in nlp.pipe_names:
|
||||
# use only the predicted EL score and not the prior probability (for demo purposes)
|
||||
cfg = {"incl_prior": False}
|
||||
entity_linker = nlp.create_pipe("entity_linker", cfg)
|
||||
kb = KnowledgeBase(vocab=nlp.vocab)
|
||||
kb.load_bulk(kb_path)
|
||||
print("Loaded Knowledge Base from '%s'" % kb_path)
|
||||
entity_linker.set_kb(kb)
|
||||
|
||||
# use only the predicted EL score and not the prior probability (for demo purposes)
|
||||
cfg = {"kb": kb, "incl_prior": False}
|
||||
entity_linker = nlp.create_pipe("entity_linker", cfg)
|
||||
nlp.add_pipe(entity_linker, last=True)
|
||||
|
||||
# Convert the texts to docs to make sure we have doc.ents set for the training examples.
|
||||
# Also ensure that the annotated examples correspond to known identifiers in the knowlege base.
|
||||
# Also ensure that the annotated examples correspond to known identifiers in the knowledge base.
|
||||
kb_ids = nlp.get_pipe("entity_linker").kb.get_entity_strings()
|
||||
TRAIN_DOCS = []
|
||||
for text, annotation in TRAIN_DATA:
|
||||
with nlp.disable_pipes("entity_linker"):
|
||||
with nlp.select_pipes(disable="entity_linker"):
|
||||
doc = nlp(text)
|
||||
annotation_clean = annotation
|
||||
for offset, kb_id_dict in annotation["links"].items():
|
||||
|
@ -110,22 +111,18 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
|
|||
annotation_clean["links"][offset] = new_dict
|
||||
TRAIN_DOCS.append((doc, annotation_clean))
|
||||
|
||||
# get names of other pipes to disable them during training
|
||||
pipe_exceptions = ["entity_linker", "trf_wordpiecer", "trf_tok2vec"]
|
||||
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
|
||||
with nlp.disable_pipes(*other_pipes): # only train entity linker
|
||||
with nlp.select_pipes(enable="entity_linker"): # only train entity linker
|
||||
# reset and initialize the weights randomly
|
||||
optimizer = nlp.begin_training()
|
||||
|
||||
for itn in range(n_iter):
|
||||
random.shuffle(TRAIN_DOCS)
|
||||
losses = {}
|
||||
# batch up the examples using spaCy's minibatch
|
||||
batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))
|
||||
for batch in batches:
|
||||
texts, annotations = zip(*batch)
|
||||
nlp.update(
|
||||
texts, # batch of texts
|
||||
annotations, # batch of annotations
|
||||
batch,
|
||||
drop=0.2, # dropout - make it harder to memorise data
|
||||
losses=losses,
|
||||
sgd=optimizer,
|
||||
|
|
|
@ -124,9 +124,7 @@ def main(model=None, output_dir=None, n_iter=15):
|
|||
for dep in annotations.get("deps", []):
|
||||
parser.add_label(dep)
|
||||
|
||||
pipe_exceptions = ["parser", "trf_wordpiecer", "trf_tok2vec"]
|
||||
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
|
||||
with nlp.disable_pipes(*other_pipes): # only train parser
|
||||
with nlp.select_pipes(enable="parser"): # only train parser
|
||||
optimizer = nlp.begin_training()
|
||||
for itn in range(n_iter):
|
||||
random.shuffle(TRAIN_DATA)
|
||||
|
@ -134,8 +132,7 @@ def main(model=None, output_dir=None, n_iter=15):
|
|||
# batch up the examples using spaCy's minibatch
|
||||
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
|
||||
for batch in batches:
|
||||
texts, annotations = zip(*batch)
|
||||
nlp.update(texts, annotations, sgd=optimizer, losses=losses)
|
||||
nlp.update(batch, sgd=optimizer, losses=losses)
|
||||
print("Losses", losses)
|
||||
|
||||
# test the trained model
|
||||
|
|
133
examples/training/train_morphologizer.py
Normal file
133
examples/training/train_morphologizer.py
Normal file
|
@ -0,0 +1,133 @@
|
|||
#!/usr/bin/env python
|
||||
# coding: utf8
|
||||
"""
|
||||
A simple example for training a morphologizer. For more details, see
|
||||
the documentation:
|
||||
* Training: https://spacy.io/usage/training
|
||||
|
||||
Compatible with: spaCy v3.0.0+
|
||||
Last tested with: v3.0.0
|
||||
"""
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
import plac
|
||||
import random
|
||||
from pathlib import Path
|
||||
import spacy
|
||||
from spacy.util import minibatch, compounding
|
||||
from spacy.morphology import Morphology
|
||||
|
||||
|
||||
# Usually you'll read this in, of course. Data formats vary. Ensure your
# strings are unicode and that the number of tags assigned matches spaCy's
# tokenization. If not, you can always add a 'words' key to the annotations
# that specifies the gold-standard tokenization, e.g.:
# ("Eatblueham", {'words': ['Eat', 'blue', 'ham'], 'tags': ['V', 'J', 'N']})
#
# Each entry below is a (text, annotations) pair. "morphs" holds one feature
# string per token ("" for a token without features) and "pos" holds one
# coarse-grained part-of-speech tag per token.
TRAIN_DATA = [
    (
        "I like green eggs",
        {
            "morphs": [
                "PronType=Prs|Person=1",
                "VerbForm=Fin",
                "Degree=Pos",
                "Number=Plur",
            ],
            "pos": ["PRON", "VERB", "ADJ", "NOUN"],
        },
    ),
    (
        "Eat blue ham",
        {
            "morphs": ["VerbForm=Inf", "Degree=Pos", "Number=Sing"],
            "pos": ["VERB", "ADJ", "NOUN"],
        },
    ),
    (
        "She was blue",
        {
            "morphs": ["PronType=Prs|Person=3", "VerbForm=Fin", "Degree=Pos"],
            "pos": ["PRON", "VERB", "ADJ"],
        },
    ),
    (
        "He was blue today",
        {
            "morphs": ["PronType=Prs|Person=3", "VerbForm=Fin", "Degree=Pos", ""],
            "pos": ["PRON", "VERB", "ADJ", "ADV"],
        },
    ),
]

# The POS tags are optional, set `with_pos_tags = False` to omit them for
# this example:
with_pos_tags = True

if not with_pos_tags:
    for _, annotations in TRAIN_DATA:
        # pop() with a default, so an entry that never had POS tags is
        # skipped instead of raising KeyError
        annotations.pop("pos", None)
|
||||
|
||||
|
||||
@plac.annotations(
    lang=("ISO Code of language to use", "option", "l", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int),
)
def main(lang="en", output_dir=None, n_iter=25):
    """Create a blank model, add a morphologizer to the pipeline and train it
    on TRAIN_DATA. The trained model is run on a test sentence and, if an
    output directory is given, saved to disk and loaded back as a check.

    lang (str): ISO code of the language of the blank model.
    output_dir (Path): Optional directory to save the trained model to.
    n_iter (int): Number of training iterations over TRAIN_DATA.
    """
    nlp = spacy.blank(lang)
    # add the morphologizer to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    morphologizer = nlp.create_pipe("morphologizer")
    nlp.add_pipe(morphologizer)

    # add labels
    for _, annotations in TRAIN_DATA:
        morph_labels = annotations.get("morphs", [])
        # POS tags are optional: fall back to one empty tag per token
        pos_labels = annotations.get("pos", [""] * len(morph_labels))
        if len(morph_labels) != len(pos_labels):
            raise ValueError(
                "Number of 'morphs' and 'pos' annotations differs: "
                f"{len(morph_labels)} vs. {len(pos_labels)}"
            )
        for morph, pos in zip(morph_labels, pos_labels):
            # the label is the feature string with the POS tag (if any)
            # folded in under the "POS" key
            morph_dict = Morphology.feats_to_dict(morph)
            if pos:
                morph_dict["POS"] = pos
            morph = Morphology.dict_to_feats(morph_dict)
            morphologizer.add_label(morph)

    optimizer = nlp.begin_training()
    for i in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            nlp.update(batch, sgd=optimizer, losses=losses)
        print("Losses", losses)

    # test the trained model
    test_text = "I like blue eggs"
    doc = nlp(test_text)
    print("Morphs", [(t.text, t.morph) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        # also create missing parent directories; no error if it exists
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc = nlp2(test_text)
        print("Morphs", [(t.text, t.morph) for t in doc])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
plac.call(main)
|
||||
|
||||
# Expected output:
|
||||
# Morphs [('I', POS=PRON|Person=1|PronType=Prs), ('like', POS=VERB|VerbForm=Fin), ('blue', Degree=Pos|POS=ADJ), ('eggs', Number=Plur|POS=NOUN)]
|
|
@ -43,41 +43,39 @@ def main(model=None, output_dir=None, n_iter=100):
|
|||
|
||||
# create the built-in pipeline components and add them to the pipeline
|
||||
# nlp.create_pipe works for built-ins that are registered with spaCy
|
||||
if "ner" not in nlp.pipe_names:
|
||||
ner = nlp.create_pipe("ner")
|
||||
if "simple_ner" not in nlp.pipe_names:
|
||||
ner = nlp.create_pipe("simple_ner")
|
||||
nlp.add_pipe(ner, last=True)
|
||||
# otherwise, get it so we can add labels
|
||||
else:
|
||||
ner = nlp.get_pipe("ner")
|
||||
ner = nlp.get_pipe("simple_ner")
|
||||
|
||||
# add labels
|
||||
for _, annotations in TRAIN_DATA:
|
||||
for ent in annotations.get("entities"):
|
||||
print("Add label", ent[2])
|
||||
ner.add_label(ent[2])
|
||||
|
||||
# get names of other pipes to disable them during training
|
||||
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
|
||||
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
|
||||
# only train NER
|
||||
with nlp.disable_pipes(*other_pipes) and warnings.catch_warnings():
|
||||
with nlp.select_pipes(enable="ner") and warnings.catch_warnings():
|
||||
# show warnings for misaligned entity spans once
|
||||
warnings.filterwarnings("once", category=UserWarning, module='spacy')
|
||||
warnings.filterwarnings("once", category=UserWarning, module="spacy")
|
||||
|
||||
# reset and initialize the weights randomly – but only if we're
|
||||
# training a new model
|
||||
if model is None:
|
||||
nlp.begin_training()
|
||||
print(
|
||||
"Transitions", list(enumerate(nlp.get_pipe("simple_ner").get_tag_names()))
|
||||
)
|
||||
for itn in range(n_iter):
|
||||
random.shuffle(TRAIN_DATA)
|
||||
losses = {}
|
||||
# batch up the examples using spaCy's minibatch
|
||||
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
|
||||
for batch in batches:
|
||||
texts, annotations = zip(*batch)
|
||||
nlp.update(
|
||||
texts, # batch of texts
|
||||
annotations, # batch of annotations
|
||||
drop=0.5, # dropout - make it harder to memorise data
|
||||
batch,
|
||||
drop=0.0, # dropout - make it harder to memorise data
|
||||
losses=losses,
|
||||
)
|
||||
print("Losses", losses)
|
||||
|
|
|
@ -95,13 +95,9 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
|
|||
else:
|
||||
optimizer = nlp.resume_training()
|
||||
move_names = list(ner.move_names)
|
||||
# get names of other pipes to disable them during training
|
||||
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
|
||||
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
|
||||
# only train NER
|
||||
with nlp.disable_pipes(*other_pipes) and warnings.catch_warnings():
|
||||
with nlp.select_pipes(enable="ner") and warnings.catch_warnings():
|
||||
# show warnings for misaligned entity spans once
|
||||
warnings.filterwarnings("once", category=UserWarning, module='spacy')
|
||||
warnings.filterwarnings("once", category=UserWarning, module="spacy")
|
||||
|
||||
sizes = compounding(1.0, 4.0, 1.001)
|
||||
# batch up the examples using spaCy's minibatch
|
||||
|
@ -110,8 +106,7 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
|
|||
batches = minibatch(TRAIN_DATA, size=sizes)
|
||||
losses = {}
|
||||
for batch in batches:
|
||||
texts, annotations = zip(*batch)
|
||||
nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
|
||||
nlp.update(batch, sgd=optimizer, drop=0.35, losses=losses)
|
||||
print("Losses", losses)
|
||||
|
||||
# test the trained model
|
||||
|
|
|
@ -64,10 +64,7 @@ def main(model=None, output_dir=None, n_iter=15):
|
|||
for dep in annotations.get("deps", []):
|
||||
parser.add_label(dep)
|
||||
|
||||
# get names of other pipes to disable them during training
|
||||
pipe_exceptions = ["parser", "trf_wordpiecer", "trf_tok2vec"]
|
||||
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
|
||||
with nlp.disable_pipes(*other_pipes): # only train parser
|
||||
with nlp.select_pipes(enable="parser"): # only train parser
|
||||
optimizer = nlp.begin_training()
|
||||
for itn in range(n_iter):
|
||||
random.shuffle(TRAIN_DATA)
|
||||
|
@ -75,8 +72,7 @@ def main(model=None, output_dir=None, n_iter=15):
|
|||
# batch up the examples using spaCy's minibatch
|
||||
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
|
||||
for batch in batches:
|
||||
texts, annotations = zip(*batch)
|
||||
nlp.update(texts, annotations, sgd=optimizer, losses=losses)
|
||||
nlp.update(batch, sgd=optimizer, losses=losses)
|
||||
print("Losses", losses)
|
||||
|
||||
# test the trained model
|
||||
|
|
|
@ -65,8 +65,7 @@ def main(lang="en", output_dir=None, n_iter=25):
|
|||
# batch up the examples using spaCy's minibatch
|
||||
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
|
||||
for batch in batches:
|
||||
texts, annotations = zip(*batch)
|
||||
nlp.update(texts, annotations, sgd=optimizer, losses=losses)
|
||||
nlp.update(batch, sgd=optimizer, losses=losses)
|
||||
print("Losses", losses)
|
||||
|
||||
# test the trained model
|
||||
|
|
|
@ -2,89 +2,87 @@
|
|||
# coding: utf8
|
||||
"""Train a convolutional neural network text classifier on the
|
||||
IMDB dataset, using the TextCategorizer component. The dataset will be loaded
|
||||
automatically via Thinc's built-in dataset loader. The model is added to
|
||||
automatically via the package `ml_datasets`. The model is added to
|
||||
spacy.pipeline, and predictions are available via `doc.cats`. For more details,
|
||||
see the documentation:
|
||||
* Training: https://spacy.io/usage/training
|
||||
|
||||
Compatible with: spaCy v2.0.0+
|
||||
Compatible with: spaCy v3.0.0+
|
||||
"""
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
import plac
|
||||
import random
|
||||
from pathlib import Path
|
||||
import thinc.extra.datasets
|
||||
from ml_datasets import loaders
|
||||
|
||||
import spacy
|
||||
from spacy import util
|
||||
from spacy.util import minibatch, compounding
|
||||
from spacy.gold import Example, GoldParse
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
|
||||
config_path=("Path to config file", "positional", None, Path),
|
||||
output_dir=("Optional output directory", "option", "o", Path),
|
||||
n_texts=("Number of texts to train from", "option", "t", int),
|
||||
n_iter=("Number of training iterations", "option", "n", int),
|
||||
init_tok2vec=("Pretrained tok2vec weights", "option", "t2v", Path),
|
||||
dataset=("Dataset to train on (default: imdb)", "option", "d", str),
|
||||
threshold=("Min. number of instances for a given label (default 20)", "option", "m", int)
|
||||
)
|
||||
def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None):
|
||||
def main(config_path, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None, dataset="imdb", threshold=20):
|
||||
if not config_path or not config_path.exists():
|
||||
raise ValueError(f"Config file not found at {config_path}")
|
||||
|
||||
spacy.util.fix_random_seed()
|
||||
if output_dir is not None:
|
||||
output_dir = Path(output_dir)
|
||||
if not output_dir.exists():
|
||||
output_dir.mkdir()
|
||||
|
||||
if model is not None:
|
||||
nlp = spacy.load(model) # load existing spaCy model
|
||||
print("Loaded model '%s'" % model)
|
||||
else:
|
||||
nlp = spacy.blank("en") # create blank Language class
|
||||
print("Created blank 'en' model")
|
||||
print(f"Loading nlp model from {config_path}")
|
||||
nlp_config = util.load_config(config_path, create_objects=False)["nlp"]
|
||||
nlp = util.load_model_from_config(nlp_config)
|
||||
|
||||
# add the text classifier to the pipeline if it doesn't exist
|
||||
# nlp.create_pipe works for built-ins that are registered with spaCy
|
||||
# ensure the nlp object was defined with a textcat component
|
||||
if "textcat" not in nlp.pipe_names:
|
||||
textcat = nlp.create_pipe(
|
||||
"textcat", config={"exclusive_classes": True, "architecture": "simple_cnn"}
|
||||
)
|
||||
nlp.add_pipe(textcat, last=True)
|
||||
# otherwise, get it, so we can add labels to it
|
||||
else:
|
||||
textcat = nlp.get_pipe("textcat")
|
||||
raise ValueError(f"The nlp definition in the config does not contain a textcat component")
|
||||
|
||||
# add label to text classifier
|
||||
textcat.add_label("POSITIVE")
|
||||
textcat.add_label("NEGATIVE")
|
||||
textcat = nlp.get_pipe("textcat")
|
||||
|
||||
# load the IMDB dataset
|
||||
print("Loading IMDB data...")
|
||||
(train_texts, train_cats), (dev_texts, dev_cats) = load_data()
|
||||
train_texts = train_texts[:n_texts]
|
||||
train_cats = train_cats[:n_texts]
|
||||
# load the dataset
|
||||
print(f"Loading dataset {dataset} ...")
|
||||
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(dataset=dataset, threshold=threshold, limit=n_texts)
|
||||
print(
|
||||
"Using {} examples ({} training, {} evaluation)".format(
|
||||
n_texts, len(train_texts), len(dev_texts)
|
||||
)
|
||||
)
|
||||
train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))
|
||||
train_examples = []
|
||||
for text, cats in zip(train_texts, train_cats):
|
||||
doc = nlp.make_doc(text)
|
||||
gold = GoldParse(doc, cats=cats)
|
||||
for cat in cats:
|
||||
textcat.add_label(cat)
|
||||
ex = Example.from_gold(gold, doc=doc)
|
||||
train_examples.append(ex)
|
||||
|
||||
# get names of other pipes to disable them during training
|
||||
pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
|
||||
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
|
||||
with nlp.disable_pipes(*other_pipes): # only train textcat
|
||||
with nlp.select_pipes(enable="textcat"): # only train textcat
|
||||
optimizer = nlp.begin_training()
|
||||
if init_tok2vec is not None:
|
||||
with init_tok2vec.open("rb") as file_:
|
||||
textcat.model.tok2vec.from_bytes(file_.read())
|
||||
textcat.model.get_ref("tok2vec").from_bytes(file_.read())
|
||||
print("Training the model...")
|
||||
print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
|
||||
batch_sizes = compounding(4.0, 32.0, 1.001)
|
||||
for i in range(n_iter):
|
||||
losses = {}
|
||||
# batch up the examples using spaCy's minibatch
|
||||
random.shuffle(train_data)
|
||||
batches = minibatch(train_data, size=batch_sizes)
|
||||
random.shuffle(train_examples)
|
||||
batches = minibatch(train_examples, size=batch_sizes)
|
||||
for batch in batches:
|
||||
texts, annotations = zip(*batch)
|
||||
nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
|
||||
nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)
|
||||
with textcat.model.use_params(optimizer.averages):
|
||||
# evaluate on the dev data split off in load_data()
|
||||
scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
|
||||
|
@ -97,7 +95,7 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None
|
|||
)
|
||||
)
|
||||
|
||||
# test the trained model
|
||||
# test the trained model (only makes sense for sentiment analysis)
|
||||
test_text = "This movie sucked"
|
||||
doc = nlp(test_text)
|
||||
print(test_text, doc.cats)
|
||||
|
@ -114,14 +112,48 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None
|
|||
print(test_text, doc2.cats)
|
||||
|
||||
|
||||
def load_data(limit=0, split=0.8):
|
||||
"""Load data from the IMDB dataset."""
|
||||
def load_data(dataset, threshold, limit=0, split=0.8):
    """Load data from the provided dataset.

    dataset (str): Name of the dataset, looked up with `loaders.get`.
    threshold (int): Minimum number of examples a label needs in order to
        be kept as a category.
    limit (int): Number of examples to keep after shuffling. With 0, the
        slice below keeps everything the loader returned.
    split (float): Fraction of the examples returned as training data; the
        remainder is returned as evaluation data.
    RETURNS (tuple): ((train_texts, train_cats), (dev_texts, dev_cats)),
        where each cats entry is a dict mapping label name to bool.
    """
    # Partition off part of the train data for evaluation
    data_loader = loaders.get(dataset)
    train_data, _ = data_loader(limit=int(limit / split))
    random.shuffle(train_data)
    train_data = train_data[-limit:]
    texts, labels = zip(*train_data)

    # Count how many examples carry each label. An example is annotated
    # either with a single label (int/str) or a collection of labels.
    label_counts = {}
    for label_set in labels:
        if isinstance(label_set, (int, str)):
            label_counts[label_set] = label_counts.get(label_set, 0) + 1
        elif isinstance(label_set, (list, set)):
            for label in label_set:
                label_counts[label] = label_counts.get(label, 0) + 1
        else:
            raise ValueError(f"Unrecognised type of labels: {type(label_set)}")

    unique_labels = sorted(label_counts)
    print(f"# of unique_labels: {len(unique_labels)}")

    # Drop labels that occur in fewer than `threshold` examples
    unique_labels = [
        label for label in unique_labels if label_counts[label] >= threshold
    ]
    print(f"# of unique_labels after filtering with threshold {threshold}: {len(unique_labels)}")

    # `unique_labels` is a list, so compare as a set; the binary case only
    # applies when every example has a single integer label.
    is_binary = set(unique_labels) == {0, 1} and all(
        isinstance(y, int) for y in labels
    )
    if is_binary:
        cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
    else:
        cats = []
        for y in labels:
            if isinstance(y, (str, int)):
                cats.append({str(label): (label == y) for label in unique_labels})
            else:
                # list or set of labels (other types were rejected above)
                cats.append({str(label): (label in y) for label in unique_labels})

    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])
|
||||
|
||||
|
|
19
examples/training/train_textcat_config.cfg
Normal file
19
examples/training/train_textcat_config.cfg
Normal file
|
@ -0,0 +1,19 @@
|
|||
[nlp]
|
||||
lang = "en"
|
||||
|
||||
[nlp.pipeline.textcat]
|
||||
factory = "textcat"
|
||||
|
||||
[nlp.pipeline.textcat.model]
|
||||
@architectures = "spacy.TextCatCNN.v1"
|
||||
exclusive_classes = false
|
||||
|
||||
[nlp.pipeline.textcat.model.tok2vec]
|
||||
@architectures = "spacy.HashEmbedCNN.v1"
|
||||
pretrained_vectors = null
|
||||
width = 96
|
||||
depth = 4
|
||||
embed_size = 2000
|
||||
window_size = 1
|
||||
maxout_pieces = 3
|
||||
subword_features = true
|
9
fabfile.py
vendored
9
fabfile.py
vendored
|
@ -1,9 +1,6 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
import contextlib
|
||||
from pathlib import Path
|
||||
from fabric.api import local, lcd, env, settings, prefix
|
||||
from fabric.api import local, lcd
|
||||
from os import path, environ
|
||||
import shutil
|
||||
import sys
|
||||
|
@ -82,9 +79,7 @@ def pex():
|
|||
with virtualenv(VENV_DIR) as venv_local:
|
||||
with lcd(path.dirname(__file__)):
|
||||
sha = local("git rev-parse --short HEAD", capture=True)
|
||||
venv_local(
|
||||
"pex dist/*.whl -e spacy -o dist/spacy-%s.pex" % sha, direct=True
|
||||
)
|
||||
venv_local(f"pex dist/*.whl -e spacy -o dist/spacy-{sha}.pex", direct=True)
|
||||
|
||||
|
||||
def clean():
|
||||
|
|
|
@ -6,6 +6,7 @@ requires = [
|
|||
"cymem>=2.0.2,<2.1.0",
|
||||
"preshed>=3.0.2,<3.1.0",
|
||||
"murmurhash>=0.28.0,<1.1.0",
|
||||
"thinc==7.4.1",
|
||||
"thinc==8.0.0a9",
|
||||
"blis>=0.4.0,<0.5.0"
|
||||
]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
|
|
@ -1,20 +1,23 @@
|
|||
# Our libraries
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc==7.4.1
|
||||
thinc==8.0.0a9
|
||||
blis>=0.4.0,<0.5.0
|
||||
ml_datasets>=0.1.1
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
wasabi>=0.4.0,<1.1.0
|
||||
srsly>=1.0.2,<1.1.0
|
||||
srsly>=2.0.0,<3.0.0
|
||||
catalogue>=0.0.7,<1.1.0
|
||||
# Third party dependencies
|
||||
numpy>=1.15.0
|
||||
requests>=2.13.0,<3.0.0
|
||||
plac>=0.9.6,<1.2.0
|
||||
pathlib==1.0.1; python_version < "3.4"
|
||||
tqdm>=4.38.0,<5.0.0
|
||||
# Optional dependencies
|
||||
jsonschema>=2.6.0,<3.1.0
|
||||
pydantic>=1.3.0,<2.0.0
|
||||
# Official Python utilities
|
||||
setuptools
|
||||
packaging
|
||||
importlib_metadata>=0.20; python_version < "3.8"
|
||||
# Development dependencies
|
||||
cython>=0.25
|
||||
pytest>=4.6.5
|
||||
|
|
20
setup.cfg
20
setup.cfg
|
@ -16,10 +16,7 @@ classifiers =
|
|||
Operating System :: MacOS :: MacOS X
|
||||
Operating System :: Microsoft :: Windows
|
||||
Programming Language :: Cython
|
||||
Programming Language :: Python :: 2
|
||||
Programming Language :: Python :: 2.7
|
||||
Programming Language :: Python :: 3
|
||||
Programming Language :: Python :: 3.5
|
||||
Programming Language :: Python :: 3.6
|
||||
Programming Language :: Python :: 3.7
|
||||
Programming Language :: Python :: 3.8
|
||||
|
@ -30,32 +27,37 @@ zip_safe = false
|
|||
include_package_data = true
|
||||
scripts =
|
||||
bin/spacy
|
||||
python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*
|
||||
python_requires = >=3.6
|
||||
setup_requires =
|
||||
wheel
|
||||
cython>=0.25
|
||||
numpy>=1.15.0
|
||||
# We also need our Cython packages here to compile against
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
thinc==7.4.1
|
||||
thinc==8.0.0a9
|
||||
install_requires =
|
||||
# Our libraries
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc==7.4.1
|
||||
thinc==8.0.0a9
|
||||
blis>=0.4.0,<0.5.0
|
||||
wasabi>=0.4.0,<1.1.0
|
||||
srsly>=1.0.2,<1.1.0
|
||||
srsly>=2.0.0,<3.0.0
|
||||
catalogue>=0.0.7,<1.1.0
|
||||
ml_datasets>=0.1.1
|
||||
# Third-party dependencies
|
||||
tqdm>=4.38.0,<5.0.0
|
||||
setuptools
|
||||
numpy>=1.15.0
|
||||
plac>=0.9.6,<1.2.0
|
||||
requests>=2.13.0,<3.0.0
|
||||
pathlib==1.0.1; python_version < "3.4"
|
||||
pydantic>=1.3.0,<2.0.0
|
||||
# Official Python utilities
|
||||
setuptools
|
||||
packaging
|
||||
importlib_metadata>=0.20; python_version < "3.8"
|
||||
|
||||
[options.extras_require]
|
||||
lookups =
|
||||
|
|
180
setup.py
180
setup.py
|
@ -1,35 +1,27 @@
|
|||
#!/usr/bin/env python
|
||||
from __future__ import print_function
|
||||
import io
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import contextlib
|
||||
import platform
|
||||
from distutils.command.build_ext import build_ext
|
||||
from distutils.sysconfig import get_python_inc
|
||||
import distutils.util
|
||||
from distutils import ccompiler, msvccompiler
|
||||
from setuptools import Extension, setup, find_packages
|
||||
import numpy
|
||||
from pathlib import Path
|
||||
import shutil
|
||||
from Cython.Build import cythonize
|
||||
from Cython.Compiler import Options
|
||||
|
||||
|
||||
def is_new_osx():
|
||||
"""Check whether we're on OSX >= 10.10"""
|
||||
name = distutils.util.get_platform()
|
||||
if sys.platform != "darwin":
|
||||
return False
|
||||
elif name.startswith("macosx-10"):
|
||||
minor_version = int(name.split("-")[1].split(".")[1])
|
||||
if minor_version >= 7:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
else:
|
||||
return False
|
||||
ROOT = Path(__file__).parent
|
||||
PACKAGE_ROOT = ROOT / "spacy"
|
||||
|
||||
|
||||
# Preserve `__doc__` on functions and classes
|
||||
# http://docs.cython.org/en/latest/src/userguide/source_files_and_compilation.html#compiler-options
|
||||
Options.docstrings = True
|
||||
|
||||
PACKAGES = find_packages()
|
||||
|
||||
|
||||
MOD_NAMES = [
|
||||
"spacy.parts_of_speech",
|
||||
"spacy.strings",
|
||||
|
@ -62,16 +54,38 @@ MOD_NAMES = [
|
|||
"spacy.symbols",
|
||||
"spacy.vectors",
|
||||
]
|
||||
|
||||
|
||||
COMPILE_OPTIONS = {
|
||||
"msvc": ["/Ox", "/EHsc"],
|
||||
"mingw32": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
|
||||
"other": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
|
||||
}
|
||||
|
||||
|
||||
LINK_OPTIONS = {"msvc": [], "mingw32": [], "other": []}
|
||||
COMPILER_DIRECTIVES = {
|
||||
"language_level": -3,
|
||||
"embedsignature": True,
|
||||
"annotation_typing": False,
|
||||
}
|
||||
# Files to copy into the package that are otherwise not included
|
||||
COPY_FILES = {
|
||||
ROOT / "setup.cfg": PACKAGE_ROOT / "tests" / "package",
|
||||
ROOT / "pyproject.toml": PACKAGE_ROOT / "tests" / "package",
|
||||
ROOT / "requirements.txt": PACKAGE_ROOT / "tests" / "package",
|
||||
}
|
||||
|
||||
|
||||
def _mac_version_at_least(version, minimum):
    """Return True if the dotted version string is >= the (major, minor) tuple."""
    parts = []
    for piece in version.split(".")[:2]:
        if not piece.isdigit():
            return False
        parts.append(int(piece))
    if not parts:
        return False
    # treat a missing minor component (e.g. "11") as 0
    while len(parts) < len(minimum):
        parts.append(0)
    return tuple(parts) >= tuple(minimum)


def is_new_osx():
    """Check whether we're on OSX >= 10.7"""
    if sys.platform != "darwin":
        return False
    # Compare (major, minor) as a whole, so releases after 10.x (11, 12, ...)
    # also count as new
    return _mac_version_at_least(platform.mac_ver()[0], (10, 7))
|
||||
|
||||
|
||||
if is_new_osx():
|
||||
|
@ -104,95 +118,53 @@ class build_ext_subclass(build_ext, build_ext_options):
|
|||
build_ext.build_extensions(self)
|
||||
|
||||
|
||||
def generate_cython(root, source):
|
||||
print("Cythonizing sources")
|
||||
p = subprocess.call(
|
||||
[sys.executable, os.path.join(root, "bin", "cythonize.py"), source],
|
||||
env=os.environ,
|
||||
)
|
||||
if p != 0:
|
||||
raise RuntimeError("Running cythonize failed")
|
||||
|
||||
|
||||
def is_source_release(path):
|
||||
return os.path.exists(os.path.join(path, "PKG-INFO"))
|
||||
|
||||
|
||||
def clean(path):
    """Delete the .so and .cpp files found anywhere below `path`.

    path (Path): Directory that is searched recursively.
    """
    # Use a separate loop variable instead of rebinding the `path` argument
    for file_path in path.glob("**/*"):
        if file_path.is_file() and file_path.suffix in (".so", ".cpp"):
            print(f"Deleting {file_path.name}")
            file_path.unlink()
|
||||
|
||||
|
||||
def setup_package():
|
||||
root = os.path.abspath(os.path.dirname(__file__))
|
||||
|
||||
if len(sys.argv) > 1 and sys.argv[1] == "clean":
|
||||
return clean(root)
|
||||
return clean(PACKAGE_ROOT)
|
||||
|
||||
with chdir(root):
|
||||
with io.open(os.path.join(root, "spacy", "about.py"), encoding="utf8") as f:
|
||||
about = {}
|
||||
exec(f.read(), about)
|
||||
with (PACKAGE_ROOT / "about.py").open("r") as f:
|
||||
about = {}
|
||||
exec(f.read(), about)
|
||||
|
||||
include_dirs = [
|
||||
get_python_inc(plat_specific=True),
|
||||
os.path.join(root, "include"),
|
||||
]
|
||||
for copy_file, target_dir in COPY_FILES.items():
|
||||
if copy_file.exists():
|
||||
shutil.copy(str(copy_file), str(target_dir))
|
||||
print(f"Copied {copy_file} -> {target_dir}")
|
||||
|
||||
if (
|
||||
ccompiler.new_compiler().compiler_type == "msvc"
|
||||
and msvccompiler.get_build_version() == 9
|
||||
):
|
||||
include_dirs.append(os.path.join(root, "include", "msvc9"))
|
||||
include_dirs = [
|
||||
get_python_inc(plat_specific=True),
|
||||
numpy.get_include(),
|
||||
str(ROOT / "include"),
|
||||
]
|
||||
if (
|
||||
ccompiler.new_compiler().compiler_type == "msvc"
|
||||
and msvccompiler.get_build_version() == 9
|
||||
):
|
||||
include_dirs.append(str(ROOT / "include" / "msvc9"))
|
||||
ext_modules = []
|
||||
for name in MOD_NAMES:
|
||||
mod_path = name.replace(".", "/") + ".pyx"
|
||||
ext = Extension(name, [mod_path], language="c++")
|
||||
ext_modules.append(ext)
|
||||
print("Cythonizing sources")
|
||||
ext_modules = cythonize(ext_modules, compiler_directives=COMPILER_DIRECTIVES)
|
||||
|
||||
ext_modules = []
|
||||
for mod_name in MOD_NAMES:
|
||||
mod_path = mod_name.replace(".", "/") + ".cpp"
|
||||
extra_link_args = []
|
||||
# ???
|
||||
# Imported from patch from @mikepb
|
||||
# See Issue #267. Running blind here...
|
||||
if sys.platform == "darwin":
|
||||
dylib_path = [".." for _ in range(mod_name.count("."))]
|
||||
dylib_path = "/".join(dylib_path)
|
||||
dylib_path = "@loader_path/%s/spacy/platform/darwin/lib" % dylib_path
|
||||
extra_link_args.append("-Wl,-rpath,%s" % dylib_path)
|
||||
ext_modules.append(
|
||||
Extension(
|
||||
mod_name,
|
||||
[mod_path],
|
||||
language="c++",
|
||||
include_dirs=include_dirs,
|
||||
extra_link_args=extra_link_args,
|
||||
)
|
||||
)
|
||||
|
||||
if not is_source_release(root):
|
||||
generate_cython(root, "spacy")
|
||||
|
||||
setup(
|
||||
name="spacy",
|
||||
packages=PACKAGES,
|
||||
version=about["__version__"],
|
||||
ext_modules=ext_modules,
|
||||
cmdclass={"build_ext": build_ext_subclass},
|
||||
)
|
||||
setup(
|
||||
name="spacy",
|
||||
packages=PACKAGES,
|
||||
version=about["__version__"],
|
||||
ext_modules=ext_modules,
|
||||
cmdclass={"build_ext": build_ext_subclass},
|
||||
include_dirs=include_dirs,
|
||||
package_data={"": ["*.pyx", "*.pxd", "*.pxi", "*.cpp"]},
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
@ -1,5 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
import warnings
|
||||
import sys
|
||||
|
||||
|
@ -7,7 +5,7 @@ warnings.filterwarnings("ignore", message="numpy.dtype size changed")
|
|||
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
|
||||
|
||||
# These are imported as part of the API
|
||||
from thinc.neural.util import prefer_gpu, require_gpu
|
||||
from thinc.api import prefer_gpu, require_gpu
|
||||
|
||||
from . import pipeline
|
||||
from .cli.info import info as cli_info
|
||||
|
@ -23,6 +21,9 @@ if sys.maxunicode == 65535:
|
|||
raise SystemError(Errors.E130)
|
||||
|
||||
|
||||
config = registry
|
||||
|
||||
|
||||
def load(name, **overrides):
|
||||
depr_path = overrides.get("path")
|
||||
if depr_path not in (True, False, None):
|
||||
|
|
|
@ -1,21 +1,16 @@
|
|||
# coding: utf8
|
||||
from __future__ import print_function
|
||||
|
||||
# NB! This breaks in plac on Python 2!!
|
||||
# from __future__ import unicode_literals
|
||||
|
||||
if __name__ == "__main__":
|
||||
import plac
|
||||
import sys
|
||||
from wasabi import msg
|
||||
from spacy.cli import download, link, info, package, train, pretrain, convert
|
||||
from spacy.cli import download, link, info, package, pretrain, convert
|
||||
from spacy.cli import init_model, profile, evaluate, validate, debug_data
|
||||
from spacy.cli import train_cli
|
||||
|
||||
commands = {
|
||||
"download": download,
|
||||
"link": link,
|
||||
"info": info,
|
||||
"train": train,
|
||||
"train": train_cli,
|
||||
"pretrain": pretrain,
|
||||
"debug-data": debug_data,
|
||||
"evaluate": evaluate,
|
||||
|
@ -28,9 +23,9 @@ if __name__ == "__main__":
|
|||
if len(sys.argv) == 1:
|
||||
msg.info("Available commands", ", ".join(commands), exits=1)
|
||||
command = sys.argv.pop(1)
|
||||
sys.argv[0] = "spacy %s" % command
|
||||
sys.argv[0] = f"spacy {command}"
|
||||
if command in commands:
|
||||
plac.call(commands[command], sys.argv[1:])
|
||||
else:
|
||||
available = "Available: {}".format(", ".join(commands))
|
||||
msg.fail("Unknown command: {}".format(command), available, exits=1)
|
||||
available = f"Available: {', '.join(commands)}"
|
||||
msg.fail(f"Unknown command: {command}", available, exits=1)
|
||||
|
|
988
spacy/_ml.py
988
spacy/_ml.py
|
@ -1,988 +0,0 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import numpy
|
||||
import warnings
|
||||
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu
|
||||
from thinc.t2t import ExtractWindow, ParametricAttention
|
||||
from thinc.t2v import Pooling, sum_pool, mean_pool
|
||||
from thinc.i2v import HashEmbed
|
||||
from thinc.misc import Residual, FeatureExtracter
|
||||
from thinc.misc import LayerNorm as LN
|
||||
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
|
||||
from thinc.api import with_getitem, flatten_add_lengths
|
||||
from thinc.api import uniqued, wrap, noop
|
||||
from thinc.linear.linear import LinearModel
|
||||
from thinc.neural.ops import NumpyOps, CupyOps
|
||||
from thinc.neural.util import get_array_module, copy_array
|
||||
from thinc.neural.optimizers import Adam
|
||||
|
||||
from thinc import describe
|
||||
from thinc.describe import Dimension, Synapses, Biases, Gradient
|
||||
from thinc.neural._classes.affine import _set_dimensions_if_needed
|
||||
import thinc.extra.load_nlp
|
||||
|
||||
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE
|
||||
from .errors import Errors, Warnings
|
||||
from . import util
|
||||
from . import ml as new_ml
|
||||
from .ml import _legacy_tok2vec
|
||||
|
||||
|
||||
# Fallback name that link_vectors_to_models() assigns to a vector table
# whose `name` is None.
VECTORS_KEY = "spacy_pretrained_vectors"
# Backwards compatibility with <2.2.2
# While this is False, Tok2Vec() delegates to _legacy_tok2vec.Tok2Vec.
USE_MODEL_REGISTRY_TOK2VEC = False
|
||||
|
||||
|
||||
def cosine(vec1, vec2):
    """Return the cosine similarity between `vec1` and `vec2`.

    The array module is taken from `vec1`. If either vector has a norm of
    zero, the integer 0 is returned instead of dividing by zero.
    """
    xp = get_array_module(vec1)
    length1 = xp.linalg.norm(vec1)
    length2 = xp.linalg.norm(vec2)
    if length1 != 0.0 and length2 != 0.0:
        return vec1.dot(vec2) / (length1 * length2)
    return 0
|
||||
|
||||
|
||||
def create_default_optimizer(ops, **cfg):
    """Create an Adam optimizer configured through `util.env_opt`.

    ops: Backend object handed to Adam; its `device` is copied onto the
        optimizer.
    **cfg: Accepted for interface compatibility; not read here.
    RETURNS: The configured Adam instance.
    """
    rate = util.env_opt("learn_rate", 0.001)
    # Dict entries are evaluated top to bottom, so the settings are looked
    # up in the same order as before.
    adam_settings = {
        "beta1": util.env_opt("optimizer_B1", 0.9),
        "beta2": util.env_opt("optimizer_B2", 0.999),
        "eps": util.env_opt("optimizer_eps", 1e-8),
        "L2": util.env_opt("L2_penalty", 1e-6),
    }
    clip = util.env_opt("grad_norm_clip", 1.0)
    adam = Adam(ops, rate, **adam_settings)
    adam.max_grad_norm = clip
    adam.device = ops.device
    return adam
|
||||
|
||||
|
||||
@layerize
def _flatten_add_lengths(seqs, pad=0, drop=0.0):
    """Flatten a batch of sequences and return it together with the lengths.

    RETURNS: `((flat, lengths), backprop)`, where `backprop` unflattens a
        gradient using the recorded lengths and the same `pad`.
    """
    ops = Model.ops
    seq_lengths = ops.asarray([len(item) for item in seqs], dtype="i")

    def restore_sequences(d_flat, sgd=None):
        return ops.unflatten(d_flat, seq_lengths, pad=pad)

    flat = ops.flatten(seqs, pad=pad)
    return (flat, seq_lengths), restore_sequences
|
||||
|
||||
|
||||
def _zero_init(model):
|
||||
def _zero_init_impl(self, *args, **kwargs):
|
||||
self.W.fill(0)
|
||||
|
||||
model.on_init_hooks.append(_zero_init_impl)
|
||||
if model.W is not None:
|
||||
model.W.fill(0.0)
|
||||
return model
|
||||
|
||||
|
||||
def with_cpu(ops, model):
    """Wrap `model` so that it runs on CPU.

    Inputs and incoming gradients are passed through `_to_cpu` before they
    reach the wrapped model; outputs are passed through `_to_device` with
    the supplied `ops`.
    """
    model.to_cpu()

    def with_cpu_forward(inputs, drop=0.0):
        host_inputs = _to_cpu(inputs)
        host_outputs, inner_backprop = model.begin_update(host_inputs, drop=drop)
        device_outputs = _to_device(ops, host_outputs)

        def with_cpu_backprop(d_outputs, sgd=None):
            return inner_backprop(_to_cpu(d_outputs), sgd=sgd)

        return device_outputs, with_cpu_backprop

    return wrap(with_cpu_forward, model)
|
||||
|
||||
|
||||
def _to_cpu(X):
|
||||
if isinstance(X, numpy.ndarray):
|
||||
return X
|
||||
elif isinstance(X, tuple):
|
||||
return tuple([_to_cpu(x) for x in X])
|
||||
elif isinstance(X, list):
|
||||
return [_to_cpu(x) for x in X]
|
||||
elif hasattr(X, "get"):
|
||||
return X.get()
|
||||
else:
|
||||
return X
|
||||
|
||||
|
||||
def _to_device(ops, X):
|
||||
if isinstance(X, tuple):
|
||||
return tuple([_to_device(ops, x) for x in X])
|
||||
elif isinstance(X, list):
|
||||
return [_to_device(ops, x) for x in X]
|
||||
else:
|
||||
return ops.asarray(X)
|
||||
|
||||
|
||||
class extract_ngrams(Model):
    """Layer that turns docs into counted n-gram keys of a token attribute."""

    def __init__(self, ngram_size, attr=LOWER):
        """Store the maximum n-gram size and the token attribute to read."""
        Model.__init__(self)
        self.ngram_size = ngram_size
        self.attr = attr

    def begin_update(self, docs, drop=0.0):
        """Extract unique n-gram keys and their counts for each doc.

        RETURNS: `((keys, counts, lengths), None)`. `keys` and `counts` are
            the per-doc results concatenated; `lengths` holds the number of
            unique keys per doc. There is no backprop callback.
        """
        batch_keys = []
        batch_vals = []
        for doc in docs:
            unigrams = doc.to_array([self.attr])
            ngrams = [unigrams]
            # Add bigrams up to `ngram_size`-grams on top of the unigrams.
            for n in range(2, self.ngram_size + 1):
                ngrams.append(self.ops.ngrams(n, unigrams))
            keys = self.ops.xp.concatenate(ngrams)
            # Collapse duplicates, keeping how often each key occurred.
            keys, vals = self.ops.xp.unique(keys, return_counts=True)
            batch_keys.append(keys)
            batch_vals.append(vals)
        # The dtype here matches what thinc is expecting -- which differs per
        # platform (by int definition). This should be fixed once the problem
        # is fixed on Thinc's side.
        lengths = self.ops.asarray(
            [arr.shape[0] for arr in batch_keys], dtype=numpy.int_
        )
        batch_keys = self.ops.xp.concatenate(batch_keys)
        batch_vals = self.ops.asarray(self.ops.xp.concatenate(batch_vals), dtype="f")
        return (batch_keys, batch_vals, lengths), None
|
||||
|
||||
|
||||
@describe.on_data(
    _set_dimensions_if_needed, lambda model, X, y: model.init_weights(model)
)
@describe.attributes(
    nI=Dimension("Input size"),
    nF=Dimension("Number of features"),
    nO=Dimension("Output size"),
    nP=Dimension("Maxout pieces"),
    W=Synapses("Weights matrix", lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI)),
    b=Biases("Bias vector", lambda obj: (obj.nO, obj.nP)),
    pad=Synapses(
        "Pad",
        lambda obj: (1, obj.nF, obj.nO, obj.nP),
        lambda M, ops: ops.normal_init(M, 1.0),
    ),
    d_W=Gradient("W"),
    d_pad=Gradient("pad"),
    d_b=Gradient("b"),
)
class PrecomputableAffine(Model):
    """Affine layer that computes the hidden values of every input row for
    each of the nF feature slots up front, with an extra learned padding row.
    """

    def __init__(self, nO=None, nI=None, nF=None, nP=None, **kwargs):
        """Record the layer dimensions; any of them may be left as None."""
        Model.__init__(self, **kwargs)
        self.nO = nO
        self.nP = nP
        self.nI = nI
        self.nF = nF

    def begin_update(self, X, drop=0.0):
        """Compute the per-feature hidden values for X.

        RETURNS: `(Yf, backward)`. `Yf` is reshaped to
            (rows, nF, nO, nP) and then has the padding row stacked in front
            of it. `backward` takes a `(dY, ids)` pair.
        """
        Yf = self.ops.gemm(
            X, self.W.reshape((self.nF * self.nO * self.nP, self.nI)), trans2=True
        )
        Yf = Yf.reshape((Yf.shape[0], self.nF, self.nO, self.nP))
        Yf = self._add_padding(Yf)

        def backward(dY_ids, sgd=None):
            dY, ids = dY_ids
            dY, ids = self._backprop_padding(dY, ids)
            # Gather the input rows that each feature slot referred to.
            Xf = X[ids]
            Xf = Xf.reshape((Xf.shape[0], self.nF * self.nI))

            self.d_b += dY.sum(axis=0)
            dY = dY.reshape((dY.shape[0], self.nO * self.nP))

            # Rearrange W from (f, o, p, i) to (o, p, f, i) so it can be
            # used as a 2d matrix in the gemm calls below.
            Wopfi = self.W.transpose((1, 2, 0, 3))
            Wopfi = self.ops.xp.ascontiguousarray(Wopfi)
            Wopfi = Wopfi.reshape((self.nO * self.nP, self.nF * self.nI))
            dXf = self.ops.gemm(dY.reshape((dY.shape[0], self.nO * self.nP)), Wopfi)

            # Reuse the buffer
            # NOTE: dXf has already been computed from Wopfi at this point,
            # so overwriting the contiguous copy is safe.
            dWopfi = Wopfi
            dWopfi.fill(0.0)
            self.ops.gemm(dY, Xf, out=dWopfi, trans1=True)
            dWopfi = dWopfi.reshape((self.nO, self.nP, self.nF, self.nI))
            # (o, p, f, i) --> (f, o, p, i)
            self.d_W += dWopfi.transpose((2, 0, 1, 3))

            if sgd is not None:
                sgd(self._mem.weights, self._mem.gradient, key=self.id)
            return dXf.reshape((dXf.shape[0], self.nF, self.nI))

        return Yf, backward

    def _add_padding(self, Yf):
        """Stack the `pad` parameter in front of Yf along the first axis."""
        Yf_padded = self.ops.xp.vstack((self.pad, Yf))
        return Yf_padded

    def _backprop_padding(self, dY, ids):
        """Accumulate the padding gradient and return `(dY, ids)` unchanged."""
        # (1, nF, nO, nP) += (nN, nF, nO, nP) where IDs (nN, nF) < 0
        mask = ids < 0.0
        # Per row: how many feature slots carry a negative id.
        mask = mask.sum(axis=1)
        d_pad = dY * mask.reshape((ids.shape[0], 1, 1))
        self.d_pad += d_pad.sum(axis=0)
        return dY, ids

    @staticmethod
    def init_weights(model):
        """This is like the 'layer sequential unit variance', but instead
        of taking the actual inputs, we randomly generate whitened data.

        Why's this all so complicated? We have a huge number of inputs,
        and the maxout unit makes guessing the dynamics tricky. Instead
        we set the maxout weights to values that empirically result in
        whitened outputs given whitened inputs.
        """
        # Weights that are not all zero are left untouched.
        if (model.W ** 2).sum() != 0.0:
            return
        ops = model.ops
        xp = ops.xp
        ops.normal_init(model.W, model.nF * model.nI, inplace=True)

        # Random feature ids in [0, 1000) and standard-normal token vectors.
        ids = ops.allocate((5000, model.nF), dtype="f")
        ids += xp.random.uniform(0, 1000, ids.shape)
        ids = ops.asarray(ids, dtype="i")
        tokvecs = ops.allocate((5000, model.nI), dtype="f")
        tokvecs += xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
            tokvecs.shape
        )

        def predict(ids, tokvecs):
            # nS ids. nW tokvecs. Exclude the padding array.
            hiddens = model(tokvecs[:-1])  # (nW, f, o, p)
            vectors = model.ops.allocate((ids.shape[0], model.nO * model.nP), dtype="f")
            # need nS vectors
            hiddens = hiddens.reshape(
                (hiddens.shape[0] * model.nF, model.nO * model.nP)
            )
            model.ops.scatter_add(vectors, ids.flatten(), hiddens)
            vectors = vectors.reshape((vectors.shape[0], model.nO, model.nP))
            vectors += model.b
            vectors = model.ops.asarray(vectors)
            if model.nP >= 2:
                return model.ops.maxout(vectors)[0]
            else:
                # Single piece: zero out the negative activations.
                return vectors * (vectors >= 0)

        tol_var = 0.01
        tol_mean = 0.01
        t_max = 10
        t_i = 0
        # At most t_max rounds: rescale W until the variance is within
        # tolerance of 1, then shift b until the mean is within tolerance of 0.
        for t_i in range(t_max):
            acts1 = predict(ids, tokvecs)
            var = model.ops.xp.var(acts1)
            mean = model.ops.xp.mean(acts1)
            if abs(var - 1.0) >= tol_var:
                model.W /= model.ops.xp.sqrt(var)
            elif abs(mean) >= tol_mean:
                model.b -= mean
            else:
                break
|
||||
|
||||
|
||||
def link_vectors_to_models(vocab, skip_rank=False):
    """Register the vocab's vector table in `thinc.extra.load_nlp.VECTORS`.

    vocab: Object exposing `vectors` and iterable over its words.
    skip_rank: If True, the `rank` of the words is left untouched.

    Side effects: may assign `vectors.name`, may emit warnings W020 / W019,
    and (unless `skip_rank`) sets `rank` on every word in the vocab.
    """
    vectors = vocab.vectors
    if vectors.name is None:
        vectors.name = VECTORS_KEY
        # Only warn about the unnamed table if it actually holds data.
        if vectors.data.size != 0:
            warnings.warn(Warnings.W020.format(shape=vectors.data.shape))
    ops = Model.ops
    if not skip_rank:
        # A word's rank is its row in the vector table, or OOV_RANK if absent.
        for word in vocab:
            if word.orth in vectors.key2row:
                word.rank = vectors.key2row[word.orth]
            else:
                word.rank = util.OOV_RANK
    data = ops.asarray(vectors.data)
    # Set an entry here, so that vectors are accessed by StaticVectors
    # (unideal, I know)
    key = (ops.device, vectors.name)
    if key in thinc.extra.load_nlp.VECTORS:
        if thinc.extra.load_nlp.VECTORS[key].shape != data.shape:
            # This is a hack to avoid the problem in #3853.
            # A table of a different shape is already registered under this
            # name, so rename ours by appending its row count.
            old_name = vectors.name
            new_name = vectors.name + "_%d" % data.shape[0]
            warnings.warn(Warnings.W019.format(old=old_name, new=new_name))
            vectors.name = new_name
            key = (ops.device, vectors.name)
    thinc.extra.load_nlp.VECTORS[key] = data
|
||||
|
||||
|
||||
def PyTorchBiLSTM(nO, nI, depth, dropout=0.2):
    """Build a bidirectional PyTorch LSTM wrapped as a thinc layer.

    The LSTM gets a hidden size of `nO // 2`. With `depth == 0` a no-op
    layer is returned instead. torch and the thinc wrappers are imported
    lazily, on every call.
    """
    import torch.nn
    from thinc.api import with_square_sequences
    from thinc.extra.wrappers import PyTorchWrapperRNN

    if depth != 0:
        lstm = torch.nn.LSTM(
            nI, nO // 2, depth, bidirectional=True, dropout=dropout
        )
        wrapped = PyTorchWrapperRNN(lstm)
        return with_square_sequences(wrapped)
    return layerize(noop())
|
||||
|
||||
|
||||
def Tok2Vec(width, embed_size, **kwargs):
    """Build a token-to-vector model.

    While USE_MODEL_REGISTRY_TOK2VEC is False this simply delegates to
    `_legacy_tok2vec.Tok2Vec`. Otherwise a nested config dict is assembled
    from the keyword options and passed to `new_ml.Tok2Vec`.

    Recognised keyword options (with their defaults): pretrained_vectors
    (None), cnn_maxout_pieces (3), subword_features (True), char_embed
    (False), conv_depth (4), bilstm_depth (0), conv_window (1).
    """
    if not USE_MODEL_REGISTRY_TOK2VEC:
        # Preserve prior tok2vec for backwards compat, in v2.2.2
        return _legacy_tok2vec.Tok2Vec(width, embed_size, **kwargs)
    pretrained_vectors = kwargs.get("pretrained_vectors", None)
    cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
    subword_features = kwargs.get("subword_features", True)
    char_embed = kwargs.get("char_embed", False)
    conv_depth = kwargs.get("conv_depth", 4)
    bilstm_depth = kwargs.get("bilstm_depth", 0)
    conv_window = kwargs.get("conv_window", 1)

    cols = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]

    doc2feats_cfg = {"arch": "spacy.Doc2Feats.v1", "config": {"columns": cols}}
    # Embedding: character-based or hash-embedded token attributes.
    if char_embed:
        embed_cfg = {
            "arch": "spacy.CharacterEmbed.v1",
            "config": {
                "width": 64,
                "chars": 6,
                "@mix": {
                    "arch": "spacy.LayerNormalizedMaxout.v1",
                    "config": {"width": width, "pieces": 3},
                },
                "@embed_features": None,
            },
        }
    else:
        embed_cfg = {
            "arch": "spacy.MultiHashEmbed.v1",
            "config": {
                "width": width,
                "rows": embed_size,
                "columns": cols,
                "use_subwords": subword_features,
                "@pretrained_vectors": None,
                "@mix": {
                    "arch": "spacy.LayerNormalizedMaxout.v1",
                    "config": {"width": width, "pieces": 3},
                },
            },
        }
        if pretrained_vectors:
            embed_cfg["config"]["@pretrained_vectors"] = {
                "arch": "spacy.PretrainedVectors.v1",
                "config": {
                    "vectors_name": pretrained_vectors,
                    "width": width,
                    "column": cols.index("ID"),
                },
            }
    # Convolutional encoder: maxout with 2+ pieces, Mish otherwise.
    if cnn_maxout_pieces >= 2:
        cnn_cfg = {
            "arch": "spacy.MaxoutWindowEncoder.v1",
            "config": {
                "width": width,
                "window_size": conv_window,
                "pieces": cnn_maxout_pieces,
                "depth": conv_depth,
            },
        }
    else:
        cnn_cfg = {
            "arch": "spacy.MishWindowEncoder.v1",
            "config": {"width": width, "window_size": conv_window, "depth": conv_depth},
        }
    bilstm_cfg = {
        "arch": "spacy.TorchBiLSTMEncoder.v1",
        "config": {"width": width, "depth": bilstm_depth},
    }
    # Choose the encoder: none, CNN followed by BiLSTM, CNN only, or BiLSTM only.
    if conv_depth == 0 and bilstm_depth == 0:
        encode_cfg = {}
    elif conv_depth >= 1 and bilstm_depth >= 1:
        encode_cfg = {
            "arch": "thinc.FeedForward.v1",
            "config": {"children": [cnn_cfg, bilstm_cfg]},
        }
    elif conv_depth >= 1:
        encode_cfg = cnn_cfg
    else:
        encode_cfg = bilstm_cfg
    config = {"@doc2feats": doc2feats_cfg, "@embed": embed_cfg, "@encode": encode_cfg}
    return new_ml.Tok2Vec(config)
|
||||
|
||||
|
||||
def reapply(layer, n_times):
    """Wrap `layer` so that it is applied `n_times` in a row.

    Each application feeds on the previous output. The backward pass runs
    the stored callbacks in reverse order and returns the running sum of
    the gradients they produce.
    """

    def reapply_fwd(X, drop=0.0):
        callbacks = []
        for _ in range(n_times):
            Y, callback = layer.begin_update(X, drop=drop)
            X = Y
            callbacks.append(callback)

        def reapply_bwd(dY, sgd=None):
            total = None
            for callback in callbacks[::-1]:
                dY = callback(dY, sgd=sgd)
                if total is not None:
                    total += dY
                else:
                    total = dY
            return total

        return Y, reapply_bwd

    return wrap(reapply_fwd, layer)
|
||||
|
||||
|
||||
def asarray(ops, dtype):
    """Create a layer that converts its input with `ops.asarray(X, dtype=dtype)`.

    The layer has no backprop callback.
    """

    def forward(X, drop=0.0):
        converted = ops.asarray(X, dtype=dtype)
        return converted, None

    return layerize(forward)
|
||||
|
||||
|
||||
def _divide_array(X, size):
|
||||
parts = []
|
||||
index = 0
|
||||
while index < len(X):
|
||||
parts.append(X[index : index + size])
|
||||
index += size
|
||||
return parts
|
||||
|
||||
|
||||
def get_col(idx):
    """Create a layer that selects column `idx` from a 2d array.

    RAISES: IndexError (Errors.E066) if `idx` is negative.

    The backward pass allocates an array shaped like the input and adds
    the incoming gradient into column `idx`.
    """
    if idx < 0:
        raise IndexError(Errors.E066.format(value=idx))

    def forward(X, drop=0.0):
        # numpy input -> NumpyOps; anything else is handled by CupyOps.
        ops = NumpyOps() if isinstance(X, numpy.ndarray) else CupyOps()
        column = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype)

        def backward(y, sgd=None):
            grad = ops.allocate(X.shape)
            grad[:, idx] += y
            return grad

        return column, backward

    return layerize(forward)
|
||||
|
||||
|
||||
def doc2feats(cols=None):
    """Create a layer that maps each doc to `doc.to_array(cols)`.

    cols: Attribute IDs to extract. Defaults to
        [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH].
    RETURNS: The layer, with the columns stored on it as `cols`.
    """
    if cols is None:
        cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]

    def forward(docs, drop=0.0):
        return [doc.to_array(cols) for doc in docs], None

    layer = layerize(forward)
    layer.cols = cols
    return layer
|
||||
|
||||
|
||||
def print_shape(prefix):
    """Create a pass-through layer.

    Input and gradient are both returned unchanged; `prefix` is not used
    by the current implementation.
    """

    def forward(X, drop=0.0):
        def passthrough(dX, **kwargs):
            return dX

        return X, passthrough

    return layerize(forward)
|
||||
|
||||
|
||||
@layerize
def get_token_vectors(tokens_attrs_vectors, drop=0.0):
    """Pick the vectors out of a `(tokens, attrs, vectors)` triple.

    The backward pass pairs the tokens with the incoming gradient.
    """
    tokens, attrs, vectors = tokens_attrs_vectors

    def pair_with_tokens(d_output, sgd=None):
        return (tokens, d_output)

    return vectors, pair_with_tokens
|
||||
|
||||
|
||||
@layerize
def logistic(X, drop=0.0):
    """Apply the logistic function element-wise.

    Non-array inputs are converted first. Values are clipped in place to
    the range [-10, 10] before the function is applied.
    """
    xp = get_array_module(X)
    if not isinstance(X, xp.ndarray):
        X = xp.asarray(X)
    # Clip to range (-10, 10), writing the result back into X itself.
    X = xp.minimum(X, 10.0, out=X)
    X = xp.maximum(X, -10.0, out=X)
    Y = 1.0 / (1.0 + xp.exp(-X))

    def logistic_bwd(dY, sgd=None):
        return dY * (Y * (1 - Y))

    return Y, logistic_bwd
|
||||
|
||||
|
||||
def zero_init(model):
    """Register an on-data hook that fills the model's weights with zeros.

    Nothing is zeroed at registration time; the hook does the work when it
    is invoked. Returns the same model object.
    """

    def _fill_with_zeros(self, X, y):
        self.W.fill(0)

    hooks = model.on_data_hooks
    hooks.append(_fill_with_zeros)
    return model
|
||||
|
||||
|
||||
def getitem(i):
    """Create a layer that returns `X[i]` for its input `X`.

    The layer has no backprop callback.
    """

    def getitem_fwd(X, drop=0.0):
        selected = X[i]
        return selected, None

    return layerize(getitem_fwd)
|
||||
|
||||
|
||||
@describe.attributes(
    W=Synapses("Weights matrix", lambda obj: (obj.nO, obj.nI), lambda W, ops: None)
)
class MultiSoftmax(Affine):
    """Neural network layer that predicts several multi-class attributes at once.
    For instance, we might predict one class with 6 variables, and another with 5.
    We predict the 11 neurons required for this, and then softmax them such
    that columns 0-6 make a probability distribution and columns 6-11 make another.
    """

    name = "multisoftmax"

    def __init__(self, out_sizes, nI=None, **kwargs):
        """Store the per-attribute output sizes; nO is their sum."""
        Model.__init__(self, **kwargs)
        self.out_sizes = out_sizes
        self.nO = sum(out_sizes)
        self.nI = nI

    def predict(self, input__BI):
        """Apply the affine transform, then softmax each output group in place."""
        output__BO = self.ops.affine(self.W, self.b, input__BI)
        # `i` is the first column of the current group.
        i = 0
        for out_size in self.out_sizes:
            self.ops.softmax(output__BO[:, i : i + out_size], inplace=True)
            i += out_size
        return output__BO

    def begin_update(self, input__BI, drop=0.0):
        """Run `predict` and return the output with a backprop callback.

        The callback accumulates into d_W and d_b, optionally calls `sgd`,
        and returns the gradient with respect to the input.
        """
        output__BO = self.predict(input__BI)

        def finish_update(grad__BO, sgd=None):
            self.d_W += self.ops.gemm(grad__BO, input__BI, trans1=True)
            self.d_b += grad__BO.sum(axis=0)
            grad__BI = self.ops.gemm(grad__BO, self.W)
            if sgd is not None:
                sgd(self._mem.weights, self._mem.gradient, key=self.id)
            return grad__BI

        return output__BO, finish_update
|
||||
|
||||
|
||||
def build_tagger_model(nr_class, **cfg):
    """Build the tagger network: a tok2vec layer followed by a softmax.

    nr_class: Number of output classes of the softmax.
    **cfg: Optional "token_vector_width", "pretrained_vectors",
        "subword_features" and a ready-made "tok2vec" layer.
    RETURNS: The chained model, with `tok2vec` and `softmax` attached as
        attributes and `nI` set to None.
    """
    embed_size = util.env_opt("embed_size", 2000)
    token_vector_width = (
        cfg["token_vector_width"]
        if "token_vector_width" in cfg
        else util.env_opt("token_vector_width", 96)
    )
    pretrained_vectors = cfg.get("pretrained_vectors")
    subword_features = cfg.get("subword_features", True)
    with Model.define_operators({">>": chain, "+": add}):
        if "tok2vec" not in cfg:
            tok2vec = Tok2Vec(
                token_vector_width,
                embed_size,
                subword_features=subword_features,
                pretrained_vectors=pretrained_vectors,
            )
        else:
            tok2vec = cfg["tok2vec"]
        softmax = with_flatten(Softmax(nr_class, token_vector_width))
        model = tok2vec >> softmax
    model.nI = None
    model.tok2vec = tok2vec
    model.softmax = softmax
    return model
|
||||
|
||||
|
||||
def build_morphologizer_model(class_nums, **cfg):
    """Build the morphologizer network: tok2vec followed by a MultiSoftmax.

    class_nums: Output sizes passed to MultiSoftmax, one per attribute.
    **cfg: Optional "token_vector_width", "pretrained_vectors",
        "char_embed" (default True) and a ready-made "tok2vec" layer.
    RETURNS: The chained model, with `tok2vec` and `softmax` attached as
        attributes and `nI` set to None.
    """
    embed_size = util.env_opt("embed_size", 7000)
    token_vector_width = (
        cfg["token_vector_width"]
        if "token_vector_width" in cfg
        else util.env_opt("token_vector_width", 128)
    )
    pretrained_vectors = cfg.get("pretrained_vectors")
    char_embed = cfg.get("char_embed", True)
    with Model.define_operators({">>": chain, "+": add, "**": clone}):
        if "tok2vec" not in cfg:
            tok2vec = Tok2Vec(
                token_vector_width,
                embed_size,
                char_embed=char_embed,
                pretrained_vectors=pretrained_vectors,
            )
        else:
            tok2vec = cfg["tok2vec"]
        softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width))
        # The wrapper layer also carries the group sizes.
        softmax.out_sizes = class_nums
        model = tok2vec >> softmax
    model.nI = None
    model.tok2vec = tok2vec
    model.softmax = softmax
    return model
|
||||
|
||||
|
||||
@layerize
def SpacyVectors(docs, drop=0.0):
    """Look up the vocab vectors for every token of every doc.

    Tokens whose `orth` is missing from `key2row` use row 0. Returns one
    array of vectors per doc and no backprop callback.
    """
    batch = []
    for doc in docs:
        rows = numpy.zeros((len(doc),), dtype="i")
        for position, word in enumerate(doc):
            key2row = doc.vocab.vectors.key2row
            rows[position] = key2row[word.orth] if word.orth in key2row else 0
        batch.append(doc.vocab.vectors.data[rows])
    return batch, None
|
||||
|
||||
|
||||
def build_text_classifier(nr_class, width=64, **cfg):
    """Build the ensemble text classifier.

    nr_class: Number of output classes.
    width: Hidden width used throughout the network.
    **cfg: Reads "depth" (2), "nr_vector" (5000), "pretrained_dims" (0),
        "low_data", "ngram_size" (1) and "exclusive_classes" (False).
    RETURNS: With "low_data" and pretrained dims, a small attention model
        over the static vectors. Otherwise the concatenation of a
        bag-of-words model and a CNN model, followed by an output layer.
    """
    depth = cfg.get("depth", 2)
    nr_vector = cfg.get("nr_vector", 5000)
    pretrained_dims = cfg.get("pretrained_dims", 0)
    with Model.define_operators({">>": chain, "+": add, "|": concatenate, "**": clone}):
        if cfg.get("low_data") and pretrained_dims:
            model = (
                SpacyVectors
                >> flatten_add_lengths
                >> with_getitem(0, Affine(width, pretrained_dims))
                >> ParametricAttention(width)
                >> Pooling(sum_pool)
                >> Residual(ReLu(width, width)) ** 2
                >> zero_init(Affine(nr_class, width, drop_factor=0.0))
                >> logistic
            )
            return model

        # Column numbers index into the FeatureExtracter attribute list below.
        lower = HashEmbed(width, nr_vector, column=1)
        prefix = HashEmbed(width // 2, nr_vector, column=2)
        suffix = HashEmbed(width // 2, nr_vector, column=3)
        shape = HashEmbed(width // 2, nr_vector, column=4)

        trained_vectors = FeatureExtracter(
            [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
        ) >> with_flatten(
            uniqued(
                (lower | prefix | suffix | shape)
                >> LN(Maxout(width, width + (width // 2) * 3)),
                column=0,
            )
        )

        if pretrained_dims:
            static_vectors = SpacyVectors >> with_flatten(
                Affine(width, pretrained_dims)
            )
            # TODO Make concatenate support lists
            vectors = concatenate_lists(trained_vectors, static_vectors)
            vectors_width = width * 2
        else:
            vectors = trained_vectors
            vectors_width = width
            static_vectors = None
        tok2vec = vectors >> with_flatten(
            LN(Maxout(width, vectors_width))
            >> Residual((ExtractWindow(nW=1) >> LN(Maxout(width, width * 3)))) ** depth,
            pad=depth,
        )
        cnn_model = (
            tok2vec
            >> flatten_add_lengths
            >> ParametricAttention(width)
            >> Pooling(sum_pool)
            >> Residual(zero_init(Maxout(width, width)))
            >> zero_init(Affine(nr_class, width, drop_factor=0.0))
        )

        linear_model = build_bow_text_classifier(
            nr_class,
            ngram_size=cfg.get("ngram_size", 1),
            exclusive_classes=cfg.get("exclusive_classes", False),
        )
        # The output layer takes nr_class * 2 inputs: the concatenated
        # outputs of the linear and the CNN model.
        if cfg.get("exclusive_classes", False):
            output_layer = Softmax(nr_class, nr_class * 2)
        else:
            output_layer = (
                zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0)) >> logistic
            )
        model = (linear_model | cnn_model) >> output_layer
        model.tok2vec = chain(tok2vec, flatten)
    model.nO = nr_class
    model.lsuv = False
    return model
|
||||
|
||||
|
||||
def build_bow_text_classifier(
    nr_class, ngram_size=1, exclusive_classes=False, no_output_layer=False, **cfg
):
    """Build a bag-of-n-grams linear text classifier that runs on CPU.

    Unless `no_output_layer` is set, the linear model is followed by
    `cpu_softmax` (exclusive classes) or `logistic` (otherwise). The
    returned model has `nO` set to `nr_class`.
    """
    with Model.define_operators({">>": chain}):
        bow = extract_ngrams(ngram_size, attr=ORTH) >> LinearModel(nr_class)
        model = with_cpu(Model.ops, bow)
        if not no_output_layer:
            if exclusive_classes:
                activation = cpu_softmax
            else:
                activation = logistic
            model = model >> activation
    model.nO = nr_class
    return model
|
||||
|
||||
|
||||
@layerize
def cpu_softmax(X, drop=0.0):
    """Apply softmax using NumpyOps.

    The backward pass returns the incoming gradient unchanged.
    """
    ops = NumpyOps()

    def cpu_softmax_backward(dY, sgd=None):
        return dY

    probabilities = ops.softmax(X)
    return probabilities, cpu_softmax_backward
|
||||
|
||||
|
||||
def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False, **cfg):
    """
    Build a simple CNN text classifier on top of a token-to-vector model.

    The token vectors are mean-pooled and fed to the output layer. With
    exclusive_classes=True the output layer is a softmax, so the outputs
    sum to 1. With exclusive_classes=False a zero-initialised affine layer
    followed by a logistic non-linearity is used instead, so each output
    lies in the range [0, 1].
    """
    with Model.define_operators({">>": chain}):
        if not exclusive_classes:
            output_layer = (
                zero_init(Affine(nr_class, tok2vec.nO, drop_factor=0.0)) >> logistic
            )
        else:
            output_layer = Softmax(nr_class, tok2vec.nO)
        model = tok2vec >> flatten_add_lengths >> Pooling(mean_pool) >> output_layer
    model.tok2vec = chain(tok2vec, flatten)
    model.nO = nr_class
    return model
|
||||
|
||||
|
||||
def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg):
    """Build the context encoder for named entity linking.

    **cfg: Must contain "entity_width" (the output width). Also reads
        "conv_depth" (2), "cnn_maxout_pieces" (3) and "pretrained_vectors".
    RAISES: ValueError (Errors.E144) if "entity_width" is missing.
    RETURNS: The model, with `tok2vec` attached and `nO` set to the
        entity width.
    """
    if "entity_width" not in cfg:
        raise ValueError(Errors.E144.format(param="entity_width"))

    conv_depth = cfg.get("conv_depth", 2)
    cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3)
    pretrained_vectors = cfg.get("pretrained_vectors", None)
    context_width = cfg.get("entity_width")

    tok2vec_settings = {
        "width": hidden_width,
        "embed_size": embed_width,
        "pretrained_vectors": pretrained_vectors,
        "cnn_maxout_pieces": cnn_maxout_pieces,
        "subword_features": True,
        "conv_depth": conv_depth,
        "bilstm_depth": 0,
    }
    with Model.define_operators({">>": chain, "**": clone}):
        # context encoder
        tok2vec = Tok2Vec(**tok2vec_settings)

        model = (
            tok2vec
            >> flatten_add_lengths
            >> Pooling(mean_pool)
            >> Residual(zero_init(Maxout(hidden_width, hidden_width)))
            >> zero_init(Affine(context_width, hidden_width, drop_factor=0.0))
        )

        model.tok2vec = tok2vec
        model.nO = context_width
    return model
|
||||
|
||||
|
||||
@layerize
def flatten(seqs, drop=0.0):
    """Flatten a batch of sequences into a single array.

    The backward pass unflattens the gradient again, using the sequence
    lengths recorded during the forward pass.
    """
    ops = Model.ops
    seq_lengths = ops.asarray([len(item) for item in seqs], dtype="i")

    def restore_sequences(d_flat, sgd=None):
        return ops.unflatten(d_flat, seq_lengths, pad=0)

    flat = ops.flatten(seqs, pad=0)
    return flat, restore_sequences
|
||||
|
||||
|
||||
def concatenate_lists(*layers, **kwargs):  # pragma: no cover
    """Compose two or more models `f`, `g`, etc. whose outputs are lists of
    sequences, so that their outputs are concatenated per sequence.

    Each layer is chained with `flatten`, the flat outputs are joined with
    `concatenate`, and the result is unflattened again by input lengths.
    Without any layers a no-op is returned. The keyword "drop_factor"
    (default 1.0) scales the dropout rate.
    """
    if not layers:
        return noop()
    drop_factor = kwargs.get("drop_factor", 1.0)
    ops = layers[0].ops
    flat_layers = [chain(layer, flatten) for layer in layers]
    concat = concatenate(*flat_layers)

    def concatenate_lists_fwd(Xs, drop=0.0):
        if drop is not None:
            drop *= drop_factor
        seq_lengths = ops.asarray([len(X) for X in Xs], dtype="i")
        flat_out, flat_backprop = concat.begin_update(Xs, drop=drop)

        def concatenate_lists_bwd(d_ys, sgd=None):
            return flat_backprop(ops.flatten(d_ys), sgd=sgd)

        return ops.unflatten(flat_out, seq_lengths), concatenate_lists_bwd

    return wrap(concatenate_lists_fwd, concat)
|
||||
|
||||
|
||||
def masked_language_model(vocab, model, mask_prob=0.15):
    """Convert a model into a BERT-style masked language model.

    On every forward pass the docs are re-created with some words replaced
    (see `_apply_mask`). On the backward pass the gradient is multiplied in
    place by `1 - mask`, which zeroes it wherever the mask is set.
    """
    word_source = _RandomWords(vocab)

    def mlm_forward(docs, drop=0.0):
        mask, docs = _apply_mask(docs, word_source, mask_prob=mask_prob)
        # Column vector, so that it broadcasts over the output width.
        mask = model.ops.asarray(mask).reshape((mask.shape[0], 1))
        output, inner_backprop = model.begin_update(docs, drop=drop)

        def mlm_backward(d_output, sgd=None):
            d_output *= 1 - mask
            return inner_backprop(d_output, sgd=sgd)

        return output, mlm_backward

    return wrap(mlm_forward, model)
|
||||
|
||||
|
||||
class _RandomWords(object):
|
||||
def __init__(self, vocab):
|
||||
self.words = [lex.text for lex in vocab if lex.prob != 0.0]
|
||||
self.probs = [lex.prob for lex in vocab if lex.prob != 0.0]
|
||||
self.words = self.words[:10000]
|
||||
self.probs = self.probs[:10000]
|
||||
self.probs = numpy.exp(numpy.array(self.probs, dtype="f"))
|
||||
self.probs /= self.probs.sum()
|
||||
self._cache = []
|
||||
|
||||
def next(self):
|
||||
if not self._cache:
|
||||
self._cache.extend(
|
||||
numpy.random.choice(len(self.words), 10000, p=self.probs)
|
||||
)
|
||||
index = self._cache.pop()
|
||||
return self.words[index]
|
||||
|
||||
|
||||
def _apply_mask(docs, random_words, mask_prob=0.15):
    """Create copies of `docs` in which some words are replaced.

    One uniform random number is drawn per token; the mask entry is True
    where the draw is >= `mask_prob`. Tokens with a False entry go through
    `_replace_word`; the others keep their text.

    RETURNS: `(mask, masked_docs)`. The mask covers all tokens of all docs
        in order; the docs are newly built Doc objects.
    """
    # This needs to be here to avoid circular imports
    from .tokens.doc import Doc

    total_tokens = sum(len(doc) for doc in docs)
    mask = numpy.random.uniform(0.0, 1.0, (total_tokens,)) >= mask_prob
    position = 0
    masked_docs = []
    for doc in docs:
        words = []
        for token in doc:
            if mask[position]:
                words.append(token.text)
            else:
                words.append(_replace_word(token.text, random_words))
            position += 1
        spaces = [bool(token.whitespace_) for token in doc]
        # NB: If you change this implementation to instead modify
        # the docs in place, take care that the IDs reflect the original
        # words. Currently we use the original docs to make the vectors
        # for the target, so we don't lose the original tokens. But if
        # you modified the docs in place here, you would.
        masked_docs.append(Doc(doc.vocab, words=words, spaces=spaces))
    return mask, masked_docs
|
||||
|
||||
|
||||
def _replace_word(word, random_words, mask="[MASK]"):
|
||||
roll = numpy.random.random()
|
||||
if roll < 0.8:
|
||||
return mask
|
||||
elif roll < 0.9:
|
||||
return random_words.next()
|
||||
else:
|
||||
return word
|
||||
|
||||
|
||||
def _uniform_init(lo, hi):
    """Create an initializer that fills a weights array with values drawn
    uniformly between `lo` and `hi`.

    The returned callable takes `(W, ops)` and copies the samples into `W`.
    """

    def wrapped(W, ops):
        samples = ops.xp.random.uniform(lo, hi, W.shape)
        copy_array(W, samples)

    return wrapped
|
||||
|
||||
|
||||
@describe.attributes(
    nM=Dimension("Vector dimensions"),
    nC=Dimension("Number of characters per word"),
    vectors=Synapses(
        "Embed matrix", lambda obj: (obj.nC, obj.nV, obj.nM), _uniform_init(-0.1, 0.1)
    ),
    d_vectors=Gradient("vectors"),
)
class CharacterEmbed(Model):
    """Embed each token from `nC` per-token ids, one table per position."""

    def __init__(self, nM=None, nC=None, **kwargs):
        """Store the vector size per character (nM) and characters per word (nC)."""
        Model.__init__(self, **kwargs)
        self.nM = nM
        self.nC = nC

    @property
    def nO(self):
        """Output width: nC vectors of width nM per token."""
        return self.nM * self.nC

    @property
    def nV(self):
        """Number of rows in each per-position embedding table."""
        return 256

    def begin_update(self, docs, drop=0.0):
        """Embed every token of every doc.

        RETURNS: `(output, backprop)`, where `output` holds one
            (len(doc), nO) array per doc. The backprop callback accumulates
            into `d_vectors` and returns None.
        """
        # NOTE(review): this early exit returns a bare list, not an
        # (output, backprop) pair like the normal path -- confirm callers
        # never unpack the result for an empty batch.
        if not docs:
            return []
        ids = []
        output = []
        weights = self.vectors
        # This assists in indexing; it's like looping over this dimension.
        # Still consider this weird witch craft...But thanks to Mark Neumann
        # for the tip.
        nCv = self.ops.xp.arange(self.nC)
        for doc in docs:
            doc_ids = doc.to_utf8_array(nr_char=self.nC)
            doc_vectors = self.ops.allocate((len(doc), self.nC, self.nM))
            # Let's say I have a 2d array of indices, and a 3d table of data. What numpy
            # incantation do I chant to get
            # output[i, j, k] == data[j, ids[i, j], k]?
            doc_vectors[:, nCv] = weights[nCv, doc_ids[:, nCv]]
            output.append(doc_vectors.reshape((len(doc), self.nO)))
            # Keep the ids so the backward pass can address the same rows.
            ids.append(doc_ids)

        def backprop_character_embed(d_vectors, sgd=None):
            gradient = self.d_vectors
            for doc_ids, d_doc_vectors in zip(ids, d_vectors):
                d_doc_vectors = d_doc_vectors.reshape((len(doc_ids), self.nC, self.nM))
                gradient[nCv, doc_ids[:, nCv]] += d_doc_vectors[:, nCv]
            if sgd is not None:
                sgd(self._mem.weights, self._mem.gradient, key=self.id)
            return None

        return output, backprop_character_embed
|
||||
|
||||
|
||||
def get_cossim_loss(yh, y, ignore_zeros=False):
    """Compute a cosine-similarity loss and its gradient with respect to `yh`.

    The per-row loss is `abs(cosine - 1)` and the returned scalar is the sum
    over rows. Derivative reference:
    https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity

    yh: Predicted vectors, one per row.
    y: Target vectors, one per row.
    ignore_zeros (bool): If set, rows whose target is all zeros contribute
        neither loss nor gradient.
    RETURNS (tuple): `(loss, gradient)` where the gradient is the negated
        derivative of the cosine with respect to `yh`.
    """
    xp = get_array_module(yh)
    # Zero targets have to be located before the shift below hides them.
    if ignore_zeros:
        zero_rows = xp.abs(y).sum(axis=1) == 0
    # Nudge both inputs by a small constant so no row is exactly zero.
    guess = yh + 1e-8
    truth = y + 1e-8
    guess_norm = xp.linalg.norm(guess, axis=1, keepdims=True)
    truth_norm = xp.linalg.norm(truth, axis=1, keepdims=True)
    norm_product = guess_norm * truth_norm
    cosine = (guess * truth).sum(axis=1, keepdims=True) / norm_product
    d_guess = (truth / norm_product) - (cosine * (guess / guess_norm ** 2))
    row_losses = xp.abs(cosine - 1)
    if ignore_zeros:
        # Rows with a zero target count for nothing in loss or gradient.
        d_guess[zero_rows] = 0
        row_losses[zero_rows] = 0
    return row_losses.sum(), -d_guess
|
|
@ -1,6 +1,6 @@
|
|||
# fmt: off
|
||||
__title__ = "spacy"
|
||||
__version__ = "2.3.0"
|
||||
__version__ = "3.0.0.dev9"
|
||||
__release__ = True
|
||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||
|
|
|
@ -91,6 +91,7 @@ cdef enum attr_id_t:
|
|||
|
||||
LANG
|
||||
ENT_KB_ID = symbols.ENT_KB_ID
|
||||
MORPH
|
||||
ENT_ID = symbols.ENT_ID
|
||||
|
||||
IDX
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
IDS = {
|
||||
"": NULL_ATTR,
|
||||
|
@ -92,6 +89,7 @@ IDS = {
|
|||
"SPACY": SPACY,
|
||||
"PROB": PROB,
|
||||
"LANG": LANG,
|
||||
"MORPH": MORPH,
|
||||
"IDX": IDX
|
||||
}
|
||||
|
||||
|
|
|
@ -1,12 +1,20 @@
|
|||
from wasabi import msg
|
||||
|
||||
from .download import download # noqa: F401
|
||||
from .info import info # noqa: F401
|
||||
from .link import link # noqa: F401
|
||||
from .package import package # noqa: F401
|
||||
from .profile import profile # noqa: F401
|
||||
from .train import train # noqa: F401
|
||||
from .train_from_config import train_cli # noqa: F401
|
||||
from .pretrain import pretrain # noqa: F401
|
||||
from .debug_data import debug_data # noqa: F401
|
||||
from .evaluate import evaluate # noqa: F401
|
||||
from .convert import convert # noqa: F401
|
||||
from .init_model import init_model # noqa: F401
|
||||
from .validate import validate # noqa: F401
|
||||
|
||||
|
||||
def link(*args, **kwargs):
    """Stand-in for the removed command: warn that symlinks are deprecated.

    All positional and keyword arguments are accepted and ignored.
    """
    notice = (
        "As of spaCy v3.0, model symlinks are deprecated. You can load models "
        "using their full names or from a directory path."
    )
    msg.warn(notice)
|
||||
|
|
|
@ -1,220 +0,0 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
# NB: This schema describes the new format of the training data, see #2928
# JSON schema for a list of training examples. Each example is an object
# that must have a "text" and may carry entity spans, sentence spans, text
# categories, token annotations and a free-form "_" user space.
TRAINING_SCHEMA = {
    "$schema": "http://json-schema.org/draft-06/schema",
    "title": "Training data for spaCy models",
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "text": {
                "title": "The text of the training example",
                "type": "string",
                "minLength": 1,
            },
            "ents": {
                "title": "Named entity spans in the text",
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "start": {
                            "title": "Start character offset of the span",
                            "type": "integer",
                            "minimum": 0,
                        },
                        "end": {
                            "title": "End character offset of the span",
                            "type": "integer",
                            "minimum": 0,
                        },
                        "label": {
                            "title": "Entity label",
                            "type": "string",
                            "minLength": 1,
                            "pattern": "^[A-Z0-9]*$",
                        },
                    },
                    "required": ["start", "end", "label"],
                },
            },
            "sents": {
                "title": "Sentence spans in the text",
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "start": {
                            "title": "Start character offset of the span",
                            "type": "integer",
                            "minimum": 0,
                        },
                        "end": {
                            "title": "End character offset of the span",
                            "type": "integer",
                            "minimum": 0,
                        },
                    },
                    "required": ["start", "end"],
                },
            },
            "cats": {
                "title": "Text categories for the text classifier",
                "type": "object",
                # The keys of "patternProperties" are regular expressions. A
                # bare "*" is not a valid one (nothing to repeat), so ".*" is
                # used to match every property name.
                "patternProperties": {
                    ".*": {
                        "title": "A text category",
                        "oneOf": [
                            {"type": "boolean"},
                            {"type": "number", "minimum": 0},
                        ],
                    }
                },
                "propertyNames": {"pattern": "^[A-Z0-9]*$", "minLength": 1},
            },
            "tokens": {
                "title": "The tokens in the text",
                "type": "array",
                "items": {
                    "type": "object",
                    "minProperties": 1,
                    "properties": {
                        "id": {
                            "title": "Token ID, usually token index",
                            "type": "integer",
                            "minimum": 0,
                        },
                        "start": {
                            "title": "Start character offset of the token",
                            "type": "integer",
                            "minimum": 0,
                        },
                        "end": {
                            "title": "End character offset of the token",
                            "type": "integer",
                            "minimum": 0,
                        },
                        "pos": {
                            "title": "Coarse-grained part-of-speech tag",
                            "type": "string",
                            "minLength": 1,
                        },
                        "tag": {
                            "title": "Fine-grained part-of-speech tag",
                            "type": "string",
                            "minLength": 1,
                        },
                        "dep": {
                            "title": "Dependency label",
                            "type": "string",
                            "minLength": 1,
                        },
                        "head": {
                            "title": "Index of the token's head",
                            "type": "integer",
                            "minimum": 0,
                        },
                    },
                    "required": ["start", "end"],
                },
            },
            "_": {"title": "Custom user space", "type": "object"},
        },
        "required": ["text"],
    },
}
|
||||
|
||||
# JSON schema for a model's meta data. "lang", "name" and "version" are
# required; every other property is optional.
META_SCHEMA = {
    "$schema": "http://json-schema.org/draft-06/schema",
    "type": "object",
    "properties": {
        "lang": {
            "title": "Two-letter language code, e.g. 'en'",
            "type": "string",
            "minLength": 2,
            "maxLength": 2,
            "pattern": "^[a-z]*$",
        },
        "name": {
            "title": "Model name",
            "type": "string",
            "minLength": 1,
            "pattern": "^[a-z_]*$",
        },
        "version": {
            "title": "Model version",
            "type": "string",
            "minLength": 1,
            "pattern": "^[0-9a-z.-]*$",
        },
        "spacy_version": {
            "title": "Compatible spaCy version identifier",
            "type": "string",
            "minLength": 1,
            # NOTE(review): inside the character class, ".->" is read as a
            # range from "." to ">", so a literal "-" is not accepted here
            # (unlike in the "version" pattern above) -- confirm the intent.
            "pattern": "^[0-9a-z.-><=]*$",
        },
        "parent_package": {
            "title": "Name of parent spaCy package, e.g. spacy or spacy-nightly",
            "type": "string",
            "minLength": 1,
            "default": "spacy",
        },
        "pipeline": {
            "title": "Names of pipeline components",
            "type": "array",
            "items": {"type": "string", "minLength": 1},
        },
        "description": {"title": "Model description", "type": "string"},
        "license": {"title": "Model license", "type": "string"},
        "author": {"title": "Model author name", "type": "string"},
        "email": {"title": "Model author email", "type": "string", "format": "email"},
        "url": {"title": "Model author URL", "type": "string", "format": "uri"},
        "sources": {
            "title": "Training data sources",
            "type": "array",
            "items": {"type": "string"},
        },
        "vectors": {
            "title": "Included word vectors",
            "type": "object",
            "properties": {
                "keys": {
                    "title": "Number of unique keys",
                    "type": "integer",
                    "minimum": 0,
                },
                "vectors": {
                    "title": "Number of unique vectors",
                    "type": "integer",
                    "minimum": 0,
                },
                "width": {
                    "title": "Number of dimensions",
                    "type": "integer",
                    "minimum": 0,
                },
            },
        },
        "accuracy": {
            "title": "Accuracy numbers",
            "type": "object",
            # "patternProperties" keys are regular expressions; a bare "*" is
            # not a valid one, so ".*" is used to match every property name.
            "patternProperties": {".*": {"type": "number", "minimum": 0.0}},
        },
        "speed": {
            "title": "Speed evaluation numbers",
            "type": "object",
            "patternProperties": {
                ".*": {
                    # "anyOf" rather than "oneOf": an integer satisfies both
                    # the "number" and the "integer" branch, and "oneOf"
                    # rejects values that match more than one branch.
                    "anyOf": [
                        {"type": "number", "minimum": 0.0},
                        {"type": "integer", "minimum": 0},
                    ]
                }
            },
        },
    },
    "required": ["lang", "name", "version"],
}
|
|
@ -1,7 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
from pathlib import Path
|
||||
from wasabi import Printer
|
||||
import srsly
|
||||
|
@ -29,27 +25,20 @@ FILE_TYPES = ("json", "jsonl", "msg")
|
|||
FILE_TYPES_STDOUT = ("json", "jsonl")
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
input_file=("Input file", "positional", None, str),
|
||||
output_dir=("Output directory. '-' for stdout.", "positional", None, str),
|
||||
file_type=("Type of data to produce: {}".format(FILE_TYPES), "option", "t", str),
|
||||
n_sents=("Number of sentences per doc (0 to disable)", "option", "n", int),
|
||||
seg_sents=("Segment sentences (for -c ner)", "flag", "s"),
|
||||
model=("Model for sentence segmentation (for -s)", "option", "b", str),
|
||||
converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str),
|
||||
lang=("Language (if tokenizer required)", "option", "l", str),
|
||||
morphology=("Enable appending morphology to tags", "flag", "m", bool),
|
||||
)
|
||||
def convert(
|
||||
input_file,
|
||||
output_dir="-",
|
||||
file_type="json",
|
||||
n_sents=1,
|
||||
seg_sents=False,
|
||||
model=None,
|
||||
morphology=False,
|
||||
converter="auto",
|
||||
lang=None,
|
||||
# fmt: off
|
||||
input_file: ("Input file", "positional", None, str),
|
||||
output_dir: ("Output directory. '-' for stdout.", "positional", None, str) = "-",
|
||||
file_type: (f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES) = "json",
|
||||
n_sents: ("Number of sentences per doc (0 to disable)", "option", "n", int) = 1,
|
||||
seg_sents: ("Segment sentences (for -c ner)", "flag", "s") = False,
|
||||
model: ("Model for sentence segmentation (for -s)", "option", "b", str) = None,
|
||||
morphology: ("Enable appending morphology to tags", "flag", "m", bool) = False,
|
||||
merge_subtokens: ("Merge CoNLL-U subtokens", "flag", "T", bool) = False,
|
||||
converter: (f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str) = "auto",
|
||||
ner_map_path: ("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path) = None,
|
||||
lang: ("Language (if tokenizer required)", "option", "l", str) = None,
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Convert files into JSON format for use with train command and other
|
||||
|
@ -60,16 +49,10 @@ def convert(
|
|||
no_print = output_dir == "-"
|
||||
msg = Printer(no_print=no_print)
|
||||
input_path = Path(input_file)
|
||||
if file_type not in FILE_TYPES:
|
||||
msg.fail(
|
||||
"Unknown file type: '{}'".format(file_type),
|
||||
"Supported file types: '{}'".format(", ".join(FILE_TYPES)),
|
||||
exits=1,
|
||||
)
|
||||
if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
|
||||
# TODO: support msgpack via stdout in srsly?
|
||||
msg.fail(
|
||||
"Can't write .{} data to stdout.".format(file_type),
|
||||
f"Can't write .{file_type} data to stdout",
|
||||
"Please specify an output directory.",
|
||||
exits=1,
|
||||
)
|
||||
|
@ -93,21 +76,26 @@ def convert(
|
|||
"Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert"
|
||||
)
|
||||
if converter not in CONVERTERS:
|
||||
msg.fail("Can't find converter for {}".format(converter), exits=1)
|
||||
msg.fail(f"Can't find converter for {converter}", exits=1)
|
||||
ner_map = None
|
||||
if ner_map_path is not None:
|
||||
ner_map = srsly.read_json(ner_map_path)
|
||||
# Use converter function to convert data
|
||||
func = CONVERTERS[converter]
|
||||
data = func(
|
||||
input_data,
|
||||
n_sents=n_sents,
|
||||
seg_sents=seg_sents,
|
||||
use_morphology=morphology,
|
||||
append_morphology=morphology,
|
||||
merge_subtokens=merge_subtokens,
|
||||
lang=lang,
|
||||
model=model,
|
||||
no_print=no_print,
|
||||
ner_map=ner_map,
|
||||
)
|
||||
if output_dir != "-":
|
||||
# Export data to a file
|
||||
suffix = ".{}".format(file_type)
|
||||
suffix = f".{file_type}"
|
||||
output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
|
||||
if file_type == "json":
|
||||
srsly.write_json(output_file, data)
|
||||
|
@ -115,9 +103,7 @@ def convert(
|
|||
srsly.write_jsonl(output_file, data)
|
||||
elif file_type == "msg":
|
||||
srsly.write_msgpack(output_file, data)
|
||||
msg.good(
|
||||
"Generated output file ({} documents): {}".format(len(data), output_file)
|
||||
)
|
||||
msg.good(f"Generated output file ({len(data)} documents): {output_file}")
|
||||
else:
|
||||
# Print to stdout
|
||||
if file_type == "json":
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from wasabi import Printer
|
||||
|
||||
from ...gold import iob_to_biluo
|
||||
|
@ -64,9 +61,9 @@ def conll_ner2json(
|
|||
# sentence segmentation required for document segmentation
|
||||
if n_sents > 0 and not seg_sents:
|
||||
msg.warn(
|
||||
"No sentence boundaries found to use with option `-n {}`. "
|
||||
"Use `-s` to automatically segment sentences or `-n 0` "
|
||||
"to disable.".format(n_sents)
|
||||
f"No sentence boundaries found to use with option `-n {n_sents}`. "
|
||||
f"Use `-s` to automatically segment sentences or `-n 0` "
|
||||
f"to disable."
|
||||
)
|
||||
else:
|
||||
n_sents_info(msg, n_sents)
|
||||
|
@ -129,7 +126,7 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
|
|||
if model:
|
||||
nlp = load_model(model)
|
||||
if "parser" in nlp.pipe_names:
|
||||
msg.info("Segmenting sentences with parser from model '{}'.".format(model))
|
||||
msg.info(f"Segmenting sentences with parser from model '{model}'.")
|
||||
sentencizer = nlp.get_pipe("parser")
|
||||
if not sentencizer:
|
||||
msg.info(
|
||||
|
@ -166,7 +163,7 @@ def segment_docs(input_data, n_sents, doc_delimiter):
|
|||
|
||||
|
||||
def n_sents_info(msg, n_sents):
|
||||
msg.info("Grouping every {} sentences into a document.".format(n_sents))
|
||||
msg.info(f"Grouping every {n_sents} sentences into a document.")
|
||||
if n_sents == 1:
|
||||
msg.warn(
|
||||
"To generate better training data, you may want to group "
|
||||
|
|
|
@ -1,141 +1,349 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import re
|
||||
|
||||
from ...gold import iob_to_biluo
|
||||
from ...gold import Example
|
||||
from ...gold import iob_to_biluo, spans_from_biluo_tags, biluo_tags_from_offsets
|
||||
from ...language import Language
|
||||
from ...tokens import Doc, Token
|
||||
from .conll_ner2json import n_sents_info
|
||||
from wasabi import Printer
|
||||
|
||||
|
||||
def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
|
||||
def conllu2json(
|
||||
input_data,
|
||||
n_sents=10,
|
||||
append_morphology=False,
|
||||
lang=None,
|
||||
ner_map=None,
|
||||
merge_subtokens=False,
|
||||
no_print=False,
|
||||
**_
|
||||
):
|
||||
"""
|
||||
Convert conllu files into JSON format for use with train cli.
|
||||
use_morphology parameter enables appending morphology to tags, which is
|
||||
append_morphology parameter enables appending morphology to tags, which is
|
||||
useful for languages such as Spanish, where UD tags are not so rich.
|
||||
|
||||
Extract NER tags if available and convert them so that they follow
|
||||
BILUO and the Wikipedia scheme
|
||||
"""
|
||||
# by @dvsrepo, via #11 explosion/spacy-dev-resources
|
||||
# by @katarkor
|
||||
MISC_NER_PATTERN = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$"
|
||||
msg = Printer(no_print=no_print)
|
||||
n_sents_info(msg, n_sents)
|
||||
docs = []
|
||||
raw = ""
|
||||
sentences = []
|
||||
conll_tuples = read_conllx(input_data, use_morphology=use_morphology)
|
||||
checked_for_ner = False
|
||||
has_ner_tags = False
|
||||
for i, (raw_text, tokens) in enumerate(conll_tuples):
|
||||
sentence, brackets = tokens[0]
|
||||
if not checked_for_ner:
|
||||
has_ner_tags = is_ner(sentence[5][0])
|
||||
checked_for_ner = True
|
||||
sentences.append(generate_sentence(sentence, has_ner_tags))
|
||||
conll_data = read_conllx(
|
||||
input_data,
|
||||
append_morphology=append_morphology,
|
||||
ner_tag_pattern=MISC_NER_PATTERN,
|
||||
ner_map=ner_map,
|
||||
merge_subtokens=merge_subtokens,
|
||||
)
|
||||
has_ner_tags = has_ner(input_data, MISC_NER_PATTERN)
|
||||
for i, example in enumerate(conll_data):
|
||||
raw += example.text
|
||||
sentences.append(
|
||||
generate_sentence(
|
||||
example.token_annotation,
|
||||
has_ner_tags,
|
||||
MISC_NER_PATTERN,
|
||||
ner_map=ner_map,
|
||||
)
|
||||
)
|
||||
# Real-sized documents could be extracted using the comments on the
|
||||
# conluu document
|
||||
# conllu document
|
||||
if len(sentences) % n_sents == 0:
|
||||
doc = create_doc(sentences, i)
|
||||
doc = create_json_doc(raw, sentences, i)
|
||||
docs.append(doc)
|
||||
raw = ""
|
||||
sentences = []
|
||||
if sentences:
|
||||
doc = create_doc(sentences, i)
|
||||
doc = create_json_doc(raw, sentences, i)
|
||||
docs.append(doc)
|
||||
return docs
|
||||
|
||||
|
||||
def is_ner(tag):
|
||||
def has_ner(input_data, ner_tag_pattern):
|
||||
"""
|
||||
Check the 10th column of the first token to determine if the file contains
|
||||
NER tags
|
||||
Check the MISC column for NER tags.
|
||||
"""
|
||||
tag_match = re.match("([A-Z_]+)-([A-Z_]+)", tag)
|
||||
if tag_match:
|
||||
return True
|
||||
elif tag == "O":
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
|
||||
def read_conllx(input_data, use_morphology=False, n=0):
|
||||
i = 0
|
||||
for sent in input_data.strip().split("\n\n"):
|
||||
lines = sent.strip().split("\n")
|
||||
if lines:
|
||||
while lines[0].startswith("#"):
|
||||
lines.pop(0)
|
||||
tokens = []
|
||||
for line in lines:
|
||||
|
||||
parts = line.split("\t")
|
||||
id_, word, lemma, pos, tag, morph, head, dep, _1, iob = parts
|
||||
if "-" in id_ or "." in id_:
|
||||
continue
|
||||
try:
|
||||
id_ = int(id_) - 1
|
||||
head = (int(head) - 1) if head not in ["0", "_"] else id_
|
||||
dep = "ROOT" if dep == "root" else dep
|
||||
tag = pos if tag == "_" else tag
|
||||
tag = tag + "__" + morph if use_morphology else tag
|
||||
iob = iob if iob else "O"
|
||||
tokens.append((id_, word, tag, head, dep, iob))
|
||||
except: # noqa: E722
|
||||
print(line)
|
||||
raise
|
||||
tuples = [list(t) for t in zip(*tokens)]
|
||||
yield (None, [[tuples, []]])
|
||||
i += 1
|
||||
if n >= 1 and i >= n:
|
||||
id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
|
||||
for misc_part in misc.split("|"):
|
||||
if re.match(ner_tag_pattern, misc_part):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def read_conllx(
    input_data,
    append_morphology=False,
    merge_subtokens=False,
    ner_tag_pattern="",
    ner_map=None,
):
    """Yield examples, one for each sentence.

    Sentences are the blocks of `input_data` separated by blank lines.
    Comment lines (starting with "#") at the top of a block are dropped.
    Blocks that are empty or hold nothing but comments are skipped.

    input_data (str): The CoNLL-U formatted text.
    append_morphology (bool): Passed on to `example_from_conllu_sentence`.
    merge_subtokens (bool): Passed on to `example_from_conllu_sentence`.
    ner_tag_pattern (str): Passed on to `example_from_conllu_sentence`.
    ner_map (dict): Passed on to `example_from_conllu_sentence`.
    YIELDS: The value returned by `example_from_conllu_sentence` for each
        sentence.
    """
    vocab = Language.Defaults.create_vocab()  # need vocab to make a minimal Doc
    for sent in input_data.strip().split("\n\n"):
        lines = sent.strip().split("\n")
        # Skip the leading comment lines. The bounds check keeps a block made
        # only of comments from running off the end of the list.
        first = 0
        while first < len(lines) and lines[first].startswith("#"):
            first += 1
        lines = lines[first:]
        # `"".split("\n")` gives `[""]`, so an empty block shows up as a
        # single empty line rather than as an empty list.
        if not lines or not lines[0]:
            continue
        example = example_from_conllu_sentence(
            vocab,
            lines,
            ner_tag_pattern,
            merge_subtokens=merge_subtokens,
            append_morphology=append_morphology,
            ner_map=ner_map,
        )
        yield example
|
||||
|
||||
|
||||
def get_entities(lines, tag_pattern, ner_map=None):
    """Read entity tags from the MISC column and convert them to BILUO.

    For each token line the first "|"-separated MISC part that matches
    `tag_pattern` decides the tag; groups 2 and 3 of the match are taken as
    the tag prefix and the entity type. Tokens without a usable match get "O".
    Lines whose ID contains "-" or "." are left out.

    lines (list): CoNLL-U lines for one sentence.
    tag_pattern (str): Regex pattern for the entity tag.
    ner_map (dict): Map old NER tag names to new ones, '' maps to O.
    RETURNS (list): The result of `iob_to_biluo` on the collected tags.
    """
    # First pass: gather the MISC column of every regular token line.
    token_miscs = []
    for line in lines:
        columns = line.split("\t")
        id_, word, lemma, pos, tag, morph, head, dep, _1, misc = columns
        if "-" in id_ or "." in id_:
            continue
        token_miscs.append(misc)

    # Second pass: turn each MISC value into one tag.
    collected = []
    for misc in token_miscs:
        token_tag = "O"
        for misc_part in misc.split("|"):
            found = re.match(tag_pattern, misc_part)
            if not found:
                continue
            prefix, suffix = found.group(2), found.group(3)
            if prefix and suffix:
                if ner_map:
                    suffix = ner_map.get(suffix, suffix)
                token_tag = "O" if suffix == "" else prefix + "-" + suffix
            # Only the first matching part is considered.
            break
        collected.append(token_tag)
    return iob_to_biluo(collected)
|
||||
|
||||
|
||||
def simplify_tags(iob):
    """Reduce entity types to the Wikipedia scheme (PER, LOC, ORG, MISC).

    Tags that start with a `PREFIX-TYPE` pair are rewritten: 'PER', 'LOC'
    and 'ORG' stay as they are, 'GPE_LOC' becomes 'LOC', 'GPE_ORG' becomes
    'ORG' and every other type becomes 'MISC'. Tags without such a pair
    (for example "O") are passed through untouched.

    iob (list): The tags to simplify.
    RETURNS (list): A new list with the simplified tags.
    """
    renamed = {"GPE_LOC": "LOC", "GPE_ORG": "ORG"}
    kept = ("PER", "LOC", "ORG")
    simplified = []
    for tag in iob:
        found = re.match("([A-Z_]+)-([A-Z_]+)", tag)
        if found is not None:
            prefix, suffix = found.groups()
            if suffix in renamed:
                suffix = renamed[suffix]
            elif suffix not in kept:
                suffix = "MISC"
            tag = prefix + "-" + suffix
        simplified.append(tag)
    return simplified
|
||||
|
||||
|
||||
def generate_sentence(sent, has_ner_tags):
|
||||
(id_, word, tag, head, dep, iob) = sent
|
||||
def generate_sentence(token_annotation, has_ner_tags, tag_pattern, ner_map=None):
|
||||
sentence = {}
|
||||
tokens = []
|
||||
if has_ner_tags:
|
||||
iob = simplify_tags(iob)
|
||||
biluo = iob_to_biluo(iob)
|
||||
for i, id in enumerate(id_):
|
||||
for i, id_ in enumerate(token_annotation.ids):
|
||||
token = {}
|
||||
token["id"] = id
|
||||
token["orth"] = word[i]
|
||||
token["tag"] = tag[i]
|
||||
token["head"] = head[i] - id
|
||||
token["dep"] = dep[i]
|
||||
token["id"] = id_
|
||||
token["orth"] = token_annotation.get_word(i)
|
||||
token["tag"] = token_annotation.get_tag(i)
|
||||
token["pos"] = token_annotation.get_pos(i)
|
||||
token["lemma"] = token_annotation.get_lemma(i)
|
||||
token["morph"] = token_annotation.get_morph(i)
|
||||
token["head"] = token_annotation.get_head(i) - id_
|
||||
token["dep"] = token_annotation.get_dep(i)
|
||||
if has_ner_tags:
|
||||
token["ner"] = biluo[i]
|
||||
token["ner"] = token_annotation.get_entity(i)
|
||||
tokens.append(token)
|
||||
sentence["tokens"] = tokens
|
||||
return sentence
|
||||
|
||||
|
||||
def create_doc(sentences, id):
|
||||
def create_json_doc(raw, sentences, id_):
|
||||
doc = {}
|
||||
paragraph = {}
|
||||
doc["id"] = id
|
||||
doc["id"] = id_
|
||||
doc["paragraphs"] = []
|
||||
paragraph["raw"] = raw.strip()
|
||||
paragraph["sentences"] = sentences
|
||||
doc["paragraphs"].append(paragraph)
|
||||
return doc
|
||||
|
||||
|
||||
def example_from_conllu_sentence(
    vocab,
    lines,
    ner_tag_pattern,
    merge_subtokens=False,
    append_morphology=False,
    ner_map=None,
):
    """Create an Example from the lines for one CoNLL-U sentence, merging
    subtokens and appending morphology to tags if required.

    vocab: The vocab used to create the intermediate Doc.
    lines (list): The non-comment lines for a CoNLL-U sentence
    ner_tag_pattern (str): The regex pattern for matching NER in MISC col
    merge_subtokens (bool): Merge the tokens of each "start-end" ID range
        into a single token.
    append_morphology (bool): Append "__" plus the morphology to each tag
        that has morphology.
    ner_map (dict): Passed on to `get_entities`.
    RETURNS (Example): An example containing the annotation
    """
    # create a Doc with each subtoken as its own token
    # if merging subtokens, each subtoken orth is the merged subtoken form
    for extension in ("merged_orth", "merged_lemma", "merged_morph", "merged_spaceafter"):
        if not Token.has_extension(extension):
            Token.set_extension(extension, default="")
    words, spaces, tags, poses, morphs, lemmas = [], [], [], [], [], []
    heads, deps = [], []
    subtok_word = ""
    in_subtok = False
    for line in lines:
        parts = line.split("\t")
        id_, word, lemma, pos, tag, morph, head, dep, _1, misc = parts
        if "." in id_:
            continue
        if "-" in id_:
            # A range line opens a multi-word token. It holds the surface form
            # and the trailing space for the token lines that follow it.
            in_subtok = True
            subtok_word = word
            _, subtok_end = id_.split("-")
            subtok_spaceafter = "SpaceAfter=No" not in misc
            continue
        if merge_subtokens and in_subtok:
            words.append(subtok_word)
        else:
            words.append(word)
        if in_subtok:
            # Only the last subtoken carries the space of the whole range.
            if id_ == subtok_end:
                spaces.append(subtok_spaceafter)
            else:
                spaces.append(False)
        elif "SpaceAfter=No" in misc:
            spaces.append(False)
        else:
            spaces.append(True)
        if in_subtok and id_ == subtok_end:
            subtok_word = ""
            in_subtok = False
        id_ = int(id_) - 1
        # A head of "0" or "_" makes the token its own head.
        head = (int(head) - 1) if head not in ("0", "_") else id_
        tag = pos if tag == "_" else tag
        morph = morph if morph != "_" else ""
        dep = "ROOT" if dep == "root" else dep
        lemmas.append(lemma)
        poses.append(pos)
        tags.append(tag)
        morphs.append(morph)
        heads.append(head)
        deps.append(dep)

    doc = Doc(vocab, words=words, spaces=spaces)
    for i, token in enumerate(doc):
        token.tag_ = tags[i]
        token.pos_ = poses[i]
        token.dep_ = deps[i]
        token.lemma_ = lemmas[i]
        token.head = doc[heads[i]]
        token._.merged_orth = words[i]
        token._.merged_morph = morphs[i]
        token._.merged_lemma = lemmas[i]
        token._.merged_spaceafter = spaces[i]
    ents = get_entities(lines, ner_tag_pattern, ner_map)
    doc.ents = spans_from_biluo_tags(doc, ents)
    doc.is_parsed = True
    doc.is_tagged = True

    if merge_subtokens:
        doc = merge_conllu_subtokens(lines, doc)

    # create Example from custom Doc annotation
    ids, words, tags, heads, deps = [], [], [], [], []
    pos, lemmas, morphs, spaces = [], [], [], []
    for i, t in enumerate(doc):
        ids.append(i)
        words.append(t._.merged_orth)
        if append_morphology and t._.merged_morph:
            tags.append(t.tag_ + "__" + t._.merged_morph)
        else:
            tags.append(t.tag_)
        pos.append(t.pos_)
        morphs.append(t._.merged_morph)
        lemmas.append(t._.merged_lemma)
        heads.append(t.head.i)
        deps.append(t.dep_)
        spaces.append(t._.merged_spaceafter)
    ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
    ents = biluo_tags_from_offsets(doc, ent_offsets)
    # Rebuild the raw text from the final words and their trailing spaces.
    raw = "".join(
        word + " " if space else word for word, space in zip(words, spaces)
    )
    example = Example(doc=raw)
    example.set_token_annotation(
        ids=ids,
        words=words,
        tags=tags,
        pos=pos,
        morphs=morphs,
        lemmas=lemmas,
        heads=heads,
        deps=deps,
        entities=ents,
    )
    return example
|
||||
|
||||
|
||||
def merge_conllu_subtokens(lines, doc):
    """Merge the tokens of every "start-end" ID range into a single token.

    Before merging, the combined tag, lemma, morphology and trailing-space
    values are written to every token of the range, so they are available
    on whichever token the retokenizer keeps.

    lines (list): The CoNLL-U lines of the sentence, including range lines.
    doc: The Doc built from the token lines of the same sentence.
    RETURNS: The same `doc` after retokenization.
    """
    # identify and process all subtoken spans to prepare attrs for merging
    spans_to_merge = []
    for line in lines:
        columns = line.split("\t")
        id_, word, lemma, pos, tag, morph, head, dep, _1, misc = columns
        if "-" not in id_:
            continue
        first, last = id_.split("-")
        span = doc[int(first) - 1 : int(last)]
        spans_to_merge.append(span)
        # create merged tag, morph, and lemma values
        span_tags = [token.tag_ for token in span]
        span_lemmas = [token.lemma_ for token in span]
        values_by_field = {}
        for token in span:
            if not token._.merged_morph:
                continue
            for feature in token._.merged_morph.split("|"):
                field, values = feature.split("=", 1)
                values_by_field.setdefault(field, set()).update(values.split(","))
        # create merged features for each morph field
        merged_features = [
            field + "=" + ",".join(sorted(values))
            for field, values in values_by_field.items()
        ]
        merged_tag = "_".join(span_tags)
        merged_lemma = " ".join(span_lemmas)
        merged_morph = "|".join(sorted(merged_features))
        # set the same attrs on all subtok tokens so that whatever head the
        # retokenizer chooses, the final attrs are available on that token
        for token in span:
            token._.merged_orth = token.orth_
            token._.merged_lemma = merged_lemma
            token.tag_ = merged_tag
            token._.merged_morph = merged_morph
            token._.merged_spaceafter = bool(span[-1].whitespace_)

    with doc.retokenize() as retokenizer:
        for span in spans_to_merge:
            retokenizer.merge(span)

    return doc
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from wasabi import Printer
|
||||
|
||||
from ...gold import iob_to_biluo
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import srsly
|
||||
|
||||
from ...gold import docs_to_json
|
||||
|
|
|
@ -1,9 +1,5 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
from pathlib import Path
|
||||
from collections import Counter
|
||||
import plac
|
||||
import sys
|
||||
import srsly
|
||||
from wasabi import Printer, MESSAGES
|
||||
|
@ -22,29 +18,18 @@ BLANK_MODEL_MIN_THRESHOLD = 100
|
|||
BLANK_MODEL_THRESHOLD = 2000
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
# fmt: off
|
||||
lang=("model language", "positional", None, str),
|
||||
train_path=("location of JSON-formatted training data", "positional", None, Path),
|
||||
dev_path=("location of JSON-formatted development data", "positional", None, Path),
|
||||
tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
|
||||
base_model=("name of model to update (optional)", "option", "b", str),
|
||||
pipeline=("Comma-separated names of pipeline components to train", "option", "p", str),
|
||||
ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool),
|
||||
verbose=("Print additional information and explanations", "flag", "V", bool),
|
||||
no_format=("Don't pretty-print the results", "flag", "NF", bool),
|
||||
# fmt: on
|
||||
)
|
||||
def debug_data(
|
||||
lang,
|
||||
train_path,
|
||||
dev_path,
|
||||
tag_map_path=None,
|
||||
base_model=None,
|
||||
pipeline="tagger,parser,ner",
|
||||
ignore_warnings=False,
|
||||
verbose=False,
|
||||
no_format=False,
|
||||
# fmt: off
|
||||
lang: ("Model language", "positional", None, str),
|
||||
train_path: ("Location of JSON-formatted training data", "positional", None, Path),
|
||||
dev_path: ("Location of JSON-formatted development data", "positional", None, Path),
|
||||
tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None,
|
||||
base_model: ("Name of model to update (optional)", "option", "b", str) = None,
|
||||
pipeline: ("Comma-separated names of pipeline components to train", "option", "p", str) = "tagger,parser,ner",
|
||||
ignore_warnings: ("Ignore warnings, only show stats and errors", "flag", "IW", bool) = False,
|
||||
verbose: ("Print additional information and explanations", "flag", "V", bool) = False,
|
||||
no_format: ("Don't pretty-print the results", "flag", "NF", bool) = False,
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Analyze, debug and validate your training and development data, get useful
|
||||
|
@ -85,20 +70,16 @@ def debug_data(
|
|||
with msg.loading("Loading corpus..."):
|
||||
corpus = GoldCorpus(train_path, dev_path)
|
||||
try:
|
||||
train_docs = list(corpus.train_docs(nlp))
|
||||
train_docs_unpreprocessed = list(
|
||||
corpus.train_docs_without_preprocessing(nlp)
|
||||
train_dataset = list(corpus.train_dataset(nlp))
|
||||
train_dataset_unpreprocessed = list(
|
||||
corpus.train_dataset_without_preprocessing(nlp)
|
||||
)
|
||||
except ValueError as e:
|
||||
loading_train_error_message = "Training data cannot be loaded: {}".format(
|
||||
str(e)
|
||||
)
|
||||
loading_train_error_message = f"Training data cannot be loaded: {e}"
|
||||
try:
|
||||
dev_docs = list(corpus.dev_docs(nlp))
|
||||
dev_dataset = list(corpus.dev_dataset(nlp))
|
||||
except ValueError as e:
|
||||
loading_dev_error_message = "Development data cannot be loaded: {}".format(
|
||||
str(e)
|
||||
)
|
||||
loading_dev_error_message = f"Development data cannot be loaded: {e}"
|
||||
if loading_train_error_message or loading_dev_error_message:
|
||||
if loading_train_error_message:
|
||||
msg.fail(loading_train_error_message)
|
||||
|
@ -107,82 +88,68 @@ def debug_data(
|
|||
sys.exit(1)
|
||||
msg.good("Corpus is loadable")
|
||||
|
||||
# Create all gold data here to avoid iterating over the train_docs constantly
|
||||
gold_train_data = _compile_gold(train_docs, pipeline, nlp)
|
||||
# Create all gold data here to avoid iterating over the train_dataset constantly
|
||||
gold_train_data = _compile_gold(train_dataset, pipeline, nlp)
|
||||
gold_train_unpreprocessed_data = _compile_gold(
|
||||
train_docs_unpreprocessed, pipeline, nlp
|
||||
train_dataset_unpreprocessed, pipeline
|
||||
)
|
||||
gold_dev_data = _compile_gold(dev_docs, pipeline, nlp)
|
||||
gold_dev_data = _compile_gold(dev_dataset, pipeline, nlp)
|
||||
|
||||
train_texts = gold_train_data["texts"]
|
||||
dev_texts = gold_dev_data["texts"]
|
||||
|
||||
msg.divider("Training stats")
|
||||
msg.text("Training pipeline: {}".format(", ".join(pipeline)))
|
||||
msg.text(f"Training pipeline: {', '.join(pipeline)}")
|
||||
for pipe in [p for p in pipeline if p not in nlp.factories]:
|
||||
msg.fail("Pipeline component '{}' not available in factories".format(pipe))
|
||||
msg.fail(f"Pipeline component '{pipe}' not available in factories")
|
||||
if base_model:
|
||||
msg.text("Starting with base model '{}'".format(base_model))
|
||||
msg.text(f"Starting with base model '{base_model}'")
|
||||
else:
|
||||
msg.text("Starting with blank model '{}'".format(lang))
|
||||
msg.text("{} training docs".format(len(train_docs)))
|
||||
msg.text("{} evaluation docs".format(len(dev_docs)))
|
||||
msg.text(f"Starting with blank model '{lang}'")
|
||||
msg.text(f"{len(train_dataset)} training docs")
|
||||
msg.text(f"{len(dev_dataset)} evaluation docs")
|
||||
|
||||
if not len(dev_docs):
|
||||
if not len(gold_dev_data):
|
||||
msg.fail("No evaluation docs")
|
||||
overlap = len(train_texts.intersection(dev_texts))
|
||||
if overlap:
|
||||
msg.warn("{} training examples also in evaluation data".format(overlap))
|
||||
msg.warn(f"{overlap} training examples also in evaluation data")
|
||||
else:
|
||||
msg.good("No overlap between training and evaluation data")
|
||||
if not base_model and len(train_docs) < BLANK_MODEL_THRESHOLD:
|
||||
text = "Low number of examples to train from a blank model ({})".format(
|
||||
len(train_docs)
|
||||
if not base_model and len(train_dataset) < BLANK_MODEL_THRESHOLD:
|
||||
text = (
|
||||
f"Low number of examples to train from a blank model ({len(train_dataset)})"
|
||||
)
|
||||
if len(train_docs) < BLANK_MODEL_MIN_THRESHOLD:
|
||||
if len(train_dataset) < BLANK_MODEL_MIN_THRESHOLD:
|
||||
msg.fail(text)
|
||||
else:
|
||||
msg.warn(text)
|
||||
msg.text(
|
||||
"It's recommended to use at least {} examples (minimum {})".format(
|
||||
BLANK_MODEL_THRESHOLD, BLANK_MODEL_MIN_THRESHOLD
|
||||
),
|
||||
f"It's recommended to use at least {BLANK_MODEL_THRESHOLD} examples "
|
||||
f"(minimum {BLANK_MODEL_MIN_THRESHOLD})",
|
||||
show=verbose,
|
||||
)
|
||||
|
||||
msg.divider("Vocab & Vectors")
|
||||
n_words = gold_train_data["n_words"]
|
||||
msg.info(
|
||||
"{} total {} in the data ({} unique)".format(
|
||||
n_words, "word" if n_words == 1 else "words", len(gold_train_data["words"])
|
||||
)
|
||||
f"{n_words} total word(s) in the data ({len(gold_train_data['words'])} unique)"
|
||||
)
|
||||
if gold_train_data["n_misaligned_words"] > 0:
|
||||
msg.warn(
|
||||
"{} misaligned tokens in the training data".format(
|
||||
gold_train_data["n_misaligned_words"]
|
||||
)
|
||||
)
|
||||
n_misaligned = gold_train_data["n_misaligned_words"]
|
||||
msg.warn(f"{n_misaligned} misaligned tokens in the training data")
|
||||
if gold_dev_data["n_misaligned_words"] > 0:
|
||||
msg.warn(
|
||||
"{} misaligned tokens in the dev data".format(
|
||||
gold_dev_data["n_misaligned_words"]
|
||||
)
|
||||
)
|
||||
n_misaligned = gold_dev_data["n_misaligned_words"]
|
||||
msg.warn(f"{n_misaligned} misaligned tokens in the dev data")
|
||||
most_common_words = gold_train_data["words"].most_common(10)
|
||||
msg.text(
|
||||
"10 most common words: {}".format(
|
||||
_format_labels(most_common_words, counts=True)
|
||||
),
|
||||
f"10 most common words: {_format_labels(most_common_words, counts=True)}",
|
||||
show=verbose,
|
||||
)
|
||||
if len(nlp.vocab.vectors):
|
||||
msg.info(
|
||||
"{} vectors ({} unique keys, {} dimensions)".format(
|
||||
len(nlp.vocab.vectors),
|
||||
nlp.vocab.vectors.n_keys,
|
||||
nlp.vocab.vectors_length,
|
||||
)
|
||||
f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} "
|
||||
f"unique keys, {nlp.vocab.vectors_length} dimensions)"
|
||||
)
|
||||
n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
|
||||
msg.warn(
|
||||
|
@ -205,7 +172,7 @@ def debug_data(
|
|||
if "ner" in pipeline:
|
||||
# Get all unique NER labels present in the data
|
||||
labels = set(
|
||||
label for label in gold_train_data["ner"] if label not in ("O", "-")
|
||||
label for label in gold_train_data["ner"] if label not in ("O", "-", None)
|
||||
)
|
||||
label_counts = gold_train_data["ner"]
|
||||
model_labels = _get_labels_from_model(nlp, "ner")
|
||||
|
@ -218,19 +185,10 @@ def debug_data(
|
|||
|
||||
msg.divider("Named Entity Recognition")
|
||||
msg.info(
|
||||
"{} new {}, {} existing {}".format(
|
||||
len(new_labels),
|
||||
"label" if len(new_labels) == 1 else "labels",
|
||||
len(existing_labels),
|
||||
"label" if len(existing_labels) == 1 else "labels",
|
||||
)
|
||||
f"{len(new_labels)} new label(s), {len(existing_labels)} existing label(s)"
|
||||
)
|
||||
missing_values = label_counts["-"]
|
||||
msg.text(
|
||||
"{} missing {} (tokens with '-' label)".format(
|
||||
missing_values, "value" if missing_values == 1 else "values"
|
||||
)
|
||||
)
|
||||
msg.text(f"{missing_values} missing value(s) (tokens with '-' label)")
|
||||
for label in new_labels:
|
||||
if len(label) == 0:
|
||||
msg.fail("Empty label found in new labels")
|
||||
|
@ -241,43 +199,28 @@ def debug_data(
|
|||
if label != "-"
|
||||
]
|
||||
labels_with_counts = _format_labels(labels_with_counts, counts=True)
|
||||
msg.text("New: {}".format(labels_with_counts), show=verbose)
|
||||
msg.text(f"New: {labels_with_counts}", show=verbose)
|
||||
if existing_labels:
|
||||
msg.text(
|
||||
"Existing: {}".format(_format_labels(existing_labels)), show=verbose
|
||||
)
|
||||
|
||||
msg.text(f"Existing: {_format_labels(existing_labels)}", show=verbose)
|
||||
if gold_train_data["ws_ents"]:
|
||||
msg.fail(
|
||||
"{} invalid whitespace entity span(s)".format(
|
||||
gold_train_data["ws_ents"]
|
||||
)
|
||||
)
|
||||
msg.fail(f"{gold_train_data['ws_ents']} invalid whitespace entity spans")
|
||||
has_ws_ents_error = True
|
||||
|
||||
if gold_train_data["punct_ents"]:
|
||||
msg.warn(
|
||||
"{} entity span(s) with punctuation".format(
|
||||
gold_train_data["punct_ents"]
|
||||
)
|
||||
)
|
||||
msg.warn(f"{gold_train_data['punct_ents']} entity span(s) with punctuation")
|
||||
has_punct_ents_warning = True
|
||||
|
||||
for label in new_labels:
|
||||
if label_counts[label] <= NEW_LABEL_THRESHOLD:
|
||||
msg.warn(
|
||||
"Low number of examples for new label '{}' ({})".format(
|
||||
label, label_counts[label]
|
||||
)
|
||||
f"Low number of examples for new label '{label}' ({label_counts[label]})"
|
||||
)
|
||||
has_low_data_warning = True
|
||||
|
||||
with msg.loading("Analyzing label distribution..."):
|
||||
neg_docs = _get_examples_without_label(train_docs, label)
|
||||
neg_docs = _get_examples_without_label(train_dataset, label)
|
||||
if neg_docs == 0:
|
||||
msg.warn(
|
||||
"No examples for texts WITHOUT new label '{}'".format(label)
|
||||
)
|
||||
msg.warn(f"No examples for texts WITHOUT new label '{label}'")
|
||||
has_no_neg_warning = True
|
||||
|
||||
if not has_low_data_warning:
|
||||
|
@ -291,8 +234,8 @@ def debug_data(
|
|||
|
||||
if has_low_data_warning:
|
||||
msg.text(
|
||||
"To train a new entity type, your data should include at "
|
||||
"least {} instances of the new label".format(NEW_LABEL_THRESHOLD),
|
||||
f"To train a new entity type, your data should include at "
|
||||
f"least {NEW_LABEL_THRESHOLD} instances of the new label",
|
||||
show=verbose,
|
||||
)
|
||||
if has_no_neg_warning:
|
||||
|
@ -321,27 +264,21 @@ def debug_data(
|
|||
new_labels = [l for l in labels if l not in model_labels]
|
||||
existing_labels = [l for l in labels if l in model_labels]
|
||||
msg.info(
|
||||
"Text Classification: {} new label(s), {} existing label(s)".format(
|
||||
len(new_labels), len(existing_labels)
|
||||
)
|
||||
f"Text Classification: {len(new_labels)} new label(s), "
|
||||
f"{len(existing_labels)} existing label(s)"
|
||||
)
|
||||
if new_labels:
|
||||
labels_with_counts = _format_labels(
|
||||
gold_train_data["cats"].most_common(), counts=True
|
||||
)
|
||||
msg.text("New: {}".format(labels_with_counts), show=verbose)
|
||||
msg.text(f"New: {labels_with_counts}", show=verbose)
|
||||
if existing_labels:
|
||||
msg.text(
|
||||
"Existing: {}".format(_format_labels(existing_labels)), show=verbose
|
||||
)
|
||||
msg.text(f"Existing: {_format_labels(existing_labels)}", show=verbose)
|
||||
if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
|
||||
msg.fail(
|
||||
"The train and dev labels are not the same. "
|
||||
"Train labels: {}. "
|
||||
"Dev labels: {}.".format(
|
||||
_format_labels(gold_train_data["cats"]),
|
||||
_format_labels(gold_dev_data["cats"]),
|
||||
)
|
||||
f"The train and dev labels are not the same. "
|
||||
f"Train labels: {_format_labels(gold_train_data['cats'])}. "
|
||||
f"Dev labels: {_format_labels(gold_dev_data['cats'])}."
|
||||
)
|
||||
if gold_train_data["n_cats_multilabel"] > 0:
|
||||
msg.info(
|
||||
|
@ -371,27 +308,16 @@ def debug_data(
|
|||
msg.divider("Part-of-speech Tagging")
|
||||
labels = [label for label in gold_train_data["tags"]]
|
||||
tag_map = nlp.vocab.morphology.tag_map
|
||||
msg.info(
|
||||
"{} {} in data ({} {} in tag map)".format(
|
||||
len(labels),
|
||||
"label" if len(labels) == 1 else "labels",
|
||||
len(tag_map),
|
||||
"label" if len(tag_map) == 1 else "labels",
|
||||
)
|
||||
)
|
||||
msg.info(f"{len(labels)} label(s) in data ({len(tag_map)} label(s) in tag map)")
|
||||
labels_with_counts = _format_labels(
|
||||
gold_train_data["tags"].most_common(), counts=True
|
||||
)
|
||||
msg.text(labels_with_counts, show=verbose)
|
||||
non_tagmap = [l for l in labels if l not in tag_map]
|
||||
if not non_tagmap:
|
||||
msg.good("All labels present in tag map for language '{}'".format(nlp.lang))
|
||||
msg.good(f"All labels present in tag map for language '{nlp.lang}'")
|
||||
for label in non_tagmap:
|
||||
msg.fail(
|
||||
"Label '{}' not found in tag map for language '{}'".format(
|
||||
label, nlp.lang
|
||||
)
|
||||
)
|
||||
msg.fail(f"Label '{label}' not found in tag map for language '{nlp.lang}'")
|
||||
|
||||
if "parser" in pipeline:
|
||||
has_low_data_warning = False
|
||||
|
@ -399,21 +325,18 @@ def debug_data(
|
|||
|
||||
# profile sentence length
|
||||
msg.info(
|
||||
"Found {} sentence{} with an average length of {:.1f} words.".format(
|
||||
gold_train_data["n_sents"],
|
||||
"s" if len(train_docs) > 1 else "",
|
||||
gold_train_data["n_words"] / gold_train_data["n_sents"],
|
||||
)
|
||||
f"Found {gold_train_data['n_sents']} sentence(s) with an average "
|
||||
f"length of {gold_train_data['n_words'] / gold_train_data['n_sents']:.1f} words."
|
||||
)
|
||||
|
||||
# check for documents with multiple sentences
|
||||
sents_per_doc = gold_train_data["n_sents"] / len(gold_train_data["texts"])
|
||||
if sents_per_doc < 1.1:
|
||||
msg.warn(
|
||||
"The training data contains {:.2f} sentences per "
|
||||
"document. When there are very few documents containing more "
|
||||
"than one sentence, the parser will not learn how to segment "
|
||||
"longer texts into sentences.".format(sents_per_doc)
|
||||
f"The training data contains {sents_per_doc:.2f} sentences per "
|
||||
f"document. When there are very few documents containing more "
|
||||
f"than one sentence, the parser will not learn how to segment "
|
||||
f"longer texts into sentences."
|
||||
)
|
||||
|
||||
# profile labels
|
||||
|
@ -424,32 +347,13 @@ def debug_data(
|
|||
labels_dev = [label for label in gold_dev_data["deps"]]
|
||||
|
||||
if gold_train_unpreprocessed_data["n_nonproj"] > 0:
|
||||
msg.info(
|
||||
"Found {} nonprojective train sentence{}".format(
|
||||
gold_train_unpreprocessed_data["n_nonproj"],
|
||||
"s" if gold_train_unpreprocessed_data["n_nonproj"] > 1 else "",
|
||||
)
|
||||
)
|
||||
n_nonproj = gold_train_unpreprocessed_data["n_nonproj"]
|
||||
msg.info(f"Found {n_nonproj} nonprojective train sentence(s)")
|
||||
if gold_dev_data["n_nonproj"] > 0:
|
||||
msg.info(
|
||||
"Found {} nonprojective dev sentence{}".format(
|
||||
gold_dev_data["n_nonproj"],
|
||||
"s" if gold_dev_data["n_nonproj"] > 1 else "",
|
||||
)
|
||||
)
|
||||
|
||||
msg.info(
|
||||
"{} {} in train data".format(
|
||||
len(labels_train_unpreprocessed),
|
||||
"label" if len(labels_train) == 1 else "labels",
|
||||
)
|
||||
)
|
||||
msg.info(
|
||||
"{} {} in projectivized train data".format(
|
||||
len(labels_train), "label" if len(labels_train) == 1 else "labels"
|
||||
)
|
||||
)
|
||||
|
||||
n_nonproj = gold_dev_data["n_nonproj"]
|
||||
msg.info(f"Found {n_nonproj} nonprojective dev sentence(s)")
|
||||
msg.info(f"{labels_train_unpreprocessed} label(s) in train data")
|
||||
msg.info(f"{len(labels_train)} label(s) in projectivized train data")
|
||||
labels_with_counts = _format_labels(
|
||||
gold_train_unpreprocessed_data["deps"].most_common(), counts=True
|
||||
)
|
||||
|
@ -459,9 +363,8 @@ def debug_data(
|
|||
for label in gold_train_unpreprocessed_data["deps"]:
|
||||
if gold_train_unpreprocessed_data["deps"][label] <= DEP_LABEL_THRESHOLD:
|
||||
msg.warn(
|
||||
"Low number of examples for label '{}' ({})".format(
|
||||
label, gold_train_unpreprocessed_data["deps"][label]
|
||||
)
|
||||
f"Low number of examples for label '{label}' "
|
||||
f"({gold_train_unpreprocessed_data['deps'][label]})"
|
||||
)
|
||||
has_low_data_warning = True
|
||||
|
||||
|
@ -470,22 +373,19 @@ def debug_data(
|
|||
for label in gold_train_data["deps"]:
|
||||
if gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD and "||" in label:
|
||||
rare_projectivized_labels.append(
|
||||
"{}: {}".format(label, str(gold_train_data["deps"][label]))
|
||||
f"{label}: {gold_train_data['deps'][label]}"
|
||||
)
|
||||
|
||||
if len(rare_projectivized_labels) > 0:
|
||||
msg.warn(
|
||||
"Low number of examples for {} label{} in the "
|
||||
"projectivized dependency trees used for training. You may "
|
||||
"want to projectivize labels such as punct before "
|
||||
"training in order to improve parser performance.".format(
|
||||
len(rare_projectivized_labels),
|
||||
"s" if len(rare_projectivized_labels) > 1 else "",
|
||||
)
|
||||
f"Low number of examples for {len(rare_projectivized_labels)} "
|
||||
"label(s) in the projectivized dependency trees used for "
|
||||
"training. You may want to projectivize labels such as punct "
|
||||
"before training in order to improve parser performance."
|
||||
)
|
||||
msg.warn(
|
||||
"Projectivized labels with low numbers of examples: "
|
||||
"{}".format("\n".join(rare_projectivized_labels)),
|
||||
f"Projectivized labels with low numbers of examples: ",
|
||||
", ".join(rare_projectivized_labels),
|
||||
show=verbose,
|
||||
)
|
||||
has_low_data_warning = True
|
||||
|
@ -493,50 +393,44 @@ def debug_data(
|
|||
# labels only in train
|
||||
if set(labels_train) - set(labels_dev):
|
||||
msg.warn(
|
||||
"The following labels were found only in the train data: "
|
||||
"{}".format(", ".join(set(labels_train) - set(labels_dev))),
|
||||
"The following labels were found only in the train data:",
|
||||
", ".join(set(labels_train) - set(labels_dev)),
|
||||
show=verbose,
|
||||
)
|
||||
|
||||
# labels only in dev
|
||||
if set(labels_dev) - set(labels_train):
|
||||
msg.warn(
|
||||
"The following labels were found only in the dev data: "
|
||||
+ ", ".join(set(labels_dev) - set(labels_train)),
|
||||
"The following labels were found only in the dev data:",
|
||||
", ".join(set(labels_dev) - set(labels_train)),
|
||||
show=verbose,
|
||||
)
|
||||
|
||||
if has_low_data_warning:
|
||||
msg.text(
|
||||
"To train a parser, your data should include at "
|
||||
"least {} instances of each label.".format(DEP_LABEL_THRESHOLD),
|
||||
f"To train a parser, your data should include at "
|
||||
f"least {DEP_LABEL_THRESHOLD} instances of each label.",
|
||||
show=verbose,
|
||||
)
|
||||
|
||||
# multiple root labels
|
||||
if len(gold_train_unpreprocessed_data["roots"]) > 1:
|
||||
msg.warn(
|
||||
"Multiple root labels ({}) ".format(
|
||||
", ".join(gold_train_unpreprocessed_data["roots"])
|
||||
)
|
||||
+ "found in training data. spaCy's parser uses a single root "
|
||||
"label ROOT so this distinction will not be available."
|
||||
f"Multiple root labels "
|
||||
f"({', '.join(gold_train_unpreprocessed_data['roots'])}) "
|
||||
f"found in training data. spaCy's parser uses a single root "
|
||||
f"label ROOT so this distinction will not be available."
|
||||
)
|
||||
|
||||
# these should not happen, but just in case
|
||||
if gold_train_data["n_nonproj"] > 0:
|
||||
msg.fail(
|
||||
"Found {} nonprojective projectivized train sentence{}".format(
|
||||
gold_train_data["n_nonproj"],
|
||||
"s" if gold_train_data["n_nonproj"] > 1 else "",
|
||||
)
|
||||
f"Found {gold_train_data['n_nonproj']} nonprojective "
|
||||
f"projectivized train sentence(s)"
|
||||
)
|
||||
if gold_train_data["n_cycles"] > 0:
|
||||
msg.fail(
|
||||
"Found {} projectivized train sentence{} with cycles".format(
|
||||
gold_train_data["n_cycles"],
|
||||
"s" if gold_train_data["n_cycles"] > 1 else "",
|
||||
)
|
||||
f"Found {gold_train_data['n_cycles']} projectivized train sentence(s) with cycles"
|
||||
)
|
||||
|
||||
msg.divider("Summary")
|
||||
|
@ -544,42 +438,34 @@ def debug_data(
|
|||
warn_counts = msg.counts[MESSAGES.WARN]
|
||||
fail_counts = msg.counts[MESSAGES.FAIL]
|
||||
if good_counts:
|
||||
msg.good(
|
||||
"{} {} passed".format(
|
||||
good_counts, "check" if good_counts == 1 else "checks"
|
||||
)
|
||||
)
|
||||
msg.good(f"{good_counts} {'check' if good_counts == 1 else 'checks'} passed")
|
||||
if warn_counts:
|
||||
msg.warn(
|
||||
"{} {}".format(warn_counts, "warning" if warn_counts == 1 else "warnings")
|
||||
)
|
||||
if fail_counts:
|
||||
msg.fail("{} {}".format(fail_counts, "error" if fail_counts == 1 else "errors"))
|
||||
|
||||
msg.warn(f"{warn_counts} {'warning' if warn_counts == 1 else 'warnings'}")
|
||||
if fail_counts:
|
||||
msg.fail(f"{fail_counts} {'error' if fail_counts == 1 else 'errors'}")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def _load_file(file_path, msg):
|
||||
file_name = file_path.parts[-1]
|
||||
if file_path.suffix == ".json":
|
||||
with msg.loading("Loading {}...".format(file_name)):
|
||||
with msg.loading(f"Loading {file_name}..."):
|
||||
data = srsly.read_json(file_path)
|
||||
msg.good("Loaded {}".format(file_name))
|
||||
msg.good(f"Loaded {file_name}")
|
||||
return data
|
||||
elif file_path.suffix == ".jsonl":
|
||||
with msg.loading("Loading {}...".format(file_name)):
|
||||
with msg.loading(f"Loading {file_name}..."):
|
||||
data = srsly.read_jsonl(file_path)
|
||||
msg.good("Loaded {}".format(file_name))
|
||||
msg.good(f"Loaded {file_name}")
|
||||
return data
|
||||
msg.fail(
|
||||
"Can't load file extension {}".format(file_path.suffix),
|
||||
f"Can't load file extension {file_path.suffix}",
|
||||
"Expected .json or .jsonl",
|
||||
exits=1,
|
||||
)
|
||||
|
||||
|
||||
def _compile_gold(train_docs, pipeline, nlp):
|
||||
def _compile_gold(examples, pipeline, nlp):
|
||||
data = {
|
||||
"ner": Counter(),
|
||||
"cats": Counter(),
|
||||
|
@ -598,7 +484,9 @@ def _compile_gold(train_docs, pipeline, nlp):
|
|||
"n_cats_multilabel": 0,
|
||||
"texts": set(),
|
||||
}
|
||||
for doc, gold in train_docs:
|
||||
for example in examples:
|
||||
gold = example.gold
|
||||
doc = example.doc
|
||||
valid_words = [x for x in gold.words if x is not None]
|
||||
data["words"].update(valid_words)
|
||||
data["n_words"] += len(valid_words)
|
||||
|
@ -651,17 +539,17 @@ def _compile_gold(train_docs, pipeline, nlp):
|
|||
|
||||
def _format_labels(labels, counts=False):
|
||||
if counts:
|
||||
return ", ".join(["'{}' ({})".format(l, c) for l, c in labels])
|
||||
return ", ".join(["'{}'".format(l) for l in labels])
|
||||
return ", ".join([f"'{l}' ({c})" for l, c in labels])
|
||||
return ", ".join([f"'{l}'" for l in labels])
|
||||
|
||||
|
||||
def _get_examples_without_label(data, label):
|
||||
count = 0
|
||||
for doc, gold in data:
|
||||
for ex in data:
|
||||
labels = [
|
||||
label.split("-")[1]
|
||||
for label in gold.ner
|
||||
if label is not None and label not in ("O", "-")
|
||||
for label in ex.gold.ner
|
||||
if label not in ("O", "-", None)
|
||||
]
|
||||
if label not in labels:
|
||||
count += 1
|
||||
|
|
|
@ -1,30 +1,24 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
import requests
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from wasabi import msg
|
||||
|
||||
from .link import link
|
||||
from ..util import get_package_path
|
||||
from .. import about
|
||||
from ..util import is_package, get_base_version
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
model=("Model to download (shortcut or name)", "positional", None, str),
|
||||
direct=("Force direct download of name + version", "flag", "d", bool),
|
||||
pip_args=("Additional arguments to be passed to `pip install` on model install"),
|
||||
)
|
||||
def download(model, direct=False, *pip_args):
|
||||
def download(
|
||||
model: ("Model to download (shortcut or name)", "positional", None, str),
|
||||
direct: ("Force direct download of name + version", "flag", "d", bool) = False,
|
||||
*pip_args: ("Additional arguments to be passed to `pip install` on model install"),
|
||||
):
|
||||
"""
|
||||
Download compatible model from default download path using pip. Model
|
||||
can be shortcut, model name or, if --direct flag is set, full model name
|
||||
with version. For direct downloads, the compatibility check will be skipped.
|
||||
Download compatible model from default download path using pip. If --direct
|
||||
flag is set, the command expects the full model name with version.
|
||||
For direct downloads, the compatibility check will be skipped.
|
||||
"""
|
||||
if not require_package("spacy") and "--no-deps" not in pip_args:
|
||||
if not is_package("spacy") and "--no-deps" not in pip_args:
|
||||
msg.warn(
|
||||
"Skipping model package dependencies and setting `--no-deps`. "
|
||||
"You don't seem to have the spaCy package itself installed "
|
||||
|
@ -50,77 +44,38 @@ def download(model, direct=False, *pip_args):
|
|||
sys.exit(dl)
|
||||
msg.good(
|
||||
"Download and installation successful",
|
||||
"You can now load the model via spacy.load('{}')".format(model_name),
|
||||
f"You can now load the model via spacy.load('{model_name}')",
|
||||
)
|
||||
# Only create symlink if the model is installed via a shortcut like 'en'.
|
||||
# There's no real advantage over an additional symlink for en_core_web_sm
|
||||
# and if anything, it's more error prone and causes more confusion.
|
||||
if model in shortcuts:
|
||||
try:
|
||||
# Get package path here because link uses
|
||||
# pip.get_installed_distributions() to check if model is a
|
||||
# package, which fails if model was just installed via
|
||||
# subprocess
|
||||
package_path = get_package_path(model_name)
|
||||
link(model_name, model, force=True, model_path=package_path)
|
||||
except: # noqa: E722
|
||||
# Dirty, but since spacy.download and the auto-linking is
|
||||
# mostly a convenience wrapper, it's best to show a success
|
||||
# message and loading instructions, even if linking fails.
|
||||
msg.warn(
|
||||
"Download successful but linking failed",
|
||||
"Creating a shortcut link for '{}' didn't work (maybe you "
|
||||
"don't have admin permissions?), but you can still load "
|
||||
"the model via its full package name: "
|
||||
"nlp = spacy.load('{}')".format(model, model_name),
|
||||
)
|
||||
# If a model is downloaded and then loaded within the same process, our
|
||||
# is_package check currently fails, because pkg_resources.working_set
|
||||
# is not refreshed automatically (see #3923). We're trying to work
|
||||
# around this here be requiring the package explicitly.
|
||||
require_package(model_name)
|
||||
|
||||
|
||||
def require_package(name):
|
||||
try:
|
||||
import pkg_resources
|
||||
|
||||
pkg_resources.working_set.require(name)
|
||||
return True
|
||||
except: # noqa: E722
|
||||
return False
|
||||
|
||||
|
||||
def get_json(url, desc):
|
||||
r = requests.get(url)
|
||||
if r.status_code != 200:
|
||||
msg.fail(
|
||||
"Server error ({})".format(r.status_code),
|
||||
"Couldn't fetch {}. Please find a model for your spaCy "
|
||||
"installation (v{}), and download it manually. For more "
|
||||
"details, see the documentation: "
|
||||
"https://spacy.io/usage/models".format(desc, about.__version__),
|
||||
f"Server error ({r.status_code})",
|
||||
f"Couldn't fetch {desc}. Please find a model for your spaCy "
|
||||
f"installation (v{about.__version__}), and download it manually. "
|
||||
f"For more details, see the documentation: "
|
||||
f"https://spacy.io/usage/models",
|
||||
exits=1,
|
||||
)
|
||||
return r.json()
|
||||
|
||||
|
||||
def get_compatibility():
|
||||
version = about.__version__
|
||||
version = version.rsplit(".dev", 1)[0]
|
||||
version = get_base_version(about.__version__)
|
||||
comp_table = get_json(about.__compatibility__, "compatibility table")
|
||||
comp = comp_table["spacy"]
|
||||
if version not in comp:
|
||||
msg.fail("No compatible models found for v{} of spaCy".format(version), exits=1)
|
||||
msg.fail(f"No compatible models found for v{version} of spaCy", exits=1)
|
||||
return comp[version]
|
||||
|
||||
|
||||
def get_version(model, comp):
|
||||
model = model.rsplit(".dev", 1)[0]
|
||||
model = get_base_version(model)
|
||||
if model not in comp:
|
||||
msg.fail(
|
||||
"No compatible model found for '{}' "
|
||||
"(spaCy v{}).".format(model, about.__version__),
|
||||
f"No compatible model found for '{model}' (spaCy v{about.__version__})",
|
||||
exits=1,
|
||||
)
|
||||
return comp[model][0]
|
||||
|
|
|
@ -1,7 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals, division, print_function
|
||||
|
||||
import plac
|
||||
from timeit import default_timer as timer
|
||||
from wasabi import msg
|
||||
|
||||
|
@ -10,23 +6,16 @@ from .. import util
|
|||
from .. import displacy
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
model=("Model name or path", "positional", None, str),
|
||||
data_path=("Location of JSON-formatted evaluation data", "positional", None, str),
|
||||
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
||||
gpu_id=("Use GPU", "option", "g", int),
|
||||
displacy_path=("Directory to output rendered parses as HTML", "option", "dp", str),
|
||||
displacy_limit=("Limit of parses to render as HTML", "option", "dl", int),
|
||||
return_scores=("Return dict containing model scores", "flag", "R", bool),
|
||||
)
|
||||
def evaluate(
|
||||
model,
|
||||
data_path,
|
||||
gpu_id=-1,
|
||||
gold_preproc=False,
|
||||
displacy_path=None,
|
||||
displacy_limit=25,
|
||||
return_scores=False,
|
||||
# fmt: off
|
||||
model: ("Model name or path", "positional", None, str),
|
||||
data_path: ("Location of JSON-formatted evaluation data", "positional", None, str),
|
||||
gpu_id: ("Use GPU", "option", "g", int) = -1,
|
||||
gold_preproc: ("Use gold preprocessing", "flag", "G", bool) = False,
|
||||
displacy_path: ("Directory to output rendered parses as HTML", "option", "dp", str) = None,
|
||||
displacy_limit: ("Limit of parses to render as HTML", "option", "dl", int) = 25,
|
||||
return_scores: ("Return dict containing model scores", "flag", "R", bool) = False,
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Evaluate a model. To render a sample of parses in a HTML file, set an
|
||||
|
@ -47,28 +36,34 @@ def evaluate(
|
|||
nlp = util.get_lang_class(model.replace("blank:", ""))()
|
||||
else:
|
||||
nlp = util.load_model(model)
|
||||
dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
|
||||
dev_dataset = list(corpus.dev_dataset(nlp, gold_preproc=gold_preproc))
|
||||
begin = timer()
|
||||
scorer = nlp.evaluate(dev_docs, verbose=False)
|
||||
scorer = nlp.evaluate(dev_dataset, verbose=False)
|
||||
end = timer()
|
||||
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
|
||||
nwords = sum(len(ex.doc) for ex in dev_dataset)
|
||||
results = {
|
||||
"Time": "%.2f s" % (end - begin),
|
||||
"Time": f"{end - begin:.2f} s",
|
||||
"Words": nwords,
|
||||
"Words/s": "%.0f" % (nwords / (end - begin)),
|
||||
"TOK": "%.2f" % scorer.token_acc,
|
||||
"POS": "%.2f" % scorer.tags_acc,
|
||||
"UAS": "%.2f" % scorer.uas,
|
||||
"LAS": "%.2f" % scorer.las,
|
||||
"NER P": "%.2f" % scorer.ents_p,
|
||||
"NER R": "%.2f" % scorer.ents_r,
|
||||
"NER F": "%.2f" % scorer.ents_f,
|
||||
"Textcat": "%.2f" % scorer.textcat_score,
|
||||
"Words/s": f"{nwords / (end - begin):.0f}",
|
||||
"TOK": f"{scorer.token_acc:.2f}",
|
||||
"TAG": f"{scorer.tags_acc:.2f}",
|
||||
"POS": f"{scorer.pos_acc:.2f}",
|
||||
"MORPH": f"{scorer.morphs_acc:.2f}",
|
||||
"UAS": f"{scorer.uas:.2f}",
|
||||
"LAS": f"{scorer.las:.2f}",
|
||||
"NER P": f"{scorer.ents_p:.2f}",
|
||||
"NER R": f"{scorer.ents_r:.2f}",
|
||||
"NER F": f"{scorer.ents_f:.2f}",
|
||||
"Textcat AUC": f"{scorer.textcat_auc:.2f}",
|
||||
"Textcat F": f"{scorer.textcat_f:.2f}",
|
||||
"Sent P": f"{scorer.sent_p:.2f}",
|
||||
"Sent R": f"{scorer.sent_r:.2f}",
|
||||
"Sent F": f"{scorer.sent_f:.2f}",
|
||||
}
|
||||
msg.table(results, title="Results")
|
||||
|
||||
if displacy_path:
|
||||
docs, golds = zip(*dev_docs)
|
||||
docs = [ex.doc for ex in dev_dataset]
|
||||
render_deps = "parser" in nlp.meta.get("pipeline", [])
|
||||
render_ents = "ner" in nlp.meta.get("pipeline", [])
|
||||
render_parses(
|
||||
|
@ -79,7 +74,7 @@ def evaluate(
|
|||
deps=render_deps,
|
||||
ents=render_ents,
|
||||
)
|
||||
msg.good("Generated {} parses as HTML".format(displacy_limit), displacy_path)
|
||||
msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
|
||||
if return_scores:
|
||||
return scorer.scores
|
||||
|
||||
|
|
|
@ -1,44 +1,39 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
import platform
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
import srsly
|
||||
|
||||
from ..compat import path2str, basestring_, unicode_
|
||||
from .validate import get_model_pkgs
|
||||
from .. import util
|
||||
from .. import about
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
model=("Optional shortcut link of model", "positional", None, str),
|
||||
markdown=("Generate Markdown for GitHub issues", "flag", "md", str),
|
||||
silent=("Don't print anything (just return)", "flag", "s"),
|
||||
)
|
||||
def info(model=None, markdown=False, silent=False):
|
||||
def info(
|
||||
model: ("Optional model name", "positional", None, str) = None,
|
||||
markdown: ("Generate Markdown for GitHub issues", "flag", "md", str) = False,
|
||||
silent: ("Don't print anything (just return)", "flag", "s") = False,
|
||||
):
|
||||
"""
|
||||
Print info about spaCy installation. If a model shortcut link is
|
||||
speficied as an argument, print model information. Flag --markdown
|
||||
prints details in Markdown for easy copy-pasting to GitHub issues.
|
||||
Print info about spaCy installation. If a model is speficied as an argument,
|
||||
print model information. Flag --markdown prints details in Markdown for easy
|
||||
copy-pasting to GitHub issues.
|
||||
"""
|
||||
if model:
|
||||
if util.is_package(model):
|
||||
model_path = util.get_package_path(model)
|
||||
else:
|
||||
model_path = util.get_data_path() / model
|
||||
model_path = model
|
||||
meta_path = model_path / "meta.json"
|
||||
if not meta_path.is_file():
|
||||
msg.fail("Can't find model meta.json", meta_path, exits=1)
|
||||
meta = srsly.read_json(meta_path)
|
||||
if model_path.resolve() != model_path:
|
||||
meta["link"] = path2str(model_path)
|
||||
meta["source"] = path2str(model_path.resolve())
|
||||
meta["link"] = str(model_path)
|
||||
meta["source"] = str(model_path.resolve())
|
||||
else:
|
||||
meta["source"] = path2str(model_path)
|
||||
meta["source"] = str(model_path)
|
||||
if not silent:
|
||||
title = "Info about model '{}'".format(model)
|
||||
title = f"Info about model '{model}'"
|
||||
model_meta = {
|
||||
k: v for k, v in meta.items() if k not in ("accuracy", "speed")
|
||||
}
|
||||
|
@ -47,12 +42,15 @@ def info(model=None, markdown=False, silent=False):
|
|||
else:
|
||||
msg.table(model_meta, title=title)
|
||||
return meta
|
||||
all_models, _ = get_model_pkgs()
|
||||
data = {
|
||||
"spaCy version": about.__version__,
|
||||
"Location": path2str(Path(__file__).parent.parent),
|
||||
"Location": str(Path(__file__).parent.parent),
|
||||
"Platform": platform.platform(),
|
||||
"Python version": platform.python_version(),
|
||||
"Models": list_models(),
|
||||
"Models": ", ".join(
|
||||
f"{m['name']} ({m['version']})" for m in all_models.values()
|
||||
),
|
||||
}
|
||||
if not silent:
|
||||
title = "Info about spaCy"
|
||||
|
@ -63,30 +61,17 @@ def info(model=None, markdown=False, silent=False):
|
|||
return data
|
||||
|
||||
|
||||
def list_models():
    """List the model directories found in the spaCy data path.

    Sub-directories named like common cache folders, and hidden ones (name
    starting with a dot), are left out.

    RETURNS (str): Comma-separated directory names, or "-" when
        `util.get_data_path()` returns a falsy value.
    """
    data_path = util.get_data_path()
    if not data_path:
        return "-"
    # Common cache directory names that should never be reported as models.
    cache_names = ("cache", "pycache", "__pycache__")
    names = []
    for entry in data_path.iterdir():
        if not entry.is_dir():
            continue
        dir_name = entry.parts[-1]
        if dir_name in cache_names or dir_name.startswith("."):
            continue
        names.append(dir_name)
    return ", ".join(names)
|
||||
|
||||
|
||||
def print_markdown(data, title=None):
    """Print data in GitHub-flavoured Markdown format for issues etc.

    Every label/value pair is rendered as one bullet point. Pairs whose value
    is a string naming an existing filesystem path are skipped (presumably so
    that local paths don't end up in pasted issue reports).

    data (dict or list of tuples): Label/value pairs.
    title (str / None): Title, will be rendered as headline 2.
    """
    # The documented contract allows a list of (label, value) tuples as well
    # as a dict, so only call .items() when the object actually provides it.
    pairs = data.items() if hasattr(data, "items") else data
    markdown = [
        f"* **{key}:** {value}"
        for key, value in pairs
        if not (isinstance(value, str) and Path(value).exists())
    ]
    if title:
        print(f"\n## {title}")
    print("\n{}\n".format("\n".join(markdown)))
|
||||
|
|
|
@ -1,7 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
import math
|
||||
from tqdm import tqdm
|
||||
import numpy
|
||||
|
@ -20,7 +16,6 @@ from ..errors import Errors, Warnings
|
|||
from ..util import ensure_path, get_lang_class, load_model, OOV_RANK
|
||||
from ..lookups import Lookups
|
||||
|
||||
|
||||
try:
|
||||
import ftfy
|
||||
except ImportError:
|
||||
|
@ -30,43 +25,21 @@ except ImportError:
|
|||
DEFAULT_OOV_PROB = -20
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
lang=("Model language", "positional", None, str),
|
||||
output_dir=("Model output directory", "positional", None, Path),
|
||||
freqs_loc=("Location of words frequencies file", "option", "f", Path),
|
||||
jsonl_loc=("Location of JSONL-formatted attributes file", "option", "j", Path),
|
||||
clusters_loc=("Optional location of brown clusters data", "option", "c", str),
|
||||
vectors_loc=("Optional vectors file in Word2Vec format", "option", "v", str),
|
||||
truncate_vectors=(
|
||||
"Optional number of vectors to truncate to when reading in vectors file",
|
||||
"option",
|
||||
"t",
|
||||
int,
|
||||
),
|
||||
prune_vectors=("Optional number of vectors to prune to", "option", "V", int),
|
||||
vectors_name=(
|
||||
"Optional name for the word vectors, e.g. en_core_web_lg.vectors",
|
||||
"option",
|
||||
"vn",
|
||||
str,
|
||||
),
|
||||
model_name=("Optional name for the model meta", "option", "mn", str),
|
||||
omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool),
|
||||
base_model=("Base model (for languages with custom tokenizers)", "option", "b", str),
|
||||
)
|
||||
def init_model(
|
||||
lang,
|
||||
output_dir,
|
||||
freqs_loc=None,
|
||||
clusters_loc=None,
|
||||
jsonl_loc=None,
|
||||
vectors_loc=None,
|
||||
truncate_vectors=0,
|
||||
prune_vectors=-1,
|
||||
vectors_name=None,
|
||||
model_name=None,
|
||||
omit_extra_lookups=False,
|
||||
base_model=None,
|
||||
# fmt: off
|
||||
lang: ("Model language", "positional", None, str),
|
||||
output_dir: ("Model output directory", "positional", None, Path),
|
||||
freqs_loc: ("Location of words frequencies file", "option", "f", Path) = None,
|
||||
clusters_loc: ("Optional location of brown clusters data", "option", "c", str) = None,
|
||||
jsonl_loc: ("Location of JSONL-formatted attributes file", "option", "j", Path) = None,
|
||||
vectors_loc: ("Optional vectors file in Word2Vec format", "option", "v", str) = None,
|
||||
prune_vectors: ("Optional number of vectors to prune to", "option", "V", int) = -1,
|
||||
truncate_vectors: ("Optional number of vectors to truncate to when reading in vectors file", "option", "t", int) = 0,
|
||||
vectors_name: ("Optional name for the word vectors, e.g. en_core_web_lg.vectors", "option", "vn", str) = None,
|
||||
model_name: ("Optional name for the model meta", "option", "mn", str) = None,
|
||||
omit_extra_lookups: ("Don't include extra lookups in model", "flag", "OEL", bool) = False,
|
||||
base_model: ("Base model (for languages with custom tokenizers)", "option", "b", str) = None
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Create a new model from raw data, like word frequencies, Brown clusters
|
||||
|
@ -114,8 +87,7 @@ def init_model(
|
|||
vec_added = len(nlp.vocab.vectors)
|
||||
lex_added = len(nlp.vocab)
|
||||
msg.good(
|
||||
"Sucessfully compiled vocab",
|
||||
"{} entries, {} vectors".format(lex_added, vec_added),
|
||||
"Sucessfully compiled vocab", f"{lex_added} entries, {vec_added} vectors",
|
||||
)
|
||||
if not output_dir.exists():
|
||||
output_dir.mkdir()
|
||||
|
@ -203,9 +175,9 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
|
|||
nlp.vocab.vectors.add(lex.orth, row=lex.rank)
|
||||
else:
|
||||
if vectors_loc:
|
||||
with msg.loading("Reading vectors from {}".format(vectors_loc)):
|
||||
vectors_data, vector_keys = read_vectors(vectors_loc, truncate_vectors)
|
||||
msg.good("Loaded vectors from {}".format(vectors_loc))
|
||||
with msg.loading(f"Reading vectors from {vectors_loc}"):
|
||||
vectors_data, vector_keys = read_vectors(vectors_loc)
|
||||
msg.good(f"Loaded vectors from {vectors_loc}")
|
||||
else:
|
||||
vectors_data, vector_keys = (None, None)
|
||||
if vector_keys is not None:
|
||||
|
@ -215,7 +187,7 @@ def add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, name=None):
|
|||
if vectors_data is not None:
|
||||
nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
|
||||
if name is None:
|
||||
nlp.vocab.vectors.name = "%s_model.vectors" % nlp.meta["lang"]
|
||||
nlp.vocab.vectors.name = f"{nlp.meta['lang']}_model.vectors"
|
||||
else:
|
||||
nlp.vocab.vectors.name = name
|
||||
nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
|
||||
|
@ -265,7 +237,7 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
|
|||
word = literal_eval(key)
|
||||
except SyntaxError:
|
||||
# Take odd strings literally.
|
||||
word = literal_eval("'%s'" % key)
|
||||
word = literal_eval(f"'{key}'")
|
||||
smooth_count = counts.smoother(int(freq))
|
||||
probs[word] = math.log(smooth_count) - log_total
|
||||
oov_prob = math.log(counts.smoother(0)) - log_total
|
||||
|
|
|
@ -1,77 +0,0 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
|
||||
from ..compat import symlink_to, path2str
|
||||
from .. import util
|
||||
|
||||
|
||||
@plac.annotations(
    origin=("package name or local path to model", "positional", None, str),
    link_name=("name of shortuct link to create", "positional", None, str),
    force=("force overwriting of existing link", "flag", "f", bool),
)
def link(origin, link_name, force=False, model_path=None):
    """
    Create a symlink for models within the spacy/data directory. Accepts
    either the name of a pip package, or the local path to the model data
    directory. Linking models allows loading them via spacy.load(link_name).

    origin (unicode): Package name or local path to the model data.
    link_name (unicode): Name of the link to create in the data directory.
    force (bool): Replace an existing symlink of the same name.
    model_path (unicode / Path / None): Explicit model location. Only used
        when `origin` is not an installed package; defaults to `origin`.
    """
    if util.is_package(origin):
        model_path = util.get_package_path(origin)
    else:
        model_path = Path(origin) if model_path is None else Path(model_path)
    if not model_path.exists():
        msg.fail(
            "Can't locate model data",
            "The data should be located in {}".format(path2str(model_path)),
            exits=1,
        )
    data_path = util.get_data_path()
    if not data_path or not data_path.exists():
        spacy_loc = Path(__file__).parent.parent
        # The message previously ended in "here:" with no placeholder, so the
        # location passed to .format() was silently dropped.
        msg.fail(
            "Can't find the spaCy data path to create model symlink",
            "Make sure a directory `/data` exists within your spaCy "
            "installation and try again. The data directory should be located "
            "here: {path}".format(path=path2str(spacy_loc)),
            exits=1,
        )
    link_path = data_path / link_name
    if link_path.is_symlink() and not force:
        msg.fail(
            "Link '{}' already exists".format(link_name),
            "To overwrite an existing link, use the --force flag",
            exits=1,
        )
    elif link_path.is_symlink():  # does a symlink exist?
        # NB: It's important to check for is_symlink here and not for exists,
        # because invalid/outdated symlinks would return False otherwise.
        link_path.unlink()
    elif link_path.exists():  # does it exist otherwise?
        # NB: Check this last because valid symlinks also "exist".
        msg.fail(
            "Can't overwrite symlink '{}'".format(link_name),
            "This can happen if your data directory contains a directory or "
            "file of the same name.",
            exits=1,
        )
    details = "%s --> %s" % (path2str(model_path), path2str(link_path))
    try:
        symlink_to(link_path, model_path)
    except:  # noqa: E722
        # This is quite dirty, but just making sure other errors are caught.
        # The error is reported to the user and then re-raised unchanged.
        msg.fail(
            "Couldn't link model to '{}'".format(link_name),
            "Creating a symlink in spacy/data failed. Make sure you have the "
            "required permissions and try re-running the command as admin, or "
            "use a virtualenv. You can still import the model as a module and "
            "call its load() method, or create the symlink manually.",
        )
        msg.text(details)
        raise
    msg.good("Linking successful", details)
    msg.text("You can now load the model via spacy.load('{}')".format(link_name))
|
|
@ -1,25 +1,21 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import plac
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from wasabi import msg, get_raw_input
|
||||
import srsly
|
||||
|
||||
from ..compat import path2str
|
||||
from .. import util
|
||||
from .. import about
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
input_dir=("Directory with model data", "positional", None, str),
|
||||
output_dir=("Output parent directory", "positional", None, str),
|
||||
meta_path=("Path to meta.json", "option", "m", str),
|
||||
create_meta=("Create meta.json, even if one exists", "flag", "c", bool),
|
||||
force=("Force overwriting existing model in output directory", "flag", "f", bool),
|
||||
)
|
||||
def package(input_dir, output_dir, meta_path=None, create_meta=False, force=False):
|
||||
def package(
|
||||
# fmt: off
|
||||
input_dir: ("Directory with model data", "positional", None, str),
|
||||
output_dir: ("Output parent directory", "positional", None, str),
|
||||
meta_path: ("Path to meta.json", "option", "m", str) = None,
|
||||
create_meta: ("Create meta.json, even if one exists", "flag", "c", bool) = False,
|
||||
force: ("Force overwriting existing model in output directory", "flag", "f", bool) = False,
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Generate Python package for model data, including meta and required
|
||||
installation files. A new directory will be created in the specified
|
||||
|
@ -47,7 +43,7 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
|
|||
for key in ("lang", "name", "version"):
|
||||
if key not in meta or meta[key] == "":
|
||||
msg.fail(
|
||||
"No '{}' setting found in meta.json".format(key),
|
||||
f"No '{key}' setting found in meta.json",
|
||||
"This setting is required to build your package.",
|
||||
exits=1,
|
||||
)
|
||||
|
@ -58,22 +54,21 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
|
|||
|
||||
if package_path.exists():
|
||||
if force:
|
||||
shutil.rmtree(path2str(package_path))
|
||||
shutil.rmtree(str(package_path))
|
||||
else:
|
||||
msg.fail(
|
||||
"Package directory already exists",
|
||||
"Please delete the directory and try again, or use the "
|
||||
"`--force` flag to overwrite existing "
|
||||
"directories.".format(path=path2str(package_path)),
|
||||
"`--force` flag to overwrite existing directories.",
|
||||
exits=1,
|
||||
)
|
||||
Path.mkdir(package_path, parents=True)
|
||||
shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
|
||||
shutil.copytree(str(input_path), str(package_path / model_name_v))
|
||||
create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
|
||||
create_file(main_path / "setup.py", TEMPLATE_SETUP)
|
||||
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
|
||||
create_file(package_path / "__init__.py", TEMPLATE_INIT)
|
||||
msg.good("Successfully created package '{}'".format(model_name_v), main_path)
|
||||
msg.good(f"Successfully created package '{model_name_v}'", main_path)
|
||||
msg.text("To build the package, run `python setup.py sdist` in this directory.")
|
||||
|
||||
|
||||
|
@ -88,14 +83,14 @@ def generate_meta(model_path, existing_meta, msg):
|
|||
("lang", "Model language", meta.get("lang", "en")),
|
||||
("name", "Model name", meta.get("name", "model")),
|
||||
("version", "Model version", meta.get("version", "0.0.0")),
|
||||
("spacy_version", "Required spaCy version", ">=%s,<3.0.0" % about.__version__),
|
||||
("description", "Model description", meta.get("description", False)),
|
||||
("author", "Author", meta.get("author", False)),
|
||||
("email", "Author email", meta.get("email", False)),
|
||||
("url", "Author website", meta.get("url", False)),
|
||||
("license", "License", meta.get("license", "CC BY-SA 3.0")),
|
||||
("license", "License", meta.get("license", "MIT")),
|
||||
]
|
||||
nlp = util.load_model_from_path(Path(model_path))
|
||||
meta["spacy_version"] = util.get_model_version_range(about.__version__)
|
||||
meta["pipeline"] = nlp.pipe_names
|
||||
meta["vectors"] = {
|
||||
"width": nlp.vocab.vectors_length,
|
||||
|
@ -118,9 +113,6 @@ def generate_meta(model_path, existing_meta, msg):
|
|||
|
||||
TEMPLATE_SETUP = """
|
||||
#!/usr/bin/env python
|
||||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import io
|
||||
import json
|
||||
from os import path, walk
|
||||
|
@ -176,6 +168,7 @@ def setup_package():
|
|||
package_data={model_name: list_files(model_dir)},
|
||||
install_requires=list_requirements(meta),
|
||||
zip_safe=False,
|
||||
entry_points={'spacy_models': ['{m} = {m}'.format(m=model_name)]}
|
||||
)
|
||||
|
||||
|
||||
|
@ -190,9 +183,6 @@ include meta.json
|
|||
|
||||
|
||||
TEMPLATE_INIT = """
|
||||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from pathlib import Path
|
||||
from spacy.util import load_model_from_init_py, get_model_meta
|
||||
|
||||
|
|
|
@ -1,107 +1,41 @@
|
|||
# coding: utf8
|
||||
from __future__ import print_function, unicode_literals
|
||||
|
||||
import plac
|
||||
import random
|
||||
import numpy
|
||||
import time
|
||||
import re
|
||||
from collections import Counter
|
||||
import plac
|
||||
from pathlib import Path
|
||||
from thinc.v2v import Affine, Maxout
|
||||
from thinc.misc import LayerNorm as LN
|
||||
from thinc.neural.util import prefer_gpu
|
||||
from thinc.api import Linear, Maxout, chain, list2array, use_pytorch_for_gpu_memory
|
||||
from wasabi import msg
|
||||
import srsly
|
||||
|
||||
from ..errors import Errors
|
||||
from ..ml.models.multi_task import build_masked_language_model
|
||||
from ..tokens import Doc
|
||||
from ..attrs import ID, HEAD
|
||||
from .._ml import Tok2Vec, flatten, chain, create_default_optimizer
|
||||
from .._ml import masked_language_model, get_cossim_loss
|
||||
from .. import util
|
||||
from .train import _load_pretrained_tok2vec
|
||||
from ..gold import Example
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
texts_loc=(
|
||||
"Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the "
|
||||
"key 'tokens'",
|
||||
"positional",
|
||||
None,
|
||||
str,
|
||||
),
|
||||
vectors_model=("Name or path to spaCy model with vectors to learn from"),
|
||||
output_dir=("Directory to write models to on each epoch", "positional", None, str),
|
||||
width=("Width of CNN layers", "option", "cw", int),
|
||||
conv_depth=("Depth of CNN layers", "option", "cd", int),
|
||||
cnn_window=("Window size for CNN layers", "option", "cW", int),
|
||||
cnn_pieces=("Maxout size for CNN layers. 1 for Mish", "option", "cP", int),
|
||||
use_chars=("Whether to use character-based embedding", "flag", "chr", bool),
|
||||
sa_depth=("Depth of self-attention layers", "option", "sa", int),
|
||||
bilstm_depth=("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int),
|
||||
embed_rows=("Number of embedding rows", "option", "er", int),
|
||||
loss_func=(
|
||||
"Loss function to use for the objective. Either 'L2' or 'cosine'",
|
||||
"option",
|
||||
"L",
|
||||
str,
|
||||
),
|
||||
use_vectors=("Whether to use the static vectors as input features", "flag", "uv"),
|
||||
dropout=("Dropout rate", "option", "d", float),
|
||||
batch_size=("Number of words per training batch", "option", "bs", int),
|
||||
max_length=(
|
||||
"Max words per example. Longer examples are discarded",
|
||||
"option",
|
||||
"xw",
|
||||
int,
|
||||
),
|
||||
min_length=(
|
||||
"Min words per example. Shorter examples are discarded",
|
||||
"option",
|
||||
"nw",
|
||||
int,
|
||||
),
|
||||
seed=("Seed for random number generators", "option", "s", int),
|
||||
n_iter=("Number of iterations to pretrain", "option", "i", int),
|
||||
n_save_every=("Save model every X batches.", "option", "se", int),
|
||||
init_tok2vec=(
|
||||
"Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.",
|
||||
"option",
|
||||
"t2v",
|
||||
Path,
|
||||
),
|
||||
epoch_start=(
|
||||
"The epoch to start counting at. Only relevant when using '--init-tok2vec' and the given weight file has been "
|
||||
"renamed. Prevents unintended overwriting of existing weight files.",
|
||||
"option",
|
||||
"es",
|
||||
int,
|
||||
),
|
||||
# fmt: off
|
||||
texts_loc=("Path to JSONL file with raw texts to learn from, with text provided as the key 'text' or tokens as the key 'tokens'", "positional", None, str),
|
||||
vectors_model=("Name or path to spaCy model with vectors to learn from", "positional", None, str),
|
||||
output_dir=("Directory to write models to on each epoch", "positional", None, Path),
|
||||
config_path=("Path to config file", "positional", None, Path),
|
||||
use_gpu=("Use GPU", "option", "g", int),
|
||||
resume_path=("Path to pretrained weights from which to resume pretraining", "option", "r", Path),
|
||||
epoch_resume=("The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files.", "option", "er", int),
|
||||
# fmt: on
|
||||
)
|
||||
def pretrain(
|
||||
texts_loc,
|
||||
vectors_model,
|
||||
config_path,
|
||||
output_dir,
|
||||
width=96,
|
||||
conv_depth=4,
|
||||
bilstm_depth=0,
|
||||
cnn_pieces=3,
|
||||
sa_depth=0,
|
||||
use_chars=False,
|
||||
cnn_window=1,
|
||||
embed_rows=2000,
|
||||
loss_func="cosine",
|
||||
use_vectors=False,
|
||||
dropout=0.2,
|
||||
n_iter=1000,
|
||||
batch_size=3000,
|
||||
max_length=500,
|
||||
min_length=5,
|
||||
seed=0,
|
||||
n_save_every=None,
|
||||
init_tok2vec=None,
|
||||
epoch_start=None,
|
||||
use_gpu=-1,
|
||||
resume_path=None,
|
||||
epoch_resume=None,
|
||||
):
|
||||
"""
|
||||
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
|
||||
|
@ -115,34 +49,46 @@ def pretrain(
|
|||
However, it's still quite experimental, so your mileage may vary.
|
||||
|
||||
To load the weights back in during 'spacy train', you need to ensure
|
||||
all settings are the same between pretraining and training. The API and
|
||||
errors around this need some improvement.
|
||||
all settings are the same between pretraining and training. Ideally,
|
||||
this is done by using the same config file for both commands.
|
||||
"""
|
||||
config = dict(locals())
|
||||
for key in config:
|
||||
if isinstance(config[key], Path):
|
||||
config[key] = str(config[key])
|
||||
util.fix_random_seed(seed)
|
||||
if not config_path or not config_path.exists():
|
||||
msg.fail("Config file not found", config_path, exits=1)
|
||||
|
||||
has_gpu = prefer_gpu()
|
||||
if has_gpu:
|
||||
import torch
|
||||
if use_gpu >= 0:
|
||||
msg.info("Using GPU")
|
||||
util.use_gpu(use_gpu)
|
||||
else:
|
||||
msg.info("Using CPU")
|
||||
|
||||
torch.set_default_tensor_type("torch.cuda.FloatTensor")
|
||||
msg.info("Using GPU" if has_gpu else "Not using GPU")
|
||||
msg.info(f"Loading config from: {config_path}")
|
||||
config = util.load_config(config_path, create_objects=False)
|
||||
util.fix_random_seed(config["pretraining"]["seed"])
|
||||
if config["pretraining"]["use_pytorch_for_gpu_memory"]:
|
||||
use_pytorch_for_gpu_memory()
|
||||
|
||||
output_dir = Path(output_dir)
|
||||
if output_dir.exists() and [p for p in output_dir.iterdir()]:
|
||||
msg.warn(
|
||||
"Output directory is not empty",
|
||||
"It is better to use an empty directory or refer to a new output path, "
|
||||
"then the new directory will be created for you.",
|
||||
)
|
||||
if resume_path:
|
||||
msg.warn(
|
||||
"Output directory is not empty. ",
|
||||
"If you're resuming a run from a previous model in this directory, "
|
||||
"the old models for the consecutive epochs will be overwritten "
|
||||
"with the new ones.",
|
||||
)
|
||||
else:
|
||||
msg.warn(
|
||||
"Output directory is not empty. ",
|
||||
"It is better to use an empty directory or refer to a new output path, "
|
||||
"then the new directory will be created for you.",
|
||||
)
|
||||
if not output_dir.exists():
|
||||
output_dir.mkdir()
|
||||
msg.good("Created output directory: {}".format(output_dir))
|
||||
msg.good(f"Created output directory: {output_dir}")
|
||||
srsly.write_json(output_dir / "config.json", config)
|
||||
msg.good("Saved settings to config.json")
|
||||
msg.good("Saved config file in the output directory")
|
||||
|
||||
config = util.load_config(config_path, create_objects=True)
|
||||
pretrain_config = config["pretraining"]
|
||||
|
||||
# Load texts from file or stdin
|
||||
if texts_loc != "-": # reading from a file
|
||||
|
@ -156,64 +102,58 @@ def pretrain(
|
|||
msg.good("Loaded input texts")
|
||||
random.shuffle(texts)
|
||||
else: # reading from stdin
|
||||
msg.text("Reading input text from stdin...")
|
||||
msg.info("Reading input text from stdin...")
|
||||
texts = srsly.read_jsonl("-")
|
||||
|
||||
with msg.loading("Loading model '{}'...".format(vectors_model)):
|
||||
with msg.loading(f"Loading model '{vectors_model}'..."):
|
||||
nlp = util.load_model(vectors_model)
|
||||
msg.good("Loaded model '{}'".format(vectors_model))
|
||||
pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
|
||||
model = create_pretraining_model(
|
||||
nlp,
|
||||
Tok2Vec(
|
||||
width,
|
||||
embed_rows,
|
||||
conv_depth=conv_depth,
|
||||
pretrained_vectors=pretrained_vectors,
|
||||
bilstm_depth=bilstm_depth, # Requires PyTorch. Experimental.
|
||||
subword_features=not use_chars, # Set to False for Chinese etc
|
||||
cnn_maxout_pieces=cnn_pieces, # If set to 1, use Mish activation.
|
||||
),
|
||||
)
|
||||
# Load in pretrained weights
|
||||
if init_tok2vec is not None:
|
||||
components = _load_pretrained_tok2vec(nlp, init_tok2vec)
|
||||
msg.text("Loaded pretrained tok2vec for: {}".format(components))
|
||||
msg.good(f"Loaded model '{vectors_model}'")
|
||||
tok2vec_path = pretrain_config["tok2vec_model"]
|
||||
tok2vec = config
|
||||
for subpath in tok2vec_path.split("."):
|
||||
tok2vec = tok2vec.get(subpath)
|
||||
model = create_pretraining_model(nlp, tok2vec)
|
||||
optimizer = pretrain_config["optimizer"]
|
||||
|
||||
# Load in pretrained weights to resume from
|
||||
if resume_path is not None:
|
||||
msg.info(f"Resume training tok2vec from: {resume_path}")
|
||||
with resume_path.open("rb") as file_:
|
||||
weights_data = file_.read()
|
||||
model.get_ref("tok2vec").from_bytes(weights_data)
|
||||
# Parse the epoch number from the given weight file
|
||||
model_name = re.search(r"model\d+\.bin", str(init_tok2vec))
|
||||
model_name = re.search(r"model\d+\.bin", str(resume_path))
|
||||
if model_name:
|
||||
# Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
|
||||
epoch_start = int(model_name.group(0)[5:][:-4]) + 1
|
||||
epoch_resume = int(model_name.group(0)[5:][:-4]) + 1
|
||||
msg.info(f"Resuming from epoch: {epoch_resume}")
|
||||
else:
|
||||
if not epoch_start:
|
||||
if not epoch_resume:
|
||||
msg.fail(
|
||||
"You have to use the '--epoch-start' argument when using a renamed weight file for "
|
||||
"'--init-tok2vec'",
|
||||
"You have to use the --epoch-resume setting when using a renamed weight file for --resume-path",
|
||||
exits=True,
|
||||
)
|
||||
elif epoch_start < 0:
|
||||
elif epoch_resume < 0:
|
||||
msg.fail(
|
||||
"The argument '--epoch-start' has to be greater or equal to 0. '%d' is invalid"
|
||||
% epoch_start,
|
||||
f"The argument --epoch-resume has to be greater or equal to 0. {epoch_resume} is invalid",
|
||||
exits=True,
|
||||
)
|
||||
else:
|
||||
msg.info(f"Resuming from epoch: {epoch_resume}")
|
||||
else:
|
||||
# Without '--init-tok2vec' the '--epoch-start' argument is ignored
|
||||
epoch_start = 0
|
||||
# Without '--resume-path' the '--epoch-resume' argument is ignored
|
||||
epoch_resume = 0
|
||||
|
||||
optimizer = create_default_optimizer(model.ops)
|
||||
tracker = ProgressTracker(frequency=10000)
|
||||
msg.divider("Pre-training tok2vec layer - starting at epoch %d" % epoch_start)
|
||||
msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
|
||||
row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
|
||||
msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
|
||||
|
||||
def _save_model(epoch, is_temp=False):
|
||||
is_temp_str = ".temp" if is_temp else ""
|
||||
with model.use_params(optimizer.averages):
|
||||
with (output_dir / ("model%d%s.bin" % (epoch, is_temp_str))).open(
|
||||
"wb"
|
||||
) as file_:
|
||||
file_.write(model.tok2vec.to_bytes())
|
||||
with (output_dir / f"model{epoch}{is_temp_str}.bin").open("wb") as file_:
|
||||
file_.write(model.get_ref("tok2vec").to_bytes())
|
||||
log = {
|
||||
"nr_word": tracker.nr_word,
|
||||
"loss": tracker.loss,
|
||||
|
@ -224,26 +164,27 @@ def pretrain(
|
|||
file_.write(srsly.json_dumps(log) + "\n")
|
||||
|
||||
skip_counter = 0
|
||||
for epoch in range(epoch_start, n_iter + epoch_start):
|
||||
for batch_id, batch in enumerate(
|
||||
util.minibatch_by_words(((text, None) for text in texts), size=batch_size)
|
||||
):
|
||||
loss_func = pretrain_config["loss_func"]
|
||||
for epoch in range(epoch_resume, pretrain_config["max_epochs"]):
|
||||
examples = [Example(doc=text) for text in texts]
|
||||
batches = util.minibatch_by_words(examples, size=pretrain_config["batch_size"])
|
||||
for batch_id, batch in enumerate(batches):
|
||||
docs, count = make_docs(
|
||||
nlp,
|
||||
[text for (text, _) in batch],
|
||||
max_length=max_length,
|
||||
min_length=min_length,
|
||||
[ex.doc for ex in batch],
|
||||
max_length=pretrain_config["max_length"],
|
||||
min_length=pretrain_config["min_length"],
|
||||
)
|
||||
skip_counter += count
|
||||
loss = make_update(
|
||||
model, docs, optimizer, objective=loss_func, drop=dropout
|
||||
)
|
||||
loss = make_update(model, docs, optimizer, distance=loss_func)
|
||||
progress = tracker.update(epoch, loss, docs)
|
||||
if progress:
|
||||
msg.row(progress, **row_settings)
|
||||
if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
|
||||
break
|
||||
if n_save_every and (batch_id % n_save_every == 0):
|
||||
if pretrain_config["n_save_every"] and (
|
||||
batch_id % pretrain_config["n_save_every"] == 0
|
||||
):
|
||||
_save_model(epoch, is_temp=True)
|
||||
_save_model(epoch)
|
||||
tracker.epoch_loss = 0.0
|
||||
|
@ -251,21 +192,21 @@ def pretrain(
|
|||
# Reshuffle the texts if texts were loaded from a file
|
||||
random.shuffle(texts)
|
||||
if skip_counter > 0:
|
||||
msg.warn("Skipped {count} empty values".format(count=str(skip_counter)))
|
||||
msg.warn(f"Skipped {skip_counter} empty values")
|
||||
msg.good("Successfully finished pretrain")
|
||||
|
||||
|
||||
def make_update(model, docs, optimizer, drop=0.0, objective="L2"):
|
||||
def make_update(model, docs, optimizer, distance):
|
||||
"""Perform an update over a single batch of documents.
|
||||
|
||||
docs (iterable): A batch of `Doc` objects.
|
||||
drop (float): The dropout rate.
|
||||
optimizer (callable): An optimizer.
|
||||
RETURNS loss: A float for the loss.
|
||||
"""
|
||||
predictions, backprop = model.begin_update(docs, drop=drop)
|
||||
loss, gradients = get_vectors_loss(model.ops, docs, predictions, objective)
|
||||
backprop(gradients, sgd=optimizer)
|
||||
predictions, backprop = model.begin_update(docs)
|
||||
loss, gradients = get_vectors_loss(model.ops, docs, predictions, distance)
|
||||
backprop(gradients)
|
||||
model.finish_update(optimizer)
|
||||
# Don't want to return a cupy object here
|
||||
# The gradients are modified in-place by the BERT MLM,
|
||||
# so we get an accurate loss
|
||||
|
@ -297,12 +238,12 @@ def make_docs(nlp, batch, min_length, max_length):
|
|||
heads = numpy.asarray(heads, dtype="uint64")
|
||||
heads = heads.reshape((len(doc), 1))
|
||||
doc = doc.from_array([HEAD], heads)
|
||||
if len(doc) >= min_length and len(doc) < max_length:
|
||||
if min_length <= len(doc) < max_length:
|
||||
docs.append(doc)
|
||||
return docs, skip_count
|
||||
|
||||
|
||||
def get_vectors_loss(ops, docs, prediction, objective="L2"):
|
||||
def get_vectors_loss(ops, docs, prediction, distance):
|
||||
"""Compute a mean-squared error loss between the documents' vectors and
|
||||
the prediction.
|
||||
|
||||
|
@ -316,13 +257,7 @@ def get_vectors_loss(ops, docs, prediction, objective="L2"):
|
|||
# and look them up all at once. This prevents data copying.
|
||||
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
|
||||
target = docs[0].vocab.vectors.data[ids]
|
||||
if objective == "L2":
|
||||
d_target = prediction - target
|
||||
loss = (d_target ** 2).sum()
|
||||
elif objective == "cosine":
|
||||
loss, d_target = get_cossim_loss(prediction, target)
|
||||
else:
|
||||
raise ValueError(Errors.E142.format(loss_func=objective))
|
||||
d_target, loss = distance(prediction, target)
|
||||
return loss, d_target
|
||||
|
||||
|
||||
|
@ -331,22 +266,21 @@ def create_pretraining_model(nlp, tok2vec):
|
|||
the tok2vec input model. The tok2vec input model needs to be a model that
|
||||
takes a batch of Doc objects (as a list), and returns a list of arrays.
|
||||
Each array in the output needs to have one row per token in the doc.
|
||||
The actual tok2vec layer is stored as a reference, and only this bit will be
|
||||
serialized to file and read back in when calling the 'train' command.
|
||||
"""
|
||||
output_size = nlp.vocab.vectors.data.shape[1]
|
||||
output_layer = chain(
|
||||
LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0)
|
||||
Maxout(nO=300, nP=3, normalize=True, dropout=0.0), Linear(output_size)
|
||||
)
|
||||
# This is annoying, but the parser etc have the flatten step after
|
||||
# the tok2vec. To load the weights in cleanly, we need to match
|
||||
# the shape of the models' components exactly. So what we call
|
||||
# "tok2vec" has to be the same set of processes as what the components do.
|
||||
tok2vec = chain(tok2vec, flatten)
|
||||
model = chain(tok2vec, output_layer)
|
||||
model = masked_language_model(nlp.vocab, model)
|
||||
model.tok2vec = tok2vec
|
||||
model.output_layer = output_layer
|
||||
model.begin_training([nlp.make_doc("Give it a doc to infer shapes")])
|
||||
return model
|
||||
model = chain(tok2vec, list2array())
|
||||
model = chain(model, output_layer)
|
||||
model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
|
||||
mlm_model = build_masked_language_model(nlp.vocab, model)
|
||||
mlm_model.set_ref("tok2vec", tok2vec)
|
||||
mlm_model.set_ref("output_layer", output_layer)
|
||||
mlm_model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
|
||||
return mlm_model
|
||||
|
||||
|
||||
class ProgressTracker(object):
|
||||
|
|
|
@ -1,7 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals, division, print_function
|
||||
|
||||
import plac
|
||||
import tqdm
|
||||
from pathlib import Path
|
||||
import srsly
|
||||
|
@ -9,18 +5,19 @@ import cProfile
|
|||
import pstats
|
||||
import sys
|
||||
import itertools
|
||||
import thinc.extra.datasets
|
||||
import ml_datasets
|
||||
from wasabi import msg
|
||||
|
||||
from ..util import load_model
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
model=("Model to load", "positional", None, str),
|
||||
inputs=("Location of input file. '-' for stdin.", "positional", None, str),
|
||||
n_texts=("Maximum number of texts to use if available", "option", "n", int),
|
||||
)
|
||||
def profile(model, inputs=None, n_texts=10000):
|
||||
def profile(
|
||||
# fmt: off
|
||||
model: ("Model to load", "positional", None, str),
|
||||
inputs: ("Location of input file. '-' for stdin.", "positional", None, str) = None,
|
||||
n_texts: ("Maximum number of texts to use if available", "option", "n", int) = 10000,
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Profile a spaCy pipeline, to find out which functions take the most time.
|
||||
Input should be formatted as one JSON object per line with a key "text".
|
||||
|
@ -32,13 +29,13 @@ def profile(model, inputs=None, n_texts=10000):
|
|||
if inputs is None:
|
||||
n_inputs = 25000
|
||||
with msg.loading("Loading IMDB dataset via Thinc..."):
|
||||
imdb_train, _ = thinc.extra.datasets.imdb()
|
||||
imdb_train, _ = ml_datasets.imdb()
|
||||
inputs, _ = zip(*imdb_train)
|
||||
msg.info("Loaded IMDB dataset and using {} examples".format(n_inputs))
|
||||
msg.info(f"Loaded IMDB dataset and using {n_inputs} examples")
|
||||
inputs = inputs[:n_inputs]
|
||||
with msg.loading("Loading model '{}'...".format(model)):
|
||||
with msg.loading(f"Loading model '{model}'..."):
|
||||
nlp = load_model(model)
|
||||
msg.good("Loaded model '{}'".format(model))
|
||||
msg.good(f"Loaded model '{model}'")
|
||||
texts = list(itertools.islice(inputs, n_texts))
|
||||
cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
|
||||
s = pstats.Stats("Profile.prof")
|
||||
|
@ -60,7 +57,7 @@ def _read_inputs(loc, msg):
|
|||
input_path = Path(loc)
|
||||
if not input_path.exists() or not input_path.is_file():
|
||||
msg.fail("Not a valid input data file", loc, exits=1)
|
||||
msg.info("Using data from {}".format(input_path.parts[-1]))
|
||||
msg.info(f"Using data from {input_path.parts[-1]}")
|
||||
file_ = input_path.open()
|
||||
for line in file_:
|
||||
data = srsly.json_loads(line)
|
||||
|
|
|
@ -1,770 +0,0 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals, division, print_function
|
||||
|
||||
import plac
|
||||
import os
|
||||
import tqdm
|
||||
from pathlib import Path
|
||||
from thinc.neural._classes.model import Model
|
||||
from timeit import default_timer as timer
|
||||
import shutil
|
||||
import srsly
|
||||
from wasabi import msg
|
||||
import contextlib
|
||||
import random
|
||||
|
||||
from .._ml import create_default_optimizer
|
||||
from ..util import use_gpu as set_gpu
|
||||
from ..errors import Errors
|
||||
from ..gold import GoldCorpus
|
||||
from ..compat import path2str
|
||||
from ..lookups import Lookups
|
||||
from .. import util
|
||||
from .. import about
|
||||
|
||||
|
||||
@plac.annotations(
    # fmt: off
    lang=("Model language", "positional", None, str),
    output_path=("Output directory to store model in", "positional", None, Path),
    train_path=("Location of JSON-formatted training data", "positional", None, Path),
    dev_path=("Location of JSON-formatted development data", "positional", None, Path),
    raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path),
    base_model=("Name of model to update (optional)", "option", "b", str),
    pipeline=("Comma-separated names of pipeline components", "option", "p", str),
    replace_components=("Replace components from base model", "flag", "R", bool),
    vectors=("Model to load vectors from", "option", "v", str),
    width=("Width of CNN layers of Tok2Vec component", "option", "cw", int),
    conv_depth=("Depth of CNN layers of Tok2Vec component", "option", "cd", int),
    cnn_window=("Window size for CNN layers of Tok2Vec component", "option", "cW", int),
    cnn_pieces=("Maxout size for CNN layers of Tok2Vec component. 1 for Mish", "option", "cP", int),
    use_chars=("Whether to use character-based embedding of Tok2Vec component", "flag", "chr", bool),
    bilstm_depth=("Depth of BiLSTM layers of Tok2Vec component (requires PyTorch)", "option", "lstm", int),
    embed_rows=("Number of embedding rows of Tok2Vec component", "option", "er", int),
    n_iter=("Number of iterations", "option", "n", int),
    n_early_stopping=("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int),
    n_examples=("Number of examples", "option", "ns", int),
    use_gpu=("Use GPU", "option", "g", int),
    version=("Model version", "option", "V", str),
    meta_path=("Optional path to meta.json to use as base.", "option", "m", Path),
    init_tok2vec=("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path),
    parser_multitasks=("Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'", "option", "pt", str),
    entity_multitasks=("Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'", "option", "et", str),
    noise_level=("Amount of corruption for data augmentation", "option", "nl", float),
    orth_variant_level=("Amount of orthography variation for data augmentation", "option", "ovl", float),
    eval_beam_widths=("Beam widths to evaluate, e.g. 4,8", "option", "bw", str),
    gold_preproc=("Use gold preprocessing", "flag", "G", bool),
    learn_tokens=("Make parser learn gold-standard tokenization", "flag", "T", bool),
    textcat_multilabel=("Textcat classes aren't mutually exclusive (multilabel)", "flag", "TML", bool),
    textcat_arch=("Textcat model architecture", "option", "ta", str),
    textcat_positive_label=("Textcat positive label for binary classes with two labels", "option", "tpl", str),
    tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
    omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool),
    verbose=("Display more information for debug", "flag", "VV", bool),
    debug=("Run data diagnostics before training", "flag", "D", bool),
    # fmt: on
)
def train(
    lang,
    output_path,
    train_path,
    dev_path,
    raw_text=None,
    base_model=None,
    pipeline="tagger,parser,ner",
    replace_components=False,
    vectors=None,
    width=96,
    conv_depth=4,
    cnn_window=1,
    cnn_pieces=3,
    use_chars=False,
    bilstm_depth=0,
    embed_rows=2000,
    n_iter=30,
    n_early_stopping=None,
    n_examples=0,
    use_gpu=-1,
    version="0.0.0",
    meta_path=None,
    init_tok2vec=None,
    parser_multitasks="",
    entity_multitasks="",
    noise_level=0.0,
    orth_variant_level=0.0,
    eval_beam_widths="",
    gold_preproc=False,
    learn_tokens=False,
    textcat_multilabel=False,
    textcat_arch="bow",
    textcat_positive_label=None,
    tag_map_path=None,
    omit_extra_lookups=False,
    verbose=False,
    debug=False,
):
    """
    Train or update a spaCy model. Requires data to be formatted in spaCy's
    JSON format. To convert data from other formats, use the `spacy convert`
    command.
    """
    # NOTE: the docstring above is kept verbatim; the per-argument help text
    # lives in the plac annotations of the decorator.
    #
    # Overall flow:
    #   1. validate paths and prepare the output directory
    #   2. build the pipeline (from a base model or a blank language class)
    #   3. create the optimizer and optionally load pretrained tok2vec weights
    #   4. sanity-check the textcat configuration against the training data
    #   5. run `n_iter` epochs, saving and evaluating a model after each one
    #   6. always (even on failure) write "model-final" and collate "model-best"
    util.fix_random_seed()
    util.set_env_log(verbose)

    # Make sure all files and paths exists if they are needed
    train_path = util.ensure_path(train_path)
    dev_path = util.ensure_path(dev_path)
    meta_path = util.ensure_path(meta_path)
    output_path = util.ensure_path(output_path)
    if raw_text is not None:
        raw_text = list(srsly.read_jsonl(raw_text))
    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)
    if meta_path is not None and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path) if meta_path else {}
    if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
        msg.warn(
            "Output directory is not empty",
            "This can lead to unintended side effects when saving the model. "
            "Please use an empty directory or a different path instead. If "
            "the specified output path doesn't exist, the directory will be "
            "created for you.",
        )
    if not output_path.exists():
        # parents=True so that a nested, not-yet-existing output path is
        # really "created for you", as the warning above promises.
        output_path.mkdir(parents=True)
        msg.good("Created output directory: {}".format(output_path))

    tag_map = {}
    if tag_map_path is not None:
        tag_map = srsly.read_json(tag_map_path)
    # Take dropout and batch size as generators of values -- dropout
    # starts high and decays sharply, to force the optimizer to explore.
    # Batch size starts at 1 and grows, so that we make updates quickly
    # at the beginning of training.
    dropout_rates = util.decaying(
        util.env_opt("dropout_from", 0.2),
        util.env_opt("dropout_to", 0.2),
        util.env_opt("dropout_decay", 0.0),
    )
    batch_sizes = util.compounding(
        util.env_opt("batch_from", 100.0),
        util.env_opt("batch_to", 1000.0),
        util.env_opt("batch_compound", 1.001),
    )

    # Beam width 1 is always evaluated; extra widths are evaluated on top.
    if not eval_beam_widths:
        eval_beam_widths = [1]
    else:
        eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
        if 1 not in eval_beam_widths:
            eval_beam_widths.append(1)
        eval_beam_widths.sort()
    has_beam_widths = eval_beam_widths != [1]

    # Set up the base model and pipeline. If a base model is specified, load
    # the model and make sure the pipeline matches the pipeline setting. If
    # training starts from a blank model, initialize the language class.
    pipeline = [p.strip() for p in pipeline.split(",")]
    disabled_pipes = None
    pipes_added = False
    msg.text("Training pipeline: {}".format(pipeline))
    if use_gpu >= 0:
        activated_gpu = None
        try:
            activated_gpu = set_gpu(use_gpu)
        except Exception as e:
            msg.warn("Exception: {}".format(e))
        if activated_gpu is not None:
            msg.text("Using GPU: {}".format(use_gpu))
        else:
            # Fall back to CPU rather than failing the whole run.
            msg.warn("Unable to activate GPU: {}".format(use_gpu))
            msg.text("Using CPU only")
            use_gpu = -1
    base_components = []
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
        nlp = util.load_model(base_model)
        if nlp.lang != lang:
            msg.fail(
                "Model language ('{}') doesn't match language specified as "
                "`lang` argument ('{}') ".format(nlp.lang, lang),
                exits=1,
            )
        for pipe in pipeline:
            pipe_cfg = {}
            if pipe == "parser":
                pipe_cfg = {"learn_tokens": learn_tokens}
            elif pipe == "textcat":
                pipe_cfg = {
                    "exclusive_classes": not textcat_multilabel,
                    "architecture": textcat_arch,
                    "positive_label": textcat_positive_label,
                }
            if pipe not in nlp.pipe_names:
                msg.text("Adding component to base model '{}'".format(pipe))
                nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
                pipes_added = True
            elif replace_components:
                msg.text("Replacing component from base model '{}'".format(pipe))
                nlp.replace_pipe(pipe, nlp.create_pipe(pipe, config=pipe_cfg))
                pipes_added = True
            else:
                if pipe == "textcat":
                    textcat_cfg = nlp.get_pipe("textcat").cfg
                    base_cfg = {
                        "exclusive_classes": textcat_cfg["exclusive_classes"],
                        "architecture": textcat_cfg["architecture"],
                        "positive_label": textcat_cfg["positive_label"],
                    }
                    if base_cfg != pipe_cfg:
                        # Fixed: the first fragment used to end without a
                        # space, which rendered as "doesnot match".
                        msg.fail(
                            "The base textcat model configuration does "
                            "not match the provided training options. "
                            "Existing cfg: {}, provided cfg: {}".format(
                                base_cfg, pipe_cfg
                            ),
                            exits=1,
                        )
                msg.text("Extending component from base model '{}'".format(pipe))
                base_components.append(pipe)
        # Components of the base model that are not being trained are
        # disabled for the duration of training and restored at the end.
        disabled_pipes = nlp.disable_pipes(
            [p for p in nlp.pipe_names if p not in pipeline]
        )
    else:
        msg.text("Starting with blank model '{}'".format(lang))
        lang_cls = util.get_lang_class(lang)
        nlp = lang_cls()
        for pipe in pipeline:
            if pipe == "parser":
                pipe_cfg = {"learn_tokens": learn_tokens}
            elif pipe == "textcat":
                pipe_cfg = {
                    "exclusive_classes": not textcat_multilabel,
                    "architecture": textcat_arch,
                    "positive_label": textcat_positive_label,
                }
            else:
                pipe_cfg = {}
            nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))

    # Update tag map with provided mapping
    nlp.vocab.morphology.tag_map.update(tag_map)

    # Create empty extra lexeme tables so the data from spacy-lookups-data
    # isn't loaded if these features are accessed
    if omit_extra_lookups:
        nlp.vocab.lookups_extra = Lookups()
        nlp.vocab.lookups_extra.add_table("lexeme_cluster")
        nlp.vocab.lookups_extra.add_table("lexeme_prob")
        nlp.vocab.lookups_extra.add_table("lexeme_settings")

    if vectors:
        msg.text("Loading vector from model '{}'".format(vectors))
        _load_vectors(nlp, vectors)

    # Multitask objectives
    multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)]
    for pipe_name, multitasks in multitask_options:
        if multitasks:
            if pipe_name not in pipeline:
                # Fixed: exit here instead of falling through to
                # nlp.get_pipe(), which would raise for the missing component.
                msg.fail(
                    "Can't use multitask objective without '{}' in the "
                    "pipeline".format(pipe_name),
                    exits=1,
                )
            pipe = nlp.get_pipe(pipe_name)
            for objective in multitasks.split(","):
                pipe.add_multitask_objective(objective)

    # Prepare training corpus
    msg.text("Counting training words (limit={})".format(n_examples))
    corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
    n_train_words = corpus.count_train()

    if base_model and not pipes_added:
        # Start with an existing model, use default optimizer
        optimizer = create_default_optimizer(Model.ops)
    else:
        # Start with a blank model, call begin_training
        cfg = {"device": use_gpu}
        cfg["conv_depth"] = conv_depth
        cfg["token_vector_width"] = width
        cfg["bilstm_depth"] = bilstm_depth
        cfg["cnn_maxout_pieces"] = cnn_pieces
        cfg["embed_size"] = embed_rows
        cfg["conv_window"] = cnn_window
        cfg["subword_features"] = not use_chars
        optimizer = nlp.begin_training(lambda: corpus.train_tuples, **cfg)

    nlp._optimizer = None

    # Load in pretrained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec, base_components)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))

    # Verify textcat config
    if "textcat" in pipeline:
        textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", [])
        if textcat_positive_label and textcat_positive_label not in textcat_labels:
            msg.fail(
                "The textcat_positive_label (tpl) '{}' does not match any "
                "label in the training data.".format(textcat_positive_label),
                exits=1,
            )
        if textcat_positive_label and len(textcat_labels) != 2:
            msg.fail(
                "A textcat_positive_label (tpl) '{}' was provided for training "
                "data that does not appear to be a binary classification "
                "problem with two labels.".format(textcat_positive_label),
                exits=1,
            )
        train_docs = corpus.train_docs(
            nlp,
            noise_level=noise_level,
            gold_preproc=gold_preproc,
            max_length=0,
            ignore_misaligned=True,
        )
        train_labels = set()
        if textcat_multilabel:
            multilabel_found = False
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                if list(gold.cats.values()).count(1.0) != 1:
                    multilabel_found = True
            if not multilabel_found and not base_model:
                msg.warn(
                    "The textcat training instances look like they have "
                    "mutually-exclusive classes. Remove the flag "
                    "'--textcat-multilabel' to train a classifier with "
                    "mutually-exclusive classes."
                )
        if not textcat_multilabel:
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                if list(gold.cats.values()).count(1.0) != 1 and not base_model:
                    msg.warn(
                        "Some textcat training instances do not have exactly "
                        "one positive label. Modifying training options to "
                        "include the flag '--textcat-multilabel' for classes "
                        "that are not mutually exclusive."
                    )
                    # Switch to multilabel on the fly; this only happens
                    # without a base model, so the (now partial) train_labels
                    # set is not compared against base labels below.
                    nlp.get_pipe("textcat").cfg["exclusive_classes"] = False
                    textcat_multilabel = True
                    break
        if base_model and set(textcat_labels) != train_labels:
            msg.fail(
                "Cannot extend textcat model using data with different "
                "labels. Base model labels: {}, training data labels: "
                "{}.".format(textcat_labels, list(train_labels)),
                exits=1,
            )
        if textcat_multilabel:
            msg.text(
                "Textcat evaluation score: ROC AUC score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels))
            )
        elif textcat_positive_label and len(textcat_labels) == 2:
            msg.text(
                "Textcat evaluation score: F1-score for the "
                "label '{}'".format(textcat_positive_label)
            )
        elif len(textcat_labels) > 1:
            if len(textcat_labels) == 2:
                msg.warn(
                    "If the textcat component is a binary classifier with "
                    "exclusive classes, provide '--textcat-positive-label' for "
                    "an evaluation on the positive class."
                )
            msg.text(
                "Textcat evaluation score: F1-score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels))
            )
        else:
            msg.fail(
                "Unsupported textcat configuration. Use `spacy debug-data` "
                "for more information."
            )

    # fmt: off
    row_head, output_stats = _configure_training_output(pipeline, use_gpu, has_beam_widths)
    row_widths = [len(w) for w in row_head]
    row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2}
    # fmt: on
    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        iter_since_best = 0
        best_score = 0.0
        for i in range(n_iter):
            train_docs = corpus.train_docs(
                nlp,
                noise_level=noise_level,
                orth_variant_level=orth_variant_level,
                gold_preproc=gold_preproc,
                max_length=0,
                ignore_misaligned=True,
            )
            if raw_text:
                random.shuffle(raw_text)
                raw_batches = util.minibatch(
                    (nlp.make_doc(rt["text"]) for rt in raw_text), size=8
                )
            words_seen = 0
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    try:
                        nlp.update(
                            docs,
                            golds,
                            sgd=optimizer,
                            drop=next(dropout_rates),
                            losses=losses,
                        )
                    except ValueError as e:
                        err = "Error during training"
                        if init_tok2vec:
                            err += " Did you provide the same parameters during 'train' as during 'pretrain'?"
                        msg.fail(err, "Original error message: {}".format(e), exits=1)
                    if raw_text:
                        # If raw text is available, perform 'rehearsal' updates,
                        # which use unlabelled data to reduce overfitting.
                        # Fixed: when the raw text runs out before the training
                        # batches do, skip rehearsal instead of letting
                        # StopIteration abort the whole training run.
                        raw_batch = next(raw_batches, None)
                        if raw_batch is not None:
                            nlp.rehearse(list(raw_batch), sgd=optimizer, losses=losses)
                    if not int(os.environ.get("LOG_FRIENDLY", 0)):
                        pbar.update(sum(len(doc) for doc in docs))
                    words_seen += sum(len(doc) for doc in docs)
            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ("model%d" % i)
                nlp.to_disk(epoch_model_path)
                # Evaluate the model as it was written to disk, once per
                # requested beam width.
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                for beam_width in eval_beam_widths:
                    for name, component in nlp_loaded.pipeline:
                        if hasattr(component, "cfg"):
                            component.cfg["beam_width"] = beam_width
                    dev_docs = list(
                        corpus.dev_docs(
                            nlp_loaded,
                            gold_preproc=gold_preproc,
                            ignore_misaligned=True,
                        )
                    )
                    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                    start_time = timer()
                    scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                    end_time = timer()
                    if use_gpu < 0:
                        gpu_wps = None
                        cpu_wps = nwords / (end_time - start_time)
                    else:
                        gpu_wps = nwords / (end_time - start_time)
                        # Only evaluate on CPU in the first iteration (for
                        # timing) if GPU is enabled
                        if i == 0:
                            with Model.use_device("cpu"):
                                nlp_loaded = util.load_model_from_path(epoch_model_path)
                                for name, component in nlp_loaded.pipeline:
                                    if hasattr(component, "cfg"):
                                        component.cfg["beam_width"] = beam_width
                                dev_docs = list(
                                    corpus.dev_docs(
                                        nlp_loaded,
                                        gold_preproc=gold_preproc,
                                        ignore_misaligned=True,
                                    )
                                )
                                start_time = timer()
                                scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                                end_time = timer()
                                cpu_wps = nwords / (end_time - start_time)
                    acc_loc = output_path / ("model%d" % i) / "accuracy.json"
                    srsly.write_json(acc_loc, scorer.scores)

                    # Update model meta.json
                    meta["lang"] = nlp.lang
                    meta["pipeline"] = nlp.pipe_names
                    meta["spacy_version"] = ">=%s" % about.__version__
                    if beam_width == 1:
                        meta["speed"] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                        meta.setdefault("accuracy", {})
                        for component in nlp.pipe_names:
                            for metric in _get_metrics(component):
                                meta["accuracy"][metric] = scorer.scores[metric]
                    else:
                        meta.setdefault("beam_accuracy", {})
                        meta.setdefault("beam_speed", {})
                        for component in nlp.pipe_names:
                            for metric in _get_metrics(component):
                                meta["beam_accuracy"][metric] = scorer.scores[metric]
                        meta["beam_speed"][beam_width] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                    meta["vectors"] = {
                        "width": nlp.vocab.vectors_length,
                        "vectors": len(nlp.vocab.vectors),
                        "keys": nlp.vocab.vectors.n_keys,
                        "name": nlp.vocab.vectors.name,
                    }
                    meta.setdefault("name", "model%d" % i)
                    meta.setdefault("version", version)
                    meta["labels"] = nlp.meta["labels"]
                    meta_loc = output_path / ("model%d" % i) / "meta.json"
                    srsly.write_json(meta_loc, meta)
                    util.set_env_log(verbose)

                    progress = _get_progress(
                        i,
                        losses,
                        scorer.scores,
                        output_stats,
                        beam_width=beam_width if has_beam_widths else None,
                        cpu_wps=cpu_wps,
                        gpu_wps=gpu_wps,
                    )
                    if i == 0 and "textcat" in pipeline:
                        textcats_per_cat = scorer.scores.get("textcats_per_cat", {})
                        for cat, cat_score in textcats_per_cat.items():
                            if cat_score.get("roc_auc_score", 0) < 0:
                                msg.warn(
                                    "Textcat ROC AUC score is undefined due to "
                                    "only one value in label '{}'.".format(cat)
                                )
                    msg.row(progress, **row_settings)
                # Early stopping
                if n_early_stopping is not None:
                    current_score = _score_for_model(meta)
                    if current_score < best_score:
                        iter_since_best += 1
                    else:
                        iter_since_best = 0
                        best_score = current_score
                    if iter_since_best >= n_early_stopping:
                        msg.text(
                            "Early stopping, best iteration "
                            "is: {}".format(i - iter_since_best)
                        )
                        msg.text(
                            "Best score = {}; Final iteration "
                            "score = {}".format(best_score, current_score)
                        )
                        break
    except Exception as e:
        msg.warn(
            "Aborting and saving the final best model. "
            "Encountered exception: {}".format(e),
            exits=1,
        )
    finally:
        # Runs on success, on early stopping and on failure alike, so a
        # usable model is always left in the output directory.
        best_pipes = nlp.pipe_names
        if disabled_pipes:
            disabled_pipes.restore()
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / "model-final"
            nlp.to_disk(final_model_path)
            meta_loc = output_path / "model-final" / "meta.json"
            final_meta = srsly.read_json(meta_loc)
            final_meta.setdefault("accuracy", {})
            final_meta["accuracy"].update(meta.get("accuracy", {}))
            final_meta.setdefault("speed", {})
            final_meta["speed"].setdefault("cpu", None)
            final_meta["speed"].setdefault("gpu", None)
            meta.setdefault("speed", {})
            meta["speed"].setdefault("cpu", None)
            meta["speed"].setdefault("gpu", None)
            # combine cpu and gpu speeds with the base model speeds
            if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]:
                speed = _get_total_speed(
                    [final_meta["speed"]["cpu"], meta["speed"]["cpu"]]
                )
                final_meta["speed"]["cpu"] = speed
            if final_meta["speed"]["gpu"] and meta["speed"]["gpu"]:
                speed = _get_total_speed(
                    [final_meta["speed"]["gpu"], meta["speed"]["gpu"]]
                )
                final_meta["speed"]["gpu"] = speed
            # if there were no speeds to update, overwrite with meta
            if (
                final_meta["speed"]["cpu"] is None
                and final_meta["speed"]["gpu"] is None
            ):
                final_meta["speed"].update(meta["speed"])
            # note: beam speeds are not combined with the base model
            if has_beam_widths:
                final_meta.setdefault("beam_accuracy", {})
                final_meta["beam_accuracy"].update(meta.get("beam_accuracy", {}))
                final_meta.setdefault("beam_speed", {})
                final_meta["beam_speed"].update(meta.get("beam_speed", {}))
            srsly.write_json(meta_loc, final_meta)
        msg.good("Saved model to output directory", final_model_path)
        with msg.loading("Creating best model..."):
            best_model_path = _collate_best_model(final_meta, output_path, best_pipes)
        msg.good("Created best model", best_model_path)
|
||||
|
||||
|
||||
def _score_for_model(meta):
|
||||
""" Returns mean score between tasks in pipeline that can be used for early stopping. """
|
||||
mean_acc = list()
|
||||
pipes = meta["pipeline"]
|
||||
acc = meta["accuracy"]
|
||||
if "tagger" in pipes:
|
||||
mean_acc.append(acc["tags_acc"])
|
||||
if "parser" in pipes:
|
||||
mean_acc.append((acc["uas"] + acc["las"]) / 2)
|
||||
if "ner" in pipes:
|
||||
mean_acc.append((acc["ents_p"] + acc["ents_r"] + acc["ents_f"]) / 3)
|
||||
if "textcat" in pipes:
|
||||
mean_acc.append(acc["textcat_score"])
|
||||
return sum(mean_acc) / len(mean_acc)
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
def _create_progress_bar(total):
|
||||
if int(os.environ.get("LOG_FRIENDLY", 0)):
|
||||
yield
|
||||
else:
|
||||
pbar = tqdm.tqdm(total=total, leave=False)
|
||||
yield pbar
|
||||
|
||||
|
||||
def _load_vectors(nlp, vectors):
    """Call util.load_model on `vectors`, passing the pipeline's vocab.

    The object returned by util.load_model is discarded.
    NOTE(review): presumably the call fills nlp.vocab in place - confirm
    against util.load_model.
    """
    util.load_model(vectors, vocab=nlp.vocab)
|
||||
|
||||
|
||||
def _load_pretrained_tok2vec(nlp, loc, base_components):
|
||||
"""Load pretrained weights for the 'token-to-vector' part of the component
|
||||
models, which is typically a CNN. See 'spacy pretrain'. Experimental.
|
||||
"""
|
||||
with loc.open("rb") as file_:
|
||||
weights_data = file_.read()
|
||||
loaded = []
|
||||
for name, component in nlp.pipeline:
|
||||
if hasattr(component, "model") and hasattr(component.model, "tok2vec"):
|
||||
if name in base_components:
|
||||
raise ValueError(Errors.E200.format(component=name))
|
||||
component.tok2vec.from_bytes(weights_data)
|
||||
loaded.append(name)
|
||||
return loaded
|
||||
|
||||
|
||||
def _collate_best_model(meta, output_path, components):
    """Assemble a "model-best" directory from the best checkpoint per component.

    "model-final" is copied to "model-best"; then, for every component for
    which a best checkpoint was found, the component directory is replaced by
    the one from that checkpoint and the component's metrics are copied from
    the checkpoint's accuracy.json into meta["accuracy"].

    meta (dict): Model meta; updated in place and written to model-best/meta.json.
    output_path (Path): Directory holding the checkpoints and "model-final".
    components (iterable): Names of the components to collate.
    RETURNS (Path): Path of the created "model-best" directory.
    """
    bests = {}
    meta.setdefault("accuracy", {})
    for component in components:
        bests[component] = _find_best(output_path, component)
    best_dest = output_path / "model-best"
    shutil.copytree(path2str(output_path / "model-final"), path2str(best_dest))
    for component, best_component_src in bests.items():
        if best_component_src is None:
            # _find_best found no checkpoint directory: keep the component
            # copied from model-final instead of failing on `None / component`.
            continue
        shutil.rmtree(path2str(best_dest / component))
        shutil.copytree(
            path2str(best_component_src / component), path2str(best_dest / component)
        )
        accs = srsly.read_json(best_component_src / "accuracy.json")
        for metric in _get_metrics(component):
            meta["accuracy"][metric] = accs[metric]
    srsly.write_json(best_dest / "meta.json", meta)
    return best_dest
|
||||
|
||||
|
||||
def _find_best(experiment_dir, component):
    """Pick the checkpoint directory with the highest scores for a component.

    Every sub-directory of `experiment_dir` except "model-final" is read via
    its accuracy.json; candidates are ranked by the list of float-valued
    metrics of the component (missing metrics count as 0.0).

    RETURNS: The best checkpoint path, or None if there is no candidate.
    """
    metrics = _get_metrics(component)
    candidates = []
    for model_dir in experiment_dir.iterdir():
        if not model_dir.is_dir() or model_dir.parts[-1] == "model-final":
            continue
        accs = srsly.read_json(model_dir / "accuracy.json")
        values = (accs.get(metric, 0.0) for metric in metrics)
        # remove per_type dicts from score list for max() comparison
        scores = [value for value in values if isinstance(value, float)]
        candidates.append((scores, model_dir))
    return max(candidates)[1] if candidates else None
|
||||
|
||||
|
||||
def _get_metrics(component):
|
||||
if component == "parser":
|
||||
return ("las", "uas", "las_per_type", "token_acc")
|
||||
elif component == "tagger":
|
||||
return ("tags_acc", "token_acc")
|
||||
elif component == "ner":
|
||||
return ("ents_f", "ents_p", "ents_r", "ents_per_type", "token_acc")
|
||||
elif component == "textcat":
|
||||
return ("textcat_score", "token_acc")
|
||||
return ("token_acc",)
|
||||
|
||||
|
||||
def _configure_training_output(pipeline, use_gpu, has_beam_widths):
|
||||
row_head = ["Itn"]
|
||||
output_stats = []
|
||||
for pipe in pipeline:
|
||||
if pipe == "tagger":
|
||||
row_head.extend(["Tag Loss ", " Tag % "])
|
||||
output_stats.extend(["tag_loss", "tags_acc"])
|
||||
elif pipe == "parser":
|
||||
row_head.extend(["Dep Loss ", " UAS ", " LAS "])
|
||||
output_stats.extend(["dep_loss", "uas", "las"])
|
||||
elif pipe == "ner":
|
||||
row_head.extend(["NER Loss ", "NER P ", "NER R ", "NER F "])
|
||||
output_stats.extend(["ner_loss", "ents_p", "ents_r", "ents_f"])
|
||||
elif pipe == "textcat":
|
||||
row_head.extend(["Textcat Loss", "Textcat"])
|
||||
output_stats.extend(["textcat_loss", "textcat_score"])
|
||||
row_head.extend(["Token %", "CPU WPS"])
|
||||
output_stats.extend(["token_acc", "cpu_wps"])
|
||||
|
||||
if use_gpu >= 0:
|
||||
row_head.extend(["GPU WPS"])
|
||||
output_stats.extend(["gpu_wps"])
|
||||
|
||||
if has_beam_widths:
|
||||
row_head.insert(1, "Beam W.")
|
||||
return row_head, output_stats
|
||||
|
||||
|
||||
def _get_progress(
|
||||
itn, losses, dev_scores, output_stats, beam_width=None, cpu_wps=0.0, gpu_wps=0.0
|
||||
):
|
||||
scores = {}
|
||||
for stat in output_stats:
|
||||
scores[stat] = 0.0
|
||||
scores["dep_loss"] = losses.get("parser", 0.0)
|
||||
scores["ner_loss"] = losses.get("ner", 0.0)
|
||||
scores["tag_loss"] = losses.get("tagger", 0.0)
|
||||
scores["textcat_loss"] = losses.get("textcat", 0.0)
|
||||
scores["cpu_wps"] = cpu_wps
|
||||
scores["gpu_wps"] = gpu_wps or 0.0
|
||||
scores.update(dev_scores)
|
||||
formatted_scores = []
|
||||
for stat in output_stats:
|
||||
format_spec = "{:.3f}"
|
||||
if stat.endswith("_wps"):
|
||||
format_spec = "{:.0f}"
|
||||
formatted_scores.append(format_spec.format(scores[stat]))
|
||||
result = [itn + 1]
|
||||
result.extend(formatted_scores)
|
||||
if beam_width is not None:
|
||||
result.insert(1, beam_width)
|
||||
return result
|
||||
|
||||
|
||||
def _get_total_speed(speeds):
|
||||
seconds_per_word = 0.0
|
||||
for words_per_second in speeds:
|
||||
if words_per_second is None:
|
||||
return None
|
||||
seconds_per_word += 1.0 / words_per_second
|
||||
return 1.0 / seconds_per_word
|
606
spacy/cli/train_from_config.py
Normal file
606
spacy/cli/train_from_config.py
Normal file
|
@ -0,0 +1,606 @@
|
|||
from typing import Optional, Dict, List, Union, Sequence
|
||||
from timeit import default_timer as timer
|
||||
|
||||
import srsly
|
||||
from pydantic import BaseModel, FilePath
|
||||
import tqdm
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
import thinc
|
||||
import thinc.schedules
|
||||
from thinc.api import Model, use_pytorch_for_gpu_memory
|
||||
import random
|
||||
|
||||
from ..gold import GoldCorpus
|
||||
from ..lookups import Lookups
|
||||
from .. import util
|
||||
from ..errors import Errors
|
||||
|
||||
# Don't remove - required to load the built-in architectures
|
||||
from ..ml import models # noqa: F401
|
||||
|
||||
registry = util.registry
|
||||
|
||||
CONFIG_STR = """
|
||||
[training]
|
||||
patience = 10
|
||||
eval_frequency = 10
|
||||
dropout = 0.2
|
||||
init_tok2vec = null
|
||||
max_epochs = 100
|
||||
orth_variant_level = 0.0
|
||||
gold_preproc = false
|
||||
max_length = 0
|
||||
use_gpu = 0
|
||||
scores = ["ents_p", "ents_r", "ents_f"]
|
||||
score_weights = {"ents_f": 1.0}
|
||||
limit = 0
|
||||
|
||||
[training.batch_size]
|
||||
@schedules = "compounding.v1"
|
||||
start = 100
|
||||
stop = 1000
|
||||
compound = 1.001
|
||||
|
||||
[optimizer]
|
||||
@optimizers = "Adam.v1"
|
||||
learn_rate = 0.001
|
||||
beta1 = 0.9
|
||||
beta2 = 0.999
|
||||
|
||||
[nlp]
|
||||
lang = "en"
|
||||
vectors = null
|
||||
|
||||
[nlp.pipeline.tok2vec]
|
||||
factory = "tok2vec"
|
||||
|
||||
[nlp.pipeline.ner]
|
||||
factory = "ner"
|
||||
|
||||
[nlp.pipeline.ner.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 3
|
||||
hidden_width = 64
|
||||
maxout_pieces = 3
|
||||
|
||||
[nlp.pipeline.ner.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.tok2vec.model]
|
||||
@architectures = "spacy.HashEmbedCNN.v1"
|
||||
pretrained_vectors = ${nlp:vectors}
|
||||
width = 128
|
||||
depth = 4
|
||||
window_size = 1
|
||||
embed_size = 10000
|
||||
maxout_pieces = 3
|
||||
subword_features = true
|
||||
"""
|
||||
|
||||
|
||||
class PipelineComponent(BaseModel):
    """Pydantic model for one pipeline component entry: a factory name plus
    a thinc Model instance.
    """

    # Name of the factory (a string such as "ner" in CONFIG_STR).
    factory: str
    # thinc Model object; not a type pydantic knows how to validate natively.
    model: Model

    class Config:
        # Required so pydantic accepts the non-pydantic `Model` type above.
        arbitrary_types_allowed = True
|
||||
|
||||
|
||||
class ConfigSchema(BaseModel):
    """Pydantic schema with an `optimizer` field and nested `training` and
    `nlp` models whose fields mirror the sections of CONFIG_STR.

    NOTE(review): this class is not referenced elsewhere in this module.
    """

    # NOTE(review): "Optimizer" is a forward reference to a name that is not
    # imported in this module - confirm it can be resolved.
    optimizer: Optional["Optimizer"]

    class training(BaseModel):
        # Defaults for the [training] section.
        patience: int = 10
        eval_frequency: int = 100
        dropout: float = 0.2
        init_tok2vec: Optional[FilePath] = None
        max_epochs: int = 100
        orth_variant_level: float = 0.0
        gold_preproc: bool = False
        max_length: int = 0
        use_gpu: int = 0
        scores: List[str] = ["ents_p", "ents_r", "ents_f"]
        score_weights: Dict[str, Union[int, float]] = {"ents_f": 1.0}
        limit: int = 0
        # Either a fixed size or a sequence of sizes; no default.
        batch_size: Union[Sequence[int], int]

    class nlp(BaseModel):
        # Fields of the [nlp] section.
        lang: str
        vectors: Optional[str]
        pipeline: Optional[Dict[str, PipelineComponent]]

    class Config:
        # Accept keys that are not declared in the schema.
        extra = "allow"
|
||||
|
||||
|
||||
def train_cli(
    # fmt: off
    train_path: ("Location of JSON-formatted training data", "positional", None, Path),
    dev_path: ("Location of JSON-formatted development data", "positional", None, Path),
    config_path: ("Path to config file", "positional", None, Path),
    output_path: ("Output directory to store model in", "option", "o", Path) = None,
    init_tok2vec: ("Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None,
    raw_text: ("Path to jsonl file with unlabelled text documents.", "option", "rt", Path) = None,
    verbose: ("Display more information for debugging purposes", "flag", "VV", bool) = False,
    use_gpu: ("Use GPU", "option", "g", int) = -1,
    tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None,
    omit_extra_lookups: ("Don't include extra lookups in model", "flag", "OEL", bool) = False,
    # fmt: on
):
    """
    Train or update a spaCy model. Requires data to be formatted in spaCy's
    JSON format. To convert data from other formats, use the `spacy convert`
    command.
    """
    # The docstring above is kept verbatim: it may be surfaced as CLI help text.
    # This wrapper validates the paths, loads the optional inputs from disk,
    # selects the device and then delegates to train().
    util.set_env_log(verbose)

    # Make sure all files and paths exists if they are needed
    if not config_path or not config_path.exists():
        msg.fail("Config file not found", config_path, exits=1)
    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)
    if output_path is not None:
        if not output_path.exists():
            output_path.mkdir()
            msg.good(f"Created output directory: {output_path}")
        elif output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
            # Only sub-directories count as "not empty" here.
            msg.warn(
                "Output directory is not empty.",
                "This can lead to unintended side effects when saving the model. "
                "Please use an empty directory or a different path instead. If "
                "the specified output path doesn't exist, the directory will be "
                "created for you.",
            )
    if raw_text is not None:
        # Replace the path with the loaded list of JSONL records.
        raw_text = list(srsly.read_jsonl(raw_text))
    tag_map = {}
    if tag_map_path is not None:
        tag_map = srsly.read_json(tag_map_path)

    weights_data = None
    if init_tok2vec is not None:
        if not init_tok2vec.exists():
            msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
        with init_tok2vec.open("rb") as file_:
            weights_data = file_.read()

    if use_gpu >= 0:
        # Fixed: the message lacked the f-prefix and printed "{use_gpu}" literally.
        msg.info(f"Using GPU: {use_gpu}")
        util.use_gpu(use_gpu)
    else:
        msg.info("Using CPU")

    train(
        config_path,
        {"train": train_path, "dev": dev_path},
        output_path=output_path,
        raw_text=raw_text,
        tag_map=tag_map,
        weights_data=weights_data,
        omit_extra_lookups=omit_extra_lookups,
    )
|
||||
|
||||
|
||||
def train(
    config_path,
    data_paths,
    raw_text=None,
    output_path=None,
    tag_map=None,
    weights_data=None,
    omit_extra_lookups=False,
):
    """Train a pipeline described by a config file.

    config_path: Path of the config file, loaded via util.load_config.
    data_paths (dict): Must provide the keys "train" and "dev".
    raw_text (list): Optional records with a "text" key, used for rehearsal.
    output_path (Path): If given, "model-best" and "model-final" are saved here.
    tag_map (dict): Optional entries merged into the vocab's tag map.
    weights_data (bytes): Optional pretrained tok2vec weights.
    omit_extra_lookups (bool): Install empty extra lexeme tables on the vocab.
    """
    msg.info(f"Loading config from: {config_path}")
    # Read the config first without creating objects, to get to the original nlp_config
    config = util.load_config(config_path, create_objects=False)
    util.fix_random_seed(config["training"]["seed"])
    if config["training"].get("use_pytorch_for_gpu_memory"):
        # It feels kind of weird to not have a default for this.
        use_pytorch_for_gpu_memory()
    nlp_config = config["nlp"]
    # Second load: this time with the registered objects instantiated.
    config = util.load_config(config_path, create_objects=True)
    training = config["training"]
    msg.info("Creating nlp from config")
    nlp = util.load_model_from_config(nlp_config)
    optimizer = training["optimizer"]
    limit = training["limit"]
    msg.info("Loading training corpus")
    corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit)

    # verify textcat config
    if "textcat" in nlp_config["pipeline"]:
        textcat_labels = set(nlp.get_pipe("textcat").labels)
        textcat_multilabel = not nlp_config["pipeline"]["textcat"]["model"][
            "exclusive_classes"
        ]

        # check whether the setting 'exclusive_classes' corresponds to the provided training data
        if textcat_multilabel:
            multilabel_found = False
            for ex in corpus.train_examples:
                cats = ex.doc_annotation.cats
                textcat_labels.update(cats.keys())
                if list(cats.values()).count(1.0) != 1:
                    multilabel_found = True
            if not multilabel_found:
                msg.warn(
                    "The textcat training instances look like they have "
                    "mutually exclusive classes. Set 'exclusive_classes' "
                    "to 'true' in the config to train a classifier with "
                    "mutually exclusive classes more accurately."
                )
        else:
            for ex in corpus.train_examples:
                cats = ex.doc_annotation.cats
                textcat_labels.update(cats.keys())
                if list(cats.values()).count(1.0) != 1:
                    # NOTE(review): no `exits=1` here, so this reports the
                    # problem per example and continues - confirm intended.
                    msg.fail(
                        "Some textcat training instances do not have exactly "
                        "one positive label. Set 'exclusive_classes' "
                        "to 'false' in the config to train a classifier with classes "
                        "that are not mutually exclusive."
                    )
        msg.info(
            f"Initialized textcat component for {len(textcat_labels)} unique labels"
        )
        nlp.get_pipe("textcat").labels = tuple(textcat_labels)

        # if 'positive_label' is provided: double check whether it's in the data and the task is binary
        if nlp_config["pipeline"]["textcat"].get("positive_label", None):
            textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", [])
            pos_label = nlp_config["pipeline"]["textcat"]["positive_label"]
            if pos_label not in textcat_labels:
                msg.fail(
                    f"The textcat's 'positive_label' config setting '{pos_label}' "
                    f"does not match any label in the training data.",
                    exits=1,
                )
            if len(textcat_labels) != 2:
                msg.fail(
                    f"A textcat 'positive_label' '{pos_label}' was "
                    f"provided for training data that does not appear to be a "
                    f"binary classification problem with two labels.",
                    exits=1,
                )

    if training.get("resume", False):
        msg.info("Resuming training")
        nlp.resume_training()
    else:
        msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
        nlp.begin_training(lambda: corpus.train_examples)

    # Update tag map with provided mapping
    # (guarded: `tag_map` defaults to None, which must not be passed to update)
    if tag_map:
        nlp.vocab.morphology.tag_map.update(tag_map)

    # Create empty extra lexeme tables so the data from spacy-lookups-data
    # isn't loaded if these features are accessed
    if omit_extra_lookups:
        nlp.vocab.lookups_extra = Lookups()
        nlp.vocab.lookups_extra.add_table("lexeme_cluster")
        nlp.vocab.lookups_extra.add_table("lexeme_prob")
        nlp.vocab.lookups_extra.add_table("lexeme_settings")

    # Load a pretrained tok2vec model - cf. CLI command 'pretrain'
    if weights_data is not None:
        tok2vec_path = config.get("pretraining", {}).get("tok2vec_model", None)
        if tok2vec_path is None:
            msg.fail(
                f"To use a pretrained tok2vec model, the config needs to specify which "
                f"tok2vec layer to load in the setting [pretraining.tok2vec_model].",
                exits=1,
            )
        # Walk the dotted path through the loaded config to reach the layer.
        tok2vec = config
        for subpath in tok2vec_path.split("."):
            tok2vec = tok2vec.get(subpath)
        if not tok2vec:
            msg.fail(
                f"Could not locate the tok2vec model at {tok2vec_path}.", exits=1,
            )
        tok2vec.from_bytes(weights_data)

    train_batches = create_train_batches(nlp, corpus, training)
    evaluate = create_evaluation_callback(nlp, optimizer, corpus, training)

    # Create iterator, which yields out info after each optimization step.
    msg.info("Start training")
    training_step_iterator = train_while_improving(
        nlp,
        optimizer,
        train_batches,
        evaluate,
        dropout=training["dropout"],
        accumulate_gradient=training["accumulate_gradient"],
        patience=training.get("patience", 0),
        max_steps=training.get("max_steps", 0),
        eval_frequency=training["eval_frequency"],
        raw_text=raw_text,
    )

    msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
    print_row = setup_printer(training, nlp)

    try:
        progress = tqdm.tqdm(total=training["eval_frequency"], leave=False)
        for batch, info, is_best_checkpoint in training_step_iterator:
            progress.update(1)
            if is_best_checkpoint is not None:
                # An evaluation happened: print its row and restart the bar.
                progress.close()
                print_row(info)
                if is_best_checkpoint and output_path is not None:
                    update_meta(training, nlp, info)
                    nlp.to_disk(output_path / "model-best")
                progress = tqdm.tqdm(total=training["eval_frequency"], leave=False)
            # Clean up the objects to facilitate garbage collection.
            for eg in batch:
                eg.doc = None
                eg.goldparse = None
                eg.doc_annotation = None
                eg.token_annotation = None
    except Exception as e:
        msg.warn(
            f"Aborting and saving the final best model. "
            f"Encountered exception: {str(e)}",
            exits=1,
        )
    finally:
        # Runs on normal completion and on abort: always save the final model.
        if output_path is not None:
            final_model_path = output_path / "model-final"
            if optimizer.averages:
                with nlp.use_params(optimizer.averages):
                    nlp.to_disk(final_model_path)
            else:
                nlp.to_disk(final_model_path)
            msg.good(f"Saved model to output directory {final_model_path}")
|
||||
|
||||
|
||||
def create_train_batches(nlp, corpus, cfg):
    """Yield training batches, epoch after epoch.

    Each epoch re-reads the training dataset from `corpus`, shuffles it and
    splits it with util.minibatch_by_words. Iteration stops once
    cfg["max_epochs"] epochs are done; 0 (the default) never stops.

    RAISES (ValueError): If the dataset is empty or produces no batch.
    """
    remaining_epochs = cfg.get("max_epochs", 0)
    while True:
        dataset = corpus.train_dataset(
            nlp,
            noise_level=0.0,  # I think this is deprecated?
            orth_variant_level=cfg["orth_variant_level"],
            gold_preproc=cfg["gold_preproc"],
            max_length=cfg["max_length"],
            ignore_misaligned=True,
        )
        train_examples = list(dataset)
        if not train_examples:
            raise ValueError(Errors.E988)
        random.shuffle(train_examples)
        batches = util.minibatch_by_words(
            train_examples,
            size=cfg["batch_size"],
            discard_oversize=cfg["discard_oversize"],
        )
        # make sure the minibatch_by_words result is not empty, or we'll have an infinite training loop
        try:
            first = next(batches)
            yield first
        except StopIteration:
            raise ValueError(Errors.E986)
        yield from batches
        remaining_epochs -= 1
        # We intentionally compare exactly to 0 here, so that max_epochs < 1
        # will not break.
        if remaining_epochs == 0:
            break
|
||||
|
||||
|
||||
def create_evaluation_callback(nlp, optimizer, corpus, cfg):
    """Create the zero-argument evaluation callback used during training.

    The returned callable evaluates `nlp` on the dev set of `corpus` (using
    the optimizer's averaged parameters when present) and returns
    `(weighted_score, scores)`, where the weighted score is the sum of the
    scores named in cfg["score_weights"] times their weights, and
    scores["speed"] holds the measured words per second.
    """

    def evaluate():
        dev_examples = list(
            corpus.dev_dataset(
                nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True
            )
        )
        n_words = sum(len(ex.doc) for ex in dev_examples)

        def run_evaluation():
            return nlp.evaluate(dev_examples, batch_size=32)

        start_time = timer()
        if optimizer.averages:
            with nlp.use_params(optimizer.averages):
                scorer = run_evaluation()
        else:
            scorer = run_evaluation()
        elapsed = timer() - start_time
        wps = n_words / elapsed
        scores = scorer.scores
        # Calculate a weighted sum based on score_weights for the main score
        weights = cfg["score_weights"]
        try:
            weighted_score = sum(
                scores[name] * weight for name, weight in weights.items()
            )
        except KeyError as e:
            raise KeyError(
                Errors.E983.format(
                    dict_name="score_weights", key=str(e), keys=list(scores.keys())
                )
            )
        scores["speed"] = wps
        return weighted_score, scores

    return evaluate
|
||||
|
||||
|
||||
def train_while_improving(
    nlp,
    optimizer,
    train_data,
    evaluate,
    *,
    dropout,
    eval_frequency,
    accumulate_gradient=1,
    patience=0,
    max_steps=0,
    raw_text=None,
):
    """Train until an evaluation stops improving. Works as a generator,
    with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
    where info is a dict, and is_best_checkpoint is in [True, False, None] --
    None indicating that the iteration was not evaluated as a checkpoint.
    The evaluation is conducted by calling the evaluate callback, which should
    return a tuple `(main_score, other_scores)`.

    Positional arguments:
        nlp: The spaCy pipeline to evaluate.
        optimizer: The optimizer callable.
        train_data (Iterable[Batch]): A generator of batches, with the training
            data. Each batch should be a Sized[Tuple[Input, Annot]]. The training
            data iterable needs to take care of iterating over the epochs and
            shuffling.
        evaluate (Callable[[], Tuple[float, Any]]): A callback to perform evaluation.
            The callback should take no arguments and return a tuple
            `(main_score, other_scores)`. The main_score should be a float where
            higher is better. other_scores can be any object.

    Keyword-only arguments:
        dropout: A float (used as a constant rate) or an iterator of rates.
        eval_frequency (int): Evaluate every `eval_frequency` steps.
        accumulate_gradient (int): Number of sub-batches each batch is split into.
        patience (int): Stop after this many steps without a new best
            evaluation; 0 disables the check.
        max_steps (int): Stop once `step * accumulate_gradient` reaches this
            value; 0 disables the check.
        raw_text (list): Optional records with a "text" key, used for
            rehearsal updates. The list is shuffled in place.

    Every iteration, the function yields out a tuple with:

    * batch: A zipped sequence of Tuple[Doc, GoldParse] pairs.
    * info: A dict with various information about the last update (see below).
    * is_best_checkpoint: A value in None, False, True, indicating whether this
        was the best evaluation so far. You should use this to save the model
        checkpoints during training. If None, evaluation was not conducted on
        that iteration. False means evaluation was conducted, but a previous
        evaluation was better.

    The info dict provides the following information:

        step (int): How many steps have been completed.
        score (float): The main score from the last evaluation, or None.
        other_scores: The other scores from the last evaluation, or None.
        losses: The losses accumulated since the last evaluation.
        checkpoints: A list of previous results, where each result is a
            (score, step) tuple.
    """
    if isinstance(dropout, float):
        dropouts = thinc.schedules.constant(dropout)
    else:
        dropouts = dropout
    results = []
    losses = {}
    to_enable = [name for name, proc in nlp.pipeline if hasattr(proc, "model")]

    def _new_raw_batches():
        # Shuffle the raw texts and start a fresh pass over them.
        random.shuffle(raw_text)
        return util.minibatch(
            (nlp.make_doc(rt["text"]) for rt in raw_text), size=8
        )

    raw_batches = _new_raw_batches() if raw_text else None

    for step, batch in enumerate(train_data):
        dropout = next(dropouts)
        with nlp.select_pipes(enable=to_enable):
            for subbatch in subdivide_batch(batch, accumulate_gradient):
                # sgd=False: gradients are accumulated and applied below.
                nlp.update(subbatch, drop=dropout, losses=losses, sgd=False)
            if raw_text:
                # If raw text is available, perform 'rehearsal' updates,
                # which use unlabelled data to reduce overfitting.
                raw_batch = next(raw_batches, None)
                if raw_batch is None:
                    # The raw texts are exhausted: start over instead of
                    # letting StopIteration escape from this generator.
                    raw_batches = _new_raw_batches()
                    raw_batch = next(raw_batches, None)
                if raw_batch is not None:
                    nlp.rehearse(list(raw_batch), sgd=optimizer, losses=losses)
            for name, proc in nlp.pipeline:
                if hasattr(proc, "model"):
                    proc.model.finish_update(optimizer)
        optimizer.step_schedules()
        if not (step % eval_frequency):
            score, other_scores = evaluate()
            results.append((score, step))
            is_best_checkpoint = score == max(results)[0]
        else:
            score, other_scores = (None, None)
            is_best_checkpoint = None
        info = {
            "step": step,
            "score": score,
            "other_scores": other_scores,
            "losses": losses,
            "checkpoints": results,
        }
        yield batch, info, is_best_checkpoint
        if is_best_checkpoint is not None:
            # Start a fresh loss accumulator after every evaluation.
            losses = {}
        # Stop if no improvement in `patience` updates (if specified)
        best_score, best_step = max(results)
        if patience and (step - best_step) >= patience:
            break
        # Stop if we've exhausted our max steps (if specified)
        if max_steps and (step * accumulate_gradient) >= max_steps:
            break
|
||||
|
||||
|
||||
def subdivide_batch(batch, accumulate_gradient):
    """Split a batch into sub-batches for gradient accumulation.

    The examples are ordered by document length and cut into
    `accumulate_gradient` slices of equal size; any remainder is yielded as
    one extra, final sub-batch. Empty slices are never yielded.
    """
    ordered = sorted(batch, key=lambda eg: len(eg.doc))
    chunk_size = len(ordered) // accumulate_gradient
    offset = 0
    for _ in range(accumulate_gradient):
        chunk = ordered[offset : offset + chunk_size]
        if chunk:
            yield chunk
        offset += len(chunk)
    leftover = ordered[offset:]
    if leftover:
        yield leftover
|
||||
|
||||
|
||||
def setup_printer(training, nlp):
    """Print the table header and return a function that prints result rows.

    Columns are: step number, one loss column per pipe in `nlp.pipe_names`,
    one column per entry of training["scores"], and the main score.

    RETURNS (Callable[[dict], None]): `print_row(info)`, which prints one
        row from an info dict with "step", "losses", "other_scores" and "score".
    """
    score_cols = training["scores"]
    loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names]
    loss_widths = [max(len(col), 8) for col in loss_cols]
    score_widths = [max(len(col), 6) for col in score_cols]
    table_widths = [6] + loss_widths + score_widths + [6]
    table_aligns = ["r"] * len(table_widths)
    table_header = [
        col.upper() for col in ["#"] + loss_cols + score_cols + ["Score"]
    ]

    msg.row(table_header, widths=table_widths)
    msg.row(["-" * width for width in table_widths])

    def two_decimals(value):
        return "{0:.2f}".format(float(value))

    def print_row(info):
        try:
            losses = [
                two_decimals(info["losses"][pipe_name])
                for pipe_name in nlp.pipe_names
            ]
        except KeyError as e:
            raise KeyError(
                Errors.E983.format(
                    dict_name="scores (losses)",
                    key=str(e),
                    keys=list(info["losses"].keys()),
                )
            )

        try:
            scores = [two_decimals(info["other_scores"][col]) for col in score_cols]
        except KeyError as e:
            raise KeyError(
                Errors.E983.format(
                    dict_name="scores (other)",
                    key=str(e),
                    keys=list(info["other_scores"].keys()),
                )
            )
        data = [info["step"]] + losses + scores + [two_decimals(info["score"])]
        msg.row(data, widths=table_widths, aligns=table_aligns)

    return print_row
|
||||
|
||||
|
||||
def update_meta(training, nlp, info):
    """Store the latest scores and losses in nlp.meta["performance"].

    The previous "performance" entry is replaced. It receives one key per
    metric in training["scores"] (from info["other_scores"]) and one
    "<pipe_name>_loss" key per pipe (from info["losses"]).
    """
    performance = {}
    nlp.meta["performance"] = performance
    other_scores = info["other_scores"]
    for metric in training["scores"]:
        performance[metric] = other_scores[metric]
    for pipe_name in nlp.pipe_names:
        performance[f"{pipe_name}_loss"] = info["losses"][pipe_name]
|
|
@ -1,15 +1,11 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
from pathlib import Path
|
||||
import sys
|
||||
import requests
|
||||
import srsly
|
||||
from wasabi import msg
|
||||
|
||||
from ..compat import path2str
|
||||
from ..util import get_data_path
|
||||
from .. import about
|
||||
from ..util import get_package_version, get_installed_models, get_base_version
|
||||
from ..util import get_package_path, get_model_meta, is_compatible_version
|
||||
|
||||
|
||||
def validate():
|
||||
|
@ -17,51 +13,30 @@ def validate():
|
|||
Validate that the currently installed version of spaCy is compatible
|
||||
with the installed models. Should be run after `pip install -U spacy`.
|
||||
"""
|
||||
with msg.loading("Loading compatibility table..."):
|
||||
r = requests.get(about.__compatibility__)
|
||||
if r.status_code != 200:
|
||||
msg.fail(
|
||||
"Server error ({})".format(r.status_code),
|
||||
"Couldn't fetch compatibility table.",
|
||||
exits=1,
|
||||
)
|
||||
msg.good("Loaded compatibility table")
|
||||
compat = r.json()["spacy"]
|
||||
version = about.__version__
|
||||
version = version.rsplit(".dev", 1)[0]
|
||||
current_compat = compat.get(version)
|
||||
model_pkgs, compat = get_model_pkgs()
|
||||
spacy_version = get_base_version(about.__version__)
|
||||
current_compat = compat.get(spacy_version, {})
|
||||
if not current_compat:
|
||||
msg.fail(
|
||||
"Can't find spaCy v{} in compatibility table".format(version),
|
||||
about.__compatibility__,
|
||||
exits=1,
|
||||
)
|
||||
all_models = set()
|
||||
for spacy_v, models in dict(compat).items():
|
||||
all_models.update(models.keys())
|
||||
for model, model_vs in models.items():
|
||||
compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
|
||||
model_links = get_model_links(current_compat)
|
||||
model_pkgs = get_model_pkgs(current_compat, all_models)
|
||||
incompat_links = {l for l, d in model_links.items() if not d["compat"]}
|
||||
msg.warn(f"No compatible models found for v{spacy_version} of spaCy")
|
||||
incompat_models = {d["name"] for _, d in model_pkgs.items() if not d["compat"]}
|
||||
incompat_models.update(
|
||||
[d["name"] for _, d in model_links.items() if not d["compat"]]
|
||||
)
|
||||
na_models = [m for m in incompat_models if m not in current_compat]
|
||||
update_models = [m for m in incompat_models if m in current_compat]
|
||||
spacy_dir = Path(__file__).parent.parent
|
||||
|
||||
msg.divider("Installed models (spaCy v{})".format(about.__version__))
|
||||
msg.info("spaCy installation: {}".format(path2str(spacy_dir)))
|
||||
msg.divider(f"Installed models (spaCy v{about.__version__})")
|
||||
msg.info(f"spaCy installation: {spacy_dir}")
|
||||
|
||||
if model_links or model_pkgs:
|
||||
header = ("TYPE", "NAME", "MODEL", "VERSION", "")
|
||||
if model_pkgs:
|
||||
header = ("NAME", "SPACY", "VERSION", "")
|
||||
rows = []
|
||||
for name, data in model_pkgs.items():
|
||||
rows.append(get_model_row(current_compat, name, data, msg))
|
||||
for name, data in model_links.items():
|
||||
rows.append(get_model_row(current_compat, name, data, msg, "link"))
|
||||
if data["compat"]:
|
||||
comp = msg.text("", color="green", icon="good", no_print=True)
|
||||
version = msg.text(data["version"], color="green", no_print=True)
|
||||
else:
|
||||
version = msg.text(data["version"], color="red", no_print=True)
|
||||
comp = f"--> {compat.get(data['name'], ['n/a'])[0]}"
|
||||
rows.append((data["name"], data["spacy"], version, comp))
|
||||
msg.table(rows, header=header)
|
||||
else:
|
||||
msg.text("No models found in your current environment.", exits=0)
|
||||
|
@ -71,75 +46,51 @@ def validate():
|
|||
cmd = "python -m spacy download {}"
|
||||
print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
|
||||
if na_models:
|
||||
msg.text(
|
||||
"The following models are not available for spaCy "
|
||||
"v{}: {}".format(about.__version__, ", ".join(na_models))
|
||||
msg.info(
|
||||
f"The following models are custom spaCy models or not "
|
||||
f"available for spaCy v{about.__version__}:",
|
||||
", ".join(na_models),
|
||||
)
|
||||
if incompat_links:
|
||||
msg.text(
|
||||
"You may also want to overwrite the incompatible links using the "
|
||||
"`python -m spacy link` command with `--force`, or remove them "
|
||||
"from the data directory. "
|
||||
"Data path: {path}".format(path=path2str(get_data_path()))
|
||||
)
|
||||
if incompat_models or incompat_links:
|
||||
if incompat_models:
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def get_model_links(compat):
    """Gather metadata for the shortcut links found in the data directory.

    compat (dict): Compatibility table, handed through to `is_compat`.
    RETURNS (dict): Maps each link name (last path component of the model
        directory) to a dict with the keys "name", "version" and "compat".
        Empty if `get_data_path()` returns a falsy value.
    """
    data_dir = get_data_path()
    if not data_dir:
        return {}
    found = {}
    candidates = [entry for entry in data_dir.iterdir() if is_model_path(entry)]
    for candidate in candidates:
        meta_file = Path(candidate) / "meta.json"
        # Directories that ship no meta.json are skipped without complaint.
        if not meta_file.exists():
            continue
        meta = srsly.read_json(meta_file)
        link_name = candidate.parts[-1]
        # Full model name is "<lang>_<name>", as stored in the meta file.
        full_name = meta["lang"] + "_" + meta["name"]
        model_version = meta["version"]
        found[link_name] = {
            "name": full_name,
            "version": model_version,
            "compat": is_compat(compat, full_name, model_version),
        }
    return found
|
||||
|
||||
|
||||
def get_model_pkgs(compat, all_models):
|
||||
import pkg_resources
|
||||
|
||||
def get_model_pkgs():
|
||||
with msg.loading("Loading compatibility table..."):
|
||||
r = requests.get(about.__compatibility__)
|
||||
if r.status_code != 200:
|
||||
msg.fail(
|
||||
f"Server error ({r.status_code})",
|
||||
"Couldn't fetch compatibility table.",
|
||||
exits=1,
|
||||
)
|
||||
msg.good("Loaded compatibility table")
|
||||
compat = r.json()["spacy"]
|
||||
all_models = set()
|
||||
installed_models = get_installed_models()
|
||||
for spacy_v, models in dict(compat).items():
|
||||
all_models.update(models.keys())
|
||||
for model, model_vs in models.items():
|
||||
compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
|
||||
pkgs = {}
|
||||
for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
|
||||
for pkg_name in installed_models:
|
||||
package = pkg_name.replace("-", "_")
|
||||
if package in all_models:
|
||||
version = pkg_data.version
|
||||
pkgs[pkg_name] = {
|
||||
"name": package,
|
||||
"version": version,
|
||||
"compat": is_compat(compat, package, version),
|
||||
}
|
||||
return pkgs
|
||||
|
||||
|
||||
def get_model_row(compat, name, data, msg, model_type="package"):
    """Build one row of the installed-models table.

    compat (dict): Maps model names to a list of versions; the first entry is
        displayed after "-->" when the installed version is incompatible.
    name (str): Value for the row's second column.
    data (dict): Model info with the keys "name", "version" and "compat".
    msg: Printer object whose `text(..., no_print=True)` call returns the
        formatted string instead of printing it.
    model_type (str): Value for the row's first column, "package" by default.
    RETURNS (tuple): (model_type, name, data["name"], version, comp).
    """
    if data["compat"]:
        comp = msg.text("", color="green", icon="good", no_print=True)
        version = msg.text(data["version"], color="green", no_print=True)
    else:
        version = msg.text(data["version"], color="red", no_print=True)
        # Fall back to "n/a" both when the model is missing from the table and
        # when it is listed with an empty version list (the latter used to
        # raise IndexError on the [0] lookup).
        candidates = compat.get(data["name"]) or ["n/a"]
        comp = "--> {}".format(candidates[0])
    return (model_type, name, data["name"], version, comp)
|
||||
|
||||
|
||||
def is_model_path(model_path):
    """Check whether a path looks like a model directory.

    model_path (Path): The path to check.
    RETURNS (bool): True if the path is an existing directory whose name is
        not one of the cache directory names and doesn't start with a dot.
    """
    dirname = model_path.parts[-1]
    if not model_path.is_dir():
        return False
    # Hidden entries are never treated as models.
    if dirname.startswith("."):
        return False
    return dirname not in ("cache", "pycache", "__pycache__")
|
||||
|
||||
|
||||
def is_compat(compat, name, version):
    """Check whether a model version is listed for a model in the table.

    compat (dict): Maps model names to a container of versions.
    name (str): The model name to look up.
    version (str): The model version to check.
    RETURNS (bool): True if `name` is in the table and `version` is among
        the versions listed for it.
    """
    if name not in compat:
        return False
    return version in compat[name]
|
||||
version = get_package_version(pkg_name)
|
||||
if package in compat:
|
||||
is_compat = version in compat[package]
|
||||
spacy_version = about.__version__
|
||||
else:
|
||||
model_path = get_package_path(package)
|
||||
model_meta = get_model_meta(model_path)
|
||||
spacy_version = model_meta.get("spacy_version", "n/a")
|
||||
is_compat = is_compatible_version(about.__version__, spacy_version)
|
||||
pkgs[pkg_name] = {
|
||||
"name": package,
|
||||
"version": version,
|
||||
"spacy": spacy_version,
|
||||
"compat": is_compat,
|
||||
}
|
||||
return pkgs, compat
|
||||
|
||||
|
||||
def reformat_version(version):
|
||||
|
|
129
spacy/compat.py
129
spacy/compat.py
|
@ -1,4 +1,3 @@
|
|||
# coding: utf8
|
||||
"""
|
||||
Helpers for Python and platform compatibility. To distinguish them from
|
||||
the builtin functions, replacement functions are suffixed with an underscore,
|
||||
|
@ -6,15 +5,9 @@ e.g. `unicode_`.
|
|||
|
||||
DOCS: https://spacy.io/api/top-level#compat
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import os
|
||||
import sys
|
||||
import itertools
|
||||
import ast
|
||||
import types
|
||||
|
||||
from thinc.neural.util import copy_array
|
||||
from thinc.util import copy_array
|
||||
|
||||
try:
|
||||
import cPickle as pickle
|
||||
|
@ -36,91 +29,23 @@ try:
|
|||
except ImportError:
|
||||
cupy = None
|
||||
|
||||
try:
|
||||
from thinc.neural.optimizers import Optimizer # noqa: F401
|
||||
except ImportError:
|
||||
from thinc.neural.optimizers import Adam as Optimizer # noqa: F401
|
||||
from thinc.api import Optimizer # noqa: F401
|
||||
|
||||
pickle = pickle
|
||||
copy_reg = copy_reg
|
||||
CudaStream = CudaStream
|
||||
cupy = cupy
|
||||
copy_array = copy_array
|
||||
izip = getattr(itertools, "izip", zip)
|
||||
|
||||
is_windows = sys.platform.startswith("win")
|
||||
is_linux = sys.platform.startswith("linux")
|
||||
is_osx = sys.platform == "darwin"
|
||||
|
||||
# See: https://github.com/benjaminp/six/blob/master/six.py
|
||||
is_python2 = sys.version_info[0] == 2
|
||||
is_python3 = sys.version_info[0] == 3
|
||||
is_python_pre_3_5 = is_python2 or (is_python3 and sys.version_info[1] < 5)
|
||||
|
||||
if is_python2:
|
||||
bytes_ = str
|
||||
unicode_ = unicode # noqa: F821
|
||||
basestring_ = basestring # noqa: F821
|
||||
input_ = raw_input # noqa: F821
|
||||
path2str = lambda path: str(path).decode("utf8")
|
||||
class_types = (type, types.ClassType)
|
||||
|
||||
elif is_python3:
|
||||
bytes_ = bytes
|
||||
unicode_ = str
|
||||
basestring_ = str
|
||||
input_ = input
|
||||
path2str = lambda path: str(path)
|
||||
class_types = (type, types.ClassType) if is_python_pre_3_5 else type
|
||||
|
||||
|
||||
def b_to_str(b_str):
    """Convert a bytes object to a string.

    b_str (bytes): The object to convert.
    RETURNS (unicode): The converted string. On Python 2 the input is
        returned unchanged.
    """
    if not is_python2:
        # Important: the encoding must be given explicitly, otherwise str()
        # produces the repr-like "b'...'" form.
        return str(b_str, encoding="utf8")
    return b_str
|
||||
|
||||
|
||||
def symlink_to(orig, dest):
    """Create a symlink. Used for model shortcut links.

    On Windows this shells out to `mklink /d`; on every other platform it
    calls `orig.symlink_to(dest)`, so `orig` has to provide that method there
    (presumably a pathlib.Path -- confirm against callers).

    orig (unicode / Path): The origin path.
    dest (unicode / Path): The destination path of the symlink.
    """
    if not is_windows:
        orig.symlink_to(dest)
        return
    import subprocess

    link_cmd = ["mklink", "/d", path2str(orig), path2str(dest)]
    # mklink is a cmd.exe builtin, hence shell=True.
    subprocess.check_call(link_cmd, shell=True)
|
||||
|
||||
|
||||
def symlink_remove(link):
    """Remove a symlink. Used for model shortcut links.

    link (unicode / Path): The path to the symlink.
    """
    link_str = path2str(link)
    # Directory links on Windows go through rmdir instead of unlink, see
    # https://stackoverflow.com/q/26554135/6400719 (per the original note this
    # should only happen on Py2.7 and Windows).
    remove = os.rmdir if os.path.isdir(link_str) and is_windows else os.unlink
    remove(link_str)
|
||||
|
||||
|
||||
def is_config(python2=None, python3=None, windows=None, linux=None, osx=None):
|
||||
def is_config(windows=None, linux=None, osx=None, **kwargs):
|
||||
"""Check if a specific configuration of Python version and operating system
|
||||
matches the user's setup. Mostly used to display targeted error messages.
|
||||
|
||||
python2 (bool): spaCy is executed with Python 2.x.
|
||||
python3 (bool): spaCy is executed with Python 3.x.
|
||||
windows (bool): spaCy is executed on Windows.
|
||||
linux (bool): spaCy is executed on Linux.
|
||||
osx (bool): spaCy is executed on OS X or macOS.
|
||||
|
@ -129,53 +54,7 @@ def is_config(python2=None, python3=None, windows=None, linux=None, osx=None):
|
|||
DOCS: https://spacy.io/api/top-level#compat.is_config
|
||||
"""
|
||||
return (
|
||||
python2 in (None, is_python2)
|
||||
and python3 in (None, is_python3)
|
||||
and windows in (None, is_windows)
|
||||
windows in (None, is_windows)
|
||||
and linux in (None, is_linux)
|
||||
and osx in (None, is_osx)
|
||||
)
|
||||
|
||||
|
||||
def import_file(name, loc):
    """Import module from a file. Used to load models from a directory.

    name (unicode): Name of module to load.
    loc (unicode / Path): Path to the file.
    RETURNS: The loaded module.
    """
    file_path = path2str(loc)
    if is_python_pre_3_5:
        # Older interpreters go through the `imp` module.
        import imp

        return imp.load_source(name, file_path)
    import importlib.util

    module_spec = importlib.util.spec_from_file_location(name, str(file_path))
    loaded = importlib.util.module_from_spec(module_spec)
    # Run the module's code so that its attributes are populated.
    module_spec.loader.exec_module(loaded)
    return loaded
|
||||
|
||||
|
||||
def unescape_unicode(string):
    """Python2.7's re module chokes when compiling patterns that have ranges
    between escaped unicode codepoints if the two codepoints are unrecognised
    in the unicode database. For instance:

        re.compile('[\\uAA77-\\uAA79]').findall("hello")

    Ends up matching every character (on Python 2). This problem doesn't occur
    if we're dealing with unicode literals.

    string (unicode / None): Text containing escaped unicode codepoints.
    RETURNS (unicode / None): The text with its `\\uXXXX` and `\\UXXXXXXXX`
        escapes replaced by the characters they denote. All other
        backslashes and all quotes are kept as they are. None is returned
        unchanged.
    """
    if string is None:
        return string
    # We only want to unescape the unicode, so we first must protect the other
    # backslashes.
    string = string.replace("\\", "\\\\")
    # Now we remove that protection for the unicode.
    string = string.replace("\\\\u", "\\u")
    string = string.replace("\\\\U", "\\U")
    # Escape single quotes as well: text that ends in a quote, or contains
    # three quotes in a row, would otherwise close the triple-quoted literal
    # built below too early and make literal_eval raise SyntaxError.
    string = string.replace("'", "\\'")
    # Now we unescape by evaling the string with the AST. This can't execute
    # code -- it only does the representational level.
    return ast.literal_eval("u'''" + string + "'''")
|
||||
|
|
|
@ -1,17 +1,13 @@
|
|||
# coding: utf8
|
||||
"""
|
||||
spaCy's built in visualization suite for dependencies and named entities.
|
||||
|
||||
DOCS: https://spacy.io/api/top-level#displacy
|
||||
USAGE: https://spacy.io/usage/visualizers
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import warnings
|
||||
|
||||
from .render import DependencyRenderer, EntityRenderer
|
||||
from ..tokens import Doc, Span
|
||||
from ..compat import b_to_str
|
||||
from ..errors import Errors, Warnings
|
||||
from ..util import is_in_jupyter
|
||||
|
||||
|
@ -26,13 +22,13 @@ def render(
|
|||
"""Render displaCy visualisation.
|
||||
|
||||
docs (list or Doc): Document(s) to visualise.
|
||||
style (unicode): Visualisation style, 'dep' or 'ent'.
|
||||
style (str): Visualisation style, 'dep' or 'ent'.
|
||||
page (bool): Render markup as full HTML page.
|
||||
minify (bool): Minify HTML markup.
|
||||
jupyter (bool): Override Jupyter auto-detection.
|
||||
options (dict): Visualiser-specific options, e.g. colors.
|
||||
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
||||
RETURNS (unicode): Rendered HTML markup.
|
||||
RETURNS (str): Rendered HTML markup.
|
||||
|
||||
DOCS: https://spacy.io/api/top-level#displacy.render
|
||||
USAGE: https://spacy.io/usage/visualizers
|
||||
|
@ -77,13 +73,13 @@ def serve(
|
|||
"""Serve displaCy visualisation.
|
||||
|
||||
docs (list or Doc): Document(s) to visualise.
|
||||
style (unicode): Visualisation style, 'dep' or 'ent'.
|
||||
style (str): Visualisation style, 'dep' or 'ent'.
|
||||
page (bool): Render markup as full HTML page.
|
||||
minify (bool): Minify HTML markup.
|
||||
options (dict): Visualiser-specific options, e.g. colors.
|
||||
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
||||
port (int): Port to serve visualisation.
|
||||
host (unicode): Host to serve visualisation.
|
||||
host (str): Host to serve visualisation.
|
||||
|
||||
DOCS: https://spacy.io/api/top-level#displacy.serve
|
||||
USAGE: https://spacy.io/usage/visualizers
|
||||
|
@ -95,20 +91,20 @@ def serve(
|
|||
|
||||
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
|
||||
httpd = simple_server.make_server(host, port, app)
|
||||
print("\nUsing the '{}' visualizer".format(style))
|
||||
print("Serving on http://{}:{} ...\n".format(host, port))
|
||||
print(f"\nUsing the '{style}' visualizer")
|
||||
print(f"Serving on http://{host}:{port} ...\n")
|
||||
try:
|
||||
httpd.serve_forever()
|
||||
except KeyboardInterrupt:
|
||||
print("Shutting down server on port {}.".format(port))
|
||||
print(f"Shutting down server on port {port}.")
|
||||
finally:
|
||||
httpd.server_close()
|
||||
|
||||
|
||||
def app(environ, start_response):
|
||||
# Headers and status need to be bytes in Python 2, see #1227
|
||||
headers = [(b_to_str(b"Content-type"), b_to_str(b"text/html; charset=utf-8"))]
|
||||
start_response(b_to_str(b"200 OK"), headers)
|
||||
headers = [("Content-type", "text/html; charset=utf-8")]
|
||||
start_response("200 OK", headers)
|
||||
res = _html["parsed"].encode(encoding="utf-8")
|
||||
return [res]
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import uuid
|
||||
|
||||
from .templates import (
|
||||
|
@ -50,7 +47,7 @@ class DependencyRenderer(object):
|
|||
parsed (list): Dependency parses to render.
|
||||
page (bool): Render parses wrapped as full HTML page.
|
||||
minify (bool): Minify HTML markup.
|
||||
RETURNS (unicode): Rendered SVG or HTML markup.
|
||||
RETURNS (str): Rendered SVG or HTML markup.
|
||||
"""
|
||||
# Create a random ID prefix to make sure parses don't receive the
|
||||
# same ID, even if they're identical
|
||||
|
@ -61,7 +58,7 @@ class DependencyRenderer(object):
|
|||
settings = p.get("settings", {})
|
||||
self.direction = settings.get("direction", DEFAULT_DIR)
|
||||
self.lang = settings.get("lang", DEFAULT_LANG)
|
||||
render_id = "{}-{}".format(id_prefix, i)
|
||||
render_id = f"{id_prefix}-{i}"
|
||||
svg = self.render_svg(render_id, p["words"], p["arcs"])
|
||||
rendered.append(svg)
|
||||
if page:
|
||||
|
@ -81,7 +78,7 @@ class DependencyRenderer(object):
|
|||
render_id (int): Unique ID, typically index of document.
|
||||
words (list): Individual words and their tags.
|
||||
arcs (list): Individual arcs and their start, end, direction and label.
|
||||
RETURNS (unicode): Rendered SVG markup.
|
||||
RETURNS (str): Rendered SVG markup.
|
||||
"""
|
||||
self.levels = self.get_levels(arcs)
|
||||
self.highest_level = len(self.levels)
|
||||
|
@ -115,10 +112,10 @@ class DependencyRenderer(object):
|
|||
):
|
||||
"""Render individual word.
|
||||
|
||||
text (unicode): Word text.
|
||||
tag (unicode): Part-of-speech tag.
|
||||
text (str): Word text.
|
||||
tag (str): Part-of-speech tag.
|
||||
i (int): Unique ID, typically word index.
|
||||
RETURNS (unicode): Rendered SVG markup.
|
||||
RETURNS (str): Rendered SVG markup.
|
||||
"""
|
||||
y = self.offset_y + self.word_spacing
|
||||
x = self.offset_x + i * self.distance
|
||||
|
@ -134,12 +131,12 @@ class DependencyRenderer(object):
|
|||
def render_arrow(self, label, start, end, direction, i):
|
||||
"""Render individual arrow.
|
||||
|
||||
label (unicode): Dependency label.
|
||||
label (str): Dependency label.
|
||||
start (int): Index of start word.
|
||||
end (int): Index of end word.
|
||||
direction (unicode): Arrow direction, 'left' or 'right'.
|
||||
direction (str): Arrow direction, 'left' or 'right'.
|
||||
i (int): Unique ID, typically arrow index.
|
||||
RETURNS (unicode): Rendered SVG markup.
|
||||
RETURNS (str): Rendered SVG markup.
|
||||
"""
|
||||
if start < 0 or end < 0:
|
||||
error_args = dict(start=start, end=end, label=label, dir=direction)
|
||||
|
@ -182,7 +179,7 @@ class DependencyRenderer(object):
|
|||
y (int): Y-coordinate of arrow start and end point.
|
||||
y_curve (int): Y-coordinate of Cubic Bézier y_curve point.
|
||||
x_end (int): X-coordinate of arrow end point.
|
||||
RETURNS (unicode): Definition of the arc path ('d' attribute).
|
||||
RETURNS (str): Definition of the arc path ('d' attribute).
|
||||
"""
|
||||
template = "M{x},{y} C{x},{c} {e},{c} {e},{y}"
|
||||
if self.compact:
|
||||
|
@ -192,11 +189,11 @@ class DependencyRenderer(object):
|
|||
def get_arrowhead(self, direction, x, y, end):
|
||||
"""Render individual arrow head.
|
||||
|
||||
direction (unicode): Arrow direction, 'left' or 'right'.
|
||||
direction (str): Arrow direction, 'left' or 'right'.
|
||||
x (int): X-coordinate of arrow start point.
|
||||
y (int): Y-coordinate of arrow start and end point.
|
||||
end (int): X-coordinate of arrow end point.
|
||||
RETURNS (unicode): Definition of the arrow head path ('d' attribute).
|
||||
RETURNS (str): Definition of the arrow head path ('d' attribute).
|
||||
"""
|
||||
if direction == "left":
|
||||
pos1, pos2, pos3 = (x, x - self.arrow_width + 2, x + self.arrow_width - 2)
|
||||
|
@ -282,7 +279,7 @@ class EntityRenderer(object):
|
|||
parsed (list): Dependency parses to render.
|
||||
page (bool): Render parses wrapped as full HTML page.
|
||||
minify (bool): Minify HTML markup.
|
||||
RETURNS (unicode): Rendered HTML markup.
|
||||
RETURNS (str): Rendered HTML markup.
|
||||
"""
|
||||
rendered = []
|
||||
for i, p in enumerate(parsed):
|
||||
|
@ -303,9 +300,9 @@ class EntityRenderer(object):
|
|||
def render_ents(self, text, spans, title):
|
||||
"""Render entities in text.
|
||||
|
||||
text (unicode): Original text.
|
||||
text (str): Original text.
|
||||
spans (list): Individual entity spans and their start, end and label.
|
||||
title (unicode or None): Document title set in Doc.user_data['title'].
|
||||
title (str / None): Document title set in Doc.user_data['title'].
|
||||
"""
|
||||
markup = ""
|
||||
offset = 0
|
||||
|
|
|
@ -1,7 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
# Setting explicit height and max-width: none on the SVG is required for
|
||||
# Jupyter to render it properly in a cell
|
||||
|
||||
|
|
106
spacy/errors.py
106
spacy/errors.py
|
@ -1,7 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
def add_codes(err_cls):
|
||||
"""Add error codes to string messages via class attribute names."""
|
||||
|
||||
|
@ -93,7 +89,7 @@ class Warnings(object):
|
|||
"lemmatization rules or data. This means that the trained model "
|
||||
"may not be able to lemmatize correctly. If this is intentional "
|
||||
"or the language you're using doesn't have lemmatization data, "
|
||||
"please ignore this warning. If this is surprising, make sure you "
|
||||
"you can ignore this warning. If this is surprising, make sure you "
|
||||
"have the spacy-lookups-data package installed.")
|
||||
W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. "
|
||||
"'n_process' will be set to 1.")
|
||||
|
@ -135,6 +131,31 @@ class Warnings(object):
|
|||
"package installed. The languages with lexeme normalization tables "
|
||||
"are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.")
|
||||
|
||||
# TODO: fix numbering after merging develop into master
|
||||
W094 = ("Model '{model}' ({model_version}) specifies an under-constrained "
|
||||
"spaCy version requirement: {version}. This can lead to compatibility "
|
||||
"problems with older versions, or as new spaCy versions are "
|
||||
"released, because the model may say it's compatible when it's "
|
||||
'not. Consider changing the "spacy_version" in your meta.json to a '
|
||||
"version range, with a lower and upper pin. For example: {example}")
|
||||
W095 = ("Model '{model}' ({model_version}) requires spaCy {version} and is "
|
||||
"incompatible with the current version ({current}). This may lead "
|
||||
"to unexpected results or runtime errors. To resolve this, "
|
||||
"download a newer compatible model or retrain your custom model "
|
||||
"with the current spaCy version. For more details and available "
|
||||
"updates, run: python -m spacy validate")
|
||||
W096 = ("The method 'disable_pipes' has become deprecated - use 'select_pipes' "
|
||||
"instead.")
|
||||
W097 = ("No Model config was provided to create the '{name}' component, "
|
||||
"and no default configuration could be found either.")
|
||||
W098 = ("No Model config was provided to create the '{name}' component, "
|
||||
"so a default configuration was used.")
|
||||
W099 = ("Expected 'dict' type for the 'model' argument of pipe '{pipe}', "
|
||||
"but got '{type}' instead, so ignoring it.")
|
||||
W100 = ("Skipping unsupported morphological feature(s): {feature}. "
|
||||
"Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
|
||||
"string \"Field1=Value1,Value2|Field2=Value3\".")
|
||||
|
||||
|
||||
@add_codes
|
||||
class Errors(object):
|
||||
|
@ -156,7 +177,7 @@ class Errors(object):
|
|||
E007 = ("'{name}' already exists in pipeline. Existing names: {opts}")
|
||||
E008 = ("Some current components would be lost when restoring previous "
|
||||
"pipeline state. If you added components after calling "
|
||||
"`nlp.disable_pipes()`, you should remove them explicitly with "
|
||||
"`nlp.select_pipes()`, you should remove them explicitly with "
|
||||
"`nlp.remove_pipe()` before the pipeline is restored. Names of "
|
||||
"the new components: {names}")
|
||||
E009 = ("The `update` method expects same number of docs and golds, but "
|
||||
|
@ -217,7 +238,7 @@ class Errors(object):
|
|||
"the documentation:\nhttps://spacy.io/usage/models")
|
||||
E030 = ("Sentence boundaries unset. You can add the 'sentencizer' "
|
||||
"component to the pipeline with: "
|
||||
"nlp.add_pipe(nlp.create_pipe('sentencizer')) "
|
||||
"nlp.add_pipe(nlp.create_pipe('sentencizer')). "
|
||||
"Alternatively, add the dependency parser, or set sentence "
|
||||
"boundaries by setting doc[i].is_sent_start.")
|
||||
E031 = ("Invalid token: empty string ('') at position {i}.")
|
||||
|
@ -253,15 +274,10 @@ class Errors(object):
|
|||
E047 = ("Can't assign a value to unregistered extension attribute "
|
||||
"'{name}'. Did you forget to call the `set_extension` method?")
|
||||
E048 = ("Can't import language {lang} from spacy.lang: {err}")
|
||||
E049 = ("Can't find spaCy data directory: '{path}'. Check your "
|
||||
"installation and permissions, or use spacy.util.set_data_path "
|
||||
"to customise the location if necessary.")
|
||||
E050 = ("Can't find model '{name}'. It doesn't seem to be a shortcut "
|
||||
"link, a Python package or a valid path to a data directory.")
|
||||
E051 = ("Cant' load '{name}'. If you're using a shortcut link, make sure "
|
||||
"it points to a valid package (not just a data directory).")
|
||||
E050 = ("Can't find model '{name}'. It doesn't seem to be a Python "
|
||||
"package or a valid path to a data directory.")
|
||||
E052 = ("Can't find model directory: {path}")
|
||||
E053 = ("Could not read meta.json from {path}")
|
||||
E053 = ("Could not read {name} from {path}")
|
||||
E054 = ("No valid '{setting}' setting found in model meta.json.")
|
||||
E055 = ("Invalid ORTH value in exception:\nKey: {key}\nOrths: {orths}")
|
||||
E056 = ("Invalid tokenizer exception: ORTH values combined don't match "
|
||||
|
@ -379,8 +395,8 @@ class Errors(object):
|
|||
E108 = ("As of spaCy v2.1, the pipe name `sbd` has been deprecated "
|
||||
"in favor of the pipe name `sentencizer`, which does the same "
|
||||
"thing. For example, use `nlp.create_pipeline('sentencizer')`")
|
||||
E109 = ("Model for component '{name}' not initialized. Did you forget to "
|
||||
"load a model, or forget to call begin_training()?")
|
||||
E109 = ("Component '{name}' could not be run. Did you forget to "
|
||||
"call begin_training()?")
|
||||
E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
|
||||
E111 = ("Pickling a token is not supported, because tokens are only views "
|
||||
"of the parent Doc and can't exist on their own. A pickled token "
|
||||
|
@ -450,8 +466,6 @@ class Errors(object):
|
|||
E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
|
||||
E135 = ("If you meant to replace a built-in component, use `create_pipe`: "
|
||||
"`nlp.replace_pipe('{name}', nlp.create_pipe('{name}'))`")
|
||||
E136 = ("This additional feature requires the jsonschema library to be "
|
||||
"installed:\npip install jsonschema")
|
||||
E137 = ("Expected 'dict' type, but got '{type}' from '{line}'. Make sure "
|
||||
"to provide a valid JSON object as input with either the `text` "
|
||||
"or `tokens` key. For more info, see the docs:\n"
|
||||
|
@ -459,14 +473,11 @@ class Errors(object):
|
|||
E138 = ("Invalid JSONL format for raw text '{text}'. Make sure the input "
|
||||
"includes either the `text` or `tokens` key. For more info, see "
|
||||
"the docs:\nhttps://spacy.io/api/cli#pretrain-jsonl")
|
||||
E139 = ("Knowledge Base for component '{name}' not initialized. Did you "
|
||||
"forget to call set_kb()?")
|
||||
E139 = ("Knowledge Base for component '{name}' is empty.")
|
||||
E140 = ("The list of entities, prior probabilities and entity vectors "
|
||||
"should be of equal length.")
|
||||
E141 = ("Entity vectors should be of length {required} instead of the "
|
||||
"provided {found}.")
|
||||
E142 = ("Unsupported loss_function '{loss_func}'. Use either 'L2' or "
|
||||
"'cosine'.")
|
||||
E143 = ("Labels for component '{name}' not initialized. Did you forget to "
|
||||
"call add_label()?")
|
||||
E144 = ("Could not find parameter `{param}` when building the entity "
|
||||
|
@ -590,6 +601,47 @@ class Errors(object):
|
|||
E200 = ("Specifying a base model with a pretrained component '{component}' "
|
||||
"can not be combined with adding a pretrained Tok2Vec layer.")
|
||||
|
||||
# TODO: fix numbering after merging develop into master
|
||||
E983 = ("Invalid key for '{dict_name}': {key}. Available keys: "
|
||||
"{keys}")
|
||||
E984 = ("Could not parse the {input} - double check the data is written "
|
||||
"in the correct format as expected by spaCy.")
|
||||
E985 = ("The pipeline component '{component}' is already available in the base "
|
||||
"model. The settings in the component block in the config file are "
|
||||
"being ignored. If you want to replace this component instead, set "
|
||||
"'replace' to True in the training configuration.")
|
||||
E986 = ("Could not create any training batches: check your input. "
|
||||
"Perhaps discard_oversize should be set to False ?")
|
||||
E987 = ("The text of an example training instance is either a Doc or "
|
||||
"a string, but found {type} instead.")
|
||||
E988 = ("Could not parse any training examples. Ensure the data is "
|
||||
"formatted correctly.")
|
||||
E989 = ("'nlp.update()' was called with two positional arguments. This "
|
||||
"may be due to a backwards-incompatible change to the format "
|
||||
"of the training data in spaCy 3.0 onwards. The 'update' "
|
||||
"function should now be called with a batch of 'Example' "
|
||||
"objects, instead of (text, annotation) tuples. ")
|
||||
E990 = ("An entity linking component needs to be initialized with a "
|
||||
"KnowledgeBase object, but found {type} instead.")
|
||||
E991 = ("The function 'select_pipes' should be called with either a "
|
||||
"'disable' argument to list the names of the pipe components "
|
||||
"that should be disabled, or with an 'enable' argument that "
|
||||
"specifies which pipes should not be disabled.")
|
||||
E992 = ("The function `select_pipes` was called with `enable`={enable} "
|
||||
"and `disable`={disable} but that information is conflicting "
|
||||
"for the `nlp` pipeline with components {names}.")
|
||||
E993 = ("The config for 'nlp' should include either a key 'name' to "
|
||||
"refer to an existing model by name or path, or a key 'lang' "
|
||||
"to create a new blank model.")
|
||||
E996 = ("Could not parse {file}: {msg}")
|
||||
E997 = ("Tokenizer special cases are not allowed to modify the text. "
|
||||
"This would map '{chunk}' to '{orth}' given token attributes "
|
||||
"'{token_attrs}'.")
|
||||
E998 = ("To create GoldParse objects from Example objects without a "
|
||||
"Doc, get_gold_parses() should be called with a Vocab object.")
|
||||
E999 = ("Encountered an unexpected format for the dictionary holding "
|
||||
"gold annotations: {gold_dict}")
|
||||
|
||||
|
||||
@add_codes
|
||||
class TempErrors(object):
|
||||
|
@ -610,14 +662,14 @@ class MatchPatternError(ValueError):
|
|||
def __init__(self, key, errors):
|
||||
"""Custom error for validating match patterns.
|
||||
|
||||
key (unicode): The name of the matcher rule.
|
||||
key (str): The name of the matcher rule.
|
||||
errors (dict): Validation errors (sequence of strings) mapped to pattern
|
||||
ID, i.e. the index of the added pattern.
|
||||
"""
|
||||
msg = "Invalid token patterns for matcher rule '{}'\n".format(key)
|
||||
msg = f"Invalid token patterns for matcher rule '{key}'\n"
|
||||
for pattern_idx, error_msgs in errors.items():
|
||||
pattern_errors = "\n".join(["- {}".format(e) for e in error_msgs])
|
||||
msg += "\nPattern {}:\n{}\n".format(pattern_idx, pattern_errors)
|
||||
pattern_errors = "\n".join([f"- {e}" for e in error_msgs])
|
||||
msg += f"\nPattern {pattern_idx}:\n{pattern_errors}\n"
|
||||
ValueError.__init__(self, msg)
|
||||
|
||||
|
||||
|
|
|
@ -1,12 +1,8 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
def explain(term):
|
||||
"""Get a description for a given POS tag, dependency label or entity type.
|
||||
|
||||
term (unicode): The term to explain.
|
||||
RETURNS (unicode): The explanation, or `None` if not found in the glossary.
|
||||
term (str): The term to explain.
|
||||
RETURNS (str): The explanation, or `None` if not found in the glossary.
|
||||
|
||||
EXAMPLE:
|
||||
>>> spacy.explain(u'NORP')
|
||||
|
|
|
@ -1,9 +1,10 @@
|
|||
from cymem.cymem cimport Pool
|
||||
|
||||
from .structs cimport TokenC
|
||||
from .typedefs cimport attr_t
|
||||
from .syntax.transition_system cimport Transition
|
||||
|
||||
from .tokens import Doc
|
||||
|
||||
|
||||
cdef struct GoldParseC:
|
||||
int* tags
|
||||
|
@ -19,23 +20,49 @@ cdef class GoldParse:
|
|||
cdef Pool mem
|
||||
|
||||
cdef GoldParseC c
|
||||
cdef readonly TokenAnnotation orig
|
||||
|
||||
cdef int length
|
||||
cdef public int loss
|
||||
cdef public list words
|
||||
cdef public list tags
|
||||
cdef public list morphology
|
||||
cdef public list pos
|
||||
cdef public list morphs
|
||||
cdef public list lemmas
|
||||
cdef public list sent_starts
|
||||
cdef public list heads
|
||||
cdef public list labels
|
||||
cdef public dict orths
|
||||
cdef public list ner
|
||||
cdef public list ents
|
||||
cdef public dict brackets
|
||||
cdef public object cats
|
||||
cdef public dict cats
|
||||
cdef public dict links
|
||||
|
||||
cdef readonly list cand_to_gold
|
||||
cdef readonly list gold_to_cand
|
||||
cdef readonly list orig_annot
|
||||
|
||||
|
||||
cdef class TokenAnnotation:
|
||||
cdef public list ids
|
||||
cdef public list words
|
||||
cdef public list tags
|
||||
cdef public list pos
|
||||
cdef public list morphs
|
||||
cdef public list lemmas
|
||||
cdef public list heads
|
||||
cdef public list deps
|
||||
cdef public list entities
|
||||
cdef public list sent_starts
|
||||
cdef public dict brackets_by_start
|
||||
|
||||
|
||||
cdef class DocAnnotation:
|
||||
cdef public object cats
|
||||
cdef public object links
|
||||
|
||||
|
||||
cdef class Example:
|
||||
cdef public object doc
|
||||
cdef public TokenAnnotation token_annotation
|
||||
cdef public DocAnnotation doc_annotation
|
||||
cdef public object goldparse
|
||||
|
|
826
spacy/gold.pyx
826
spacy/gold.pyx
File diff suppressed because it is too large
Load Diff
|
@ -1,15 +1,15 @@
|
|||
"""Knowledge-base for entity or concept linking."""
|
||||
from cymem.cymem cimport Pool
|
||||
from preshed.maps cimport PreshMap
|
||||
|
||||
from libcpp.vector cimport vector
|
||||
from libc.stdint cimport int32_t, int64_t
|
||||
from libc.stdio cimport FILE
|
||||
|
||||
from .vocab cimport Vocab
|
||||
from .typedefs cimport hash_t
|
||||
|
||||
from .structs cimport KBEntryC, AliasC
|
||||
|
||||
|
||||
ctypedef vector[KBEntryC] entry_vec
|
||||
ctypedef vector[AliasC] alias_vec
|
||||
ctypedef vector[float] float_vec
|
||||
|
|
13
spacy/kb.pyx
13
spacy/kb.pyx
|
@ -1,6 +1,4 @@
|
|||
# cython: infer_types=True
|
||||
# cython: profile=True
|
||||
# coding: utf8
|
||||
# cython: infer_types=True, profile=True
|
||||
from cymem.cymem cimport Pool
|
||||
from preshed.maps cimport PreshMap
|
||||
from cpython.exc cimport PyErr_SetFromErrno
|
||||
|
@ -8,12 +6,11 @@ from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
|
|||
from libc.stdint cimport int32_t, int64_t
|
||||
from libcpp.vector cimport vector
|
||||
|
||||
from pathlib import Path
|
||||
import warnings
|
||||
from os import path
|
||||
from pathlib import Path
|
||||
|
||||
from .typedefs cimport hash_t
|
||||
|
||||
from .errors import Errors, Warnings
|
||||
|
||||
|
||||
|
@ -41,7 +38,7 @@ cdef class Candidate:
|
|||
|
||||
@property
|
||||
def entity_(self):
|
||||
"""RETURNS (unicode): ID/name of this entity in the KB"""
|
||||
"""RETURNS (str): ID/name of this entity in the KB"""
|
||||
return self.kb.vocab.strings[self.entity_hash]
|
||||
|
||||
@property
|
||||
|
@ -51,7 +48,7 @@ cdef class Candidate:
|
|||
|
||||
@property
|
||||
def alias_(self):
|
||||
"""RETURNS (unicode): ID of the original alias"""
|
||||
"""RETURNS (str): ID of the original alias"""
|
||||
return self.kb.vocab.strings[self.alias_hash]
|
||||
|
||||
@property
|
||||
|
@ -445,6 +442,8 @@ cdef class KnowledgeBase:
|
|||
|
||||
cdef class Writer:
|
||||
def __init__(self, object loc):
|
||||
if path.exists(loc):
|
||||
assert not path.isdir(loc), f"{loc} is directory"
|
||||
if isinstance(loc, Path):
|
||||
loc = bytes(loc)
|
||||
if path.exists(loc):
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
|
|
|
@ -1,7 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
# Source: https://github.com/stopwords-iso/stopwords-af
|
||||
|
||||
STOP_WORDS = set(
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,5 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
from ...attrs import LIKE_NUM
|
||||
|
||||
_num_words = set(
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
|
||||
from ..char_classes import UNITS, ALPHA_UPPER
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
من
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...symbols import ORTH, LEMMA
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,7 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
# Source: https://github.com/Alir3z4/stop-words
|
||||
|
||||
STOP_WORDS = set(
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
from .tag_map import TAG_MAP
|
||||
|
|
|
@ -1,7 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...symbols import LEMMA, PRON_LEMMA
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
|
||||
from ..char_classes import ALPHA_LOWER, ALPHA, HYPHENS, CONCAT_QUOTES, UNITS
|
||||
|
||||
|
|
|
@ -1,7 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
অতএব অথচ অথবা অনুযায়ী অনেক অনেকে অনেকেই অন্তত অবধি অবশ্য অর্থাৎ অন্য অনুযায়ী অর্ধভাগে
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...symbols import POS, PUNCT, ADJ, CONJ, SCONJ, NUM, DET, ADV, ADP, X, VERB
|
||||
from ...symbols import CCONJ, NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, SYM
|
||||
|
||||
|
@ -14,8 +11,8 @@ TAG_MAP = {
|
|||
'""': {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
|
||||
"''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
|
||||
":": {POS: PUNCT},
|
||||
"৳": {POS: SYM, "Other": {"SymType": "currency"}},
|
||||
"#": {POS: SYM, "Other": {"SymType": "numbersign"}},
|
||||
"৳": {POS: SYM, "SymType": "currency"},
|
||||
"#": {POS: SYM, "SymType": "numbersign"},
|
||||
"AFX": {POS: ADJ, "Hyph": "yes"},
|
||||
"CC": {POS: CONJ, "ConjType": "coor"},
|
||||
"CD": {POS: NUM, "NumType": "card"},
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding=utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...symbols import ORTH, LEMMA
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
|
|
|
@ -1,7 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...attrs import LIKE_NUM
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..punctuation import TOKENIZER_INFIXES
|
||||
from ..char_classes import ALPHA
|
||||
|
||||
|
|
|
@ -1,7 +1,3 @@
|
|||
# encoding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
a abans ací ah així això al aleshores algun alguna algunes alguns alhora allà allí allò
|
||||
|
|
|
@ -1,28 +0,0 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ
|
||||
from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ
|
||||
|
||||
|
||||
TAG_MAP = {
|
||||
"ADV": {POS: ADV},
|
||||
"NOUN": {POS: NOUN},
|
||||
"ADP": {POS: ADP},
|
||||
"PRON": {POS: PRON},
|
||||
"SCONJ": {POS: SCONJ},
|
||||
"PROPN": {POS: PROPN},
|
||||
"DET": {POS: DET},
|
||||
"SYM": {POS: SYM},
|
||||
"INTJ": {POS: INTJ},
|
||||
"PUNCT": {POS: PUNCT},
|
||||
"NUM": {POS: NUM},
|
||||
"AUX": {POS: AUX},
|
||||
"X": {POS: X},
|
||||
"CONJ": {POS: CONJ},
|
||||
"CCONJ": {POS: CCONJ},
|
||||
"ADJ": {POS: ADJ},
|
||||
"VERB": {POS: VERB},
|
||||
"PART": {POS: PART},
|
||||
"SP": {POS: SPACE},
|
||||
}
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...symbols import ORTH, LEMMA
|
||||
|
||||
|
||||
|
@ -33,9 +30,9 @@ _exc["12m."] = [{ORTH: "12"}, {ORTH: "m.", LEMMA: "p.m."}]
|
|||
|
||||
for h in range(1, 12 + 1):
|
||||
for period in ["a.m.", "am"]:
|
||||
_exc["%d%s" % (h, period)] = [{ORTH: "%d" % h}, {ORTH: period, LEMMA: "a.m."}]
|
||||
_exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "a.m."}]
|
||||
for period in ["p.m.", "pm"]:
|
||||
_exc["%d%s" % (h, period)] = [{ORTH: "%d" % h}, {ORTH: period, LEMMA: "p.m."}]
|
||||
_exc[f"{h}{period}"] = [{ORTH: f"{h}"}, {ORTH: period, LEMMA: "p.m."}]
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = _exc
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
split_chars = lambda char: list(char.strip().split(" "))
|
||||
merge_chars = lambda char: char.strip().replace(" ", "|")
|
||||
group_chars = lambda char: char.strip().replace(" ", "")
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user