Mirror of https://github.com/explosion/spaCy.git, synced 2025-08-02 11:20:19 +03:00

Merge branch 'upstream_master' into add-cli-find-loc

Commit 6dc9ced1c8
.github/workflows/tests.yml (vendored): 60 lines changed
@@ -45,6 +45,12 @@ jobs:
         run: |
           python -m pip install flake8==5.0.4
           python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
+      - name: cython-lint
+        run: |
+          python -m pip install cython-lint -c requirements.txt
+          # E501: line too long, W291: trailing whitespace, E266: too many leading '#' for block comment
+          cython-lint spacy --ignore E501,W291,E266

   tests:
     name: Test
     needs: Validate
@@ -111,22 +117,22 @@ jobs:
       - name: Test import
         run: python -W error -c "import spacy"

-      # - name: "Test download CLI"
-      #   run: |
-      #     python -m spacy download ca_core_news_sm
-      #     python -m spacy download ca_core_news_md
-      #     python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
-      #   if: matrix.python_version == '3.9'
-      #
-      # - name: "Test download_url in info CLI"
-      #   run: |
-      #     python -W error -m spacy info ca_core_news_sm | grep -q download_url
-      #   if: matrix.python_version == '3.9'
-      #
-      # - name: "Test no warnings on load (#11713)"
-      #   run: |
-      #     python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
-      #   if: matrix.python_version == '3.9'
+      - name: "Test download CLI"
+        run: |
+          python -m spacy download ca_core_news_sm
+          python -m spacy download ca_core_news_md
+          python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+        if: matrix.python_version == '3.9'
+
+      - name: "Test download_url in info CLI"
+        run: |
+          python -W error -m spacy info ca_core_news_sm | grep -q download_url
+        if: matrix.python_version == '3.9'
+
+      - name: "Test no warnings on load (#11713)"
+        run: |
+          python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+        if: matrix.python_version == '3.9'

       - name: "Test convert CLI"
         run: |

@@ -150,17 +156,17 @@ jobs:
           python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
         if: matrix.python_version == '3.9'

-      # - name: "Test assemble CLI"
-      #   run: |
-      #     python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-      #     PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
-      #   if: matrix.python_version == '3.9'
-      #
-      # - name: "Test assemble CLI vectors warning"
-      #   run: |
-      #     python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
-      #     python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
-      #   if: matrix.python_version == '3.9'
+      - name: "Test assemble CLI"
+        run: |
+          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+          PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+        if: matrix.python_version == '3.9'
+
+      - name: "Test assemble CLI vectors warning"
+        run: |
+          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+          python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+        if: matrix.python_version == '3.9'

       - name: "Install test requirements"
         run: |
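The previously commented-out model-download and assembly checks are switched back on above. A rough local equivalent of the re-enabled "Test no warnings on load" step, sketched with the standard library only; it assumes ca_core_news_sm has already been installed (for example via `python -m spacy download ca_core_news_sm`):

```python
# Rough local equivalent of the re-enabled "Test no warnings on load" CI step.
# Assumes ca_core_news_sm is installed in the current environment.
import subprocess
import sys

# "-W error" promotes any warning raised while importing and loading the
# pipeline to an exception, so the subprocess fails loudly just like the CI job.
subprocess.run(
    [
        sys.executable,
        "-W",
        "error",
        "-c",
        "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc = nlp('test')",
    ],
    check=True,
)
```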
Makefile: 4 lines changed

@@ -1,11 +1,11 @@
 SHELL := /bin/bash

 ifndef SPACY_EXTRAS
-override SPACY_EXTRAS = spacy-lookups-data==1.0.2 jieba spacy-pkuseg==0.0.28 sudachipy sudachidict_core pymorphy2
+override SPACY_EXTRAS = spacy-lookups-data==1.0.3
 endif

 ifndef PYVER
-override PYVER = 3.6
+override PYVER = 3.8
 endif

 VENV := ./env$(PYVER)
README.md: 72 lines changed

@@ -6,23 +6,20 @@ spaCy is a library for **advanced Natural Language Processing** in Python and
 Cython. It's built on the very latest research, and was designed from day one to
 be used in real products.

-spaCy comes with
-[pretrained pipelines](https://spacy.io/models) and
-currently supports tokenization and training for **70+ languages**. It features
-state-of-the-art speed and **neural network models** for tagging,
-parsing, **named entity recognition**, **text classification** and more,
-multi-task learning with pretrained **transformers** like BERT, as well as a
+spaCy comes with [pretrained pipelines](https://spacy.io/models) and currently
+supports tokenization and training for **70+ languages**. It features
+state-of-the-art speed and **neural network models** for tagging, parsing,
+**named entity recognition**, **text classification** and more, multi-task
+learning with pretrained **transformers** like BERT, as well as a
 production-ready [**training system**](https://spacy.io/usage/training) and easy
 model packaging, deployment and workflow management. spaCy is commercial
-open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
+open-source software, released under the
+[MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).

-💥 **We'd love to hear more about your experience with spaCy!**
-[Fill out our survey here.](https://form.typeform.com/to/aMel9q9f)
-
-💫 **Version 3.5 out now!**
+💫 **Version 3.6 out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)

-[](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
+[](https://github.com/explosion/spaCy/actions/workflows/tests.yml)
 [](https://github.com/explosion/spaCy/releases)
 [](https://pypi.org/project/spacy/)
 [](https://anaconda.org/conda-forge/spacy)

@@ -35,22 +32,22 @@ open-source software, released under the [MIT license](https://github.com/explos

 ## 📖 Documentation

 | Documentation | |
-| ----------------------------- | ---------------------------------------------------------------------- |
+| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! |
 | 📚 **[Usage Guides]** | How to use spaCy and its features. |
 | 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |
 | 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. |
 | 🎛 **[API Reference]** | The detailed reference for spaCy's API. |
 | 📦 **[Models]** | Download trained pipelines for spaCy. |
 | 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. |
 | ⚙️ **[spaCy VS Code Extension]** | Additional tooling and features for working with spaCy's config files. |
 | 👩‍🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
 | 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
 | 🛠 **[Changelog]** | Changes and version history. |
 | 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
 | <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
 | <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-analysis)** |

 [spacy 101]: https://spacy.io/usage/spacy-101
 [new in v3.0]: https://spacy.io/usage/v3

@@ -58,7 +55,7 @@ open-source software, released under the [MIT license](https://github.com/explos
 [api reference]: https://spacy.io/api/
 [models]: https://spacy.io/models
 [universe]: https://spacy.io/universe
-[spaCy VS Code Extension]: https://github.com/explosion/spacy-vscode
+[spacy vs code extension]: https://github.com/explosion/spacy-vscode
 [videos]: https://www.youtube.com/c/ExplosionAI
 [online course]: https://course.spacy.io
 [project templates]: https://github.com/explosion/projects

@@ -92,7 +89,9 @@ more people can benefit from it.
 - State-of-the-art speed
 - Production-ready **training system**
 - Linguistically-motivated **tokenization**
-- Components for named **entity recognition**, part-of-speech-tagging, dependency parsing, sentence segmentation, **text classification**, lemmatization, morphological analysis, entity linking and more
+- Components for named **entity recognition**, part-of-speech-tagging,
+  dependency parsing, sentence segmentation, **text classification**,
+  lemmatization, morphological analysis, entity linking and more
 - Easily extensible with **custom components** and attributes
 - Support for custom models in **PyTorch**, **TensorFlow** and other frameworks
 - Built in **visualizers** for syntax and NER

@@ -118,8 +117,8 @@ For detailed installation instructions, see the
 ### pip

 Using pip, spaCy releases are available as source packages and binary wheels.
-Before you install spaCy and its dependencies, make sure that
-your `pip`, `setuptools` and `wheel` are up to date.
+Before you install spaCy and its dependencies, make sure that your `pip`,
+`setuptools` and `wheel` are up to date.

 ```bash
 pip install -U pip setuptools wheel

@@ -174,9 +173,9 @@ with the new version.
 ## 📦 Download model packages

-Trained pipelines for spaCy can be installed as **Python packages**. This
-means that they're a component of your application, just like any other module.
-Models can be installed using spaCy's [`download`](https://spacy.io/api/cli#download)
+Trained pipelines for spaCy can be installed as **Python packages**. This means
+that they're a component of your application, just like any other module. Models
+can be installed using spaCy's [`download`](https://spacy.io/api/cli#download)
 command, or manually by pointing pip to a path or URL.

 | Documentation | |

@@ -242,8 +241,7 @@ do that depends on your system.
 | **Mac** | Install a recent version of [XCode](https://developer.apple.com/xcode/), including the so-called "Command Line Tools". macOS and OS X ship with Python and git preinstalled. |
 | **Windows** | Install a version of the [Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/) or [Visual Studio Express](https://visualstudio.microsoft.com/vs/express/) that matches the version that was used to compile your Python interpreter. |

-For more details
-and instructions, see the documentation on
+For more details and instructions, see the documentation on
 [compiling spaCy from source](https://spacy.io/usage#source) and the
 [quickstart widget](https://spacy.io/usage#section-quickstart) to get the right
 commands for your platform and Python version.
@@ -38,4 +38,5 @@ types-setuptools>=57.0.0
 types-requests
 types-setuptools>=57.0.0
 black==22.3.0
+cython-lint>=0.15.0; python_version >= "3.7"
 isort>=5.0,<6.0
setup.py: 31 lines changed

@@ -1,10 +1,9 @@
 #!/usr/bin/env python
 from setuptools import Extension, setup, find_packages
 import sys
-import platform
 import numpy
-from distutils.command.build_ext import build_ext
-from distutils.sysconfig import get_python_inc
+from setuptools.command.build_ext import build_ext
+from sysconfig import get_path
 from pathlib import Path
 import shutil
 from Cython.Build import cythonize

@@ -88,30 +87,6 @@ COPY_FILES = {
 }


-def is_new_osx():
-    """Check whether we're on OSX >= 10.7"""
-    if sys.platform != "darwin":
-        return False
-    mac_ver = platform.mac_ver()[0]
-    if mac_ver.startswith("10"):
-        minor_version = int(mac_ver.split(".")[1])
-        if minor_version >= 7:
-            return True
-        else:
-            return False
-    return False
-
-
-if is_new_osx():
-    # On Mac, use libc++ because Apple deprecated use of
-    # libstdc
-    COMPILE_OPTIONS["other"].append("-stdlib=libc++")
-    LINK_OPTIONS["other"].append("-lc++")
-    # g++ (used by unix compiler on mac) links to libstdc++ as a default lib.
-    # See: https://stackoverflow.com/questions/1653047/avoid-linking-to-libstdc
-    LINK_OPTIONS["other"].append("-nodefaultlibs")
-
-
 # By subclassing build_extensions we have the actual compiler that will be used which is really known only after finalize_options
 # http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used
 class build_ext_options:

@@ -204,7 +179,7 @@ def setup_package():

     include_dirs = [
         numpy.get_include(),
-        get_python_inc(plat_specific=True),
+        get_path("include"),
     ]
     ext_modules = []
     ext_modules.append(
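The setup.py hunks above retire the deprecated distutils imports and the old macOS is_new_osx() special-casing. A minimal sketch of the replacement calls, assuming only setuptools and the standard library:

```python
# Minimal sketch of the swap shown above: distutils helpers replaced by
# setuptools/sysconfig equivalents (distutils was deprecated by PEP 632 and
# removed in Python 3.12).
from sysconfig import get_path
from setuptools.command.build_ext import build_ext  # noqa: F401  # drop-in for distutils' build_ext

# get_path("include") resolves the C header directory of the running
# interpreter; setup.py now uses it where get_python_inc(plat_specific=True)
# was used before.
include_dirs = [get_path("include")]
print(include_dirs)
```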
@@ -96,4 +96,4 @@ cdef enum attr_id_t:
     ENT_ID = symbols.ENT_ID

     IDX
     SENT_END
@@ -117,7 +117,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
         if "pos" in stringy_attrs:
             stringy_attrs["TAG"] = stringy_attrs.pop("pos")
         if "morph" in stringy_attrs:
-            morphs = stringy_attrs.pop("morph")
+            morphs = stringy_attrs.pop("morph")  # no-cython-lint
         if "number" in stringy_attrs:
             stringy_attrs.pop("number")
         if "tenspect" in stringy_attrs:
@@ -32,6 +32,7 @@ def init_vectors_cli(
     name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
+    attr: str = Opt("ORTH", "--attr", "-a", help="Optional token attribute to use for vectors, e.g. LOWER or NORM"),
     # fmt: on
 ):
     """Convert word vectors for use with spaCy. Will export an nlp object that

@@ -50,6 +51,7 @@ def init_vectors_cli(
         prune=prune,
         name=name,
         mode=mode,
+        attr=attr,
     )
     msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
     nlp.to_disk(output_dir)
@@ -130,7 +130,7 @@ grad_factor = 1.0
 {% if "span_finder" in components -%}
 [components.span_finder]
 factory = "span_finder"
-max_length = null
+max_length = 25
 min_length = null
 scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
 spans_key = "sc"

@@ -419,7 +419,7 @@ width = ${components.tok2vec.model.encode.width}
 {% if "span_finder" in components %}
 [components.span_finder]
 factory = "span_finder"
-max_length = null
+max_length = 25
 min_length = null
 scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
 spans_key = "sc"
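Both quickstart-template variants now default span_finder's max_length to 25 instead of null (unbounded). A sketch of the equivalent runtime configuration, assuming spaCy 3.6+ with the built-in "span_finder" factory:

```python
# Sketch only: add a span_finder component with the same settings the updated
# template emits. Assumes a spaCy version that ships the "span_finder" factory.
import spacy

nlp = spacy.blank("en")
nlp.add_pipe(
    "span_finder",
    config={
        "max_length": 25,   # was null (no upper bound) in the old template
        "min_length": None,
        "spans_key": "sc",
    },
)
print(nlp.pipe_names)
```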
@@ -1,4 +1,3 @@
-import itertools
 import uuid
 from typing import Any, Dict, List, Optional, Tuple, Union

@@ -218,7 +217,7 @@ class SpanRenderer:
                     + (self.offset_step * (len(entities) - 1))
                 )
                 markup += self.span_template.format(
-                    text=token["text"],
+                    text=escape_html(token["text"]),
                     span_slices=slices,
                     span_starts=starts,
                     total_height=total_height,
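The SpanRenderer change routes token text through escape_html() before it is formatted into the span template, so text that happens to contain markup is displayed literally instead of being injected into the page. An illustrative sketch using the public displacy API, assuming a build that includes this fix:

```python
# Illustration of the escaping fix above, assuming a spaCy build that includes it.
# Characters such as "<", ">" and "&" in token text should come out as HTML
# entities (&lt;, &gt;, &amp;) in the generated markup.
import spacy
from spacy import displacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("x <b>not bold</b> y")
doc.spans["sc"] = [Span(doc, 0, 1, label="VAR")]
html = displacy.render(doc, style="span")
print(html)
```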
@@ -216,6 +216,9 @@ class Warnings(metaclass=ErrorsWithCodes):
     W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
             "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
     W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
+    W125 = ("The StaticVectors key_attr is no longer used. To set a custom "
+            "key attribute for vectors, configure it through Vectors(attr=) or "
+            "'spacy init vectors --attr'")


 class Errors(metaclass=ErrorsWithCodes):
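W125 points users at the two new configuration points for keying vectors by a custom token attribute. A hedged sketch of both, assuming a spaCy version in which Vectors(attr=...) and the --attr flag on `spacy init vectors` exist (they are added elsewhere in this merge):

```python
# Sketch of the options referenced by W125; assumes Vectors(attr=...) is available.
import numpy
from spacy.vectors import Vectors

data = numpy.zeros((2, 4), dtype="f")
# Key the table by the LOWER attribute instead of the default ORTH, so tokens
# "Apple" and "apple" look up the same row.
vectors = Vectors(data=data, keys=["apple", "banana"], attr="LOWER")

# CLI equivalent (run in a shell):
#   python -m spacy init vectors en vectors.txt ./output --attr LOWER
```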
@@ -4,7 +4,8 @@ from ..typedefs cimport hash_t
 from .kb cimport KnowledgeBase


-# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
+# Object used by the Entity Linker that summarizes one entity-alias candidate
+# combination.
 cdef class Candidate:
     cdef readonly KnowledgeBase kb
     cdef hash_t entity_hash
@@ -8,15 +8,24 @@ from ..tokens import Span
 cdef class Candidate:
-    """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
-    to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
-    algorithm which will disambiguate the various candidates to the correct one.
+    """A `Candidate` object refers to a textual mention (`alias`) that may or
+    may not be resolved to a specific `entity` from a Knowledge Base. This
+    will be used as input for the entity linking algorithm which will
+    disambiguate the various candidates to the correct one.
     Each candidate (alias, entity) pair is assigned a certain prior probability.

     DOCS: https://spacy.io/api/kb/#candidate-init
     """

-    def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
+    def __init__(
+        self,
+        KnowledgeBase kb,
+        entity_hash,
+        entity_freq,
+        entity_vector,
+        alias_hash,
+        prior_prob
+    ):
         self.kb = kb
         self.entity_hash = entity_hash
         self.entity_freq = entity_freq

@@ -59,7 +68,8 @@ cdef class Candidate:

 def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
     """
-    Return candidate entities for a given mention and fetching appropriate entries from the index.
+    Return candidate entities for a given mention and fetching appropriate
+    entries from the index.
     kb (KnowledgeBase): Knowledge base to query.
     mention (Span): Entity mention for which to identify candidates.
     RETURNS (Iterable[Candidate]): Identified candidates.

@@ -67,9 +77,12 @@ def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
     return kb.get_candidates(mention)


-def get_candidates_batch(kb: KnowledgeBase, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
+def get_candidates_batch(
+    kb: KnowledgeBase, mentions: Iterable[Span]
+) -> Iterable[Iterable[Candidate]]:
     """
-    Return candidate entities for the given mentions and fetching appropriate entries from the index.
+    Return candidate entities for the given mentions and fetching appropriate entries
+    from the index.
     kb (KnowledgeBase): Knowledge base to query.
     mention (Iterable[Span]): Entity mentions for which to identify candidates.
     RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
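get_candidates() and get_candidates_batch() above are the module-level hooks the entity linker calls to propose KB entries for a mention. A short sketch of a custom generator with the same shape; the function name and the lower-casing strategy are illustrative, not part of spaCy:

```python
# Sketch of a custom candidate generator matching the signature of
# get_candidates() above (the normalization step is illustrative only).
from typing import Iterable

from spacy.kb import Candidate, InMemoryLookupKB
from spacy.tokens import Span


def get_lowercased_candidates(kb: InMemoryLookupKB, mention: Span) -> Iterable[Candidate]:
    # Query the KB with a normalized alias instead of the raw mention text.
    return kb.get_alias_candidates(mention.text.lower())
```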
@@ -12,8 +12,9 @@ from .candidate import Candidate
 cdef class KnowledgeBase:
-    """A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
-    to support entity linking of named entities to real-world concepts.
+    """A `KnowledgeBase` instance stores unique identifiers for entities and
+    their textual aliases, to support entity linking of named entities to
+    real-world concepts.
     This is an abstract class and requires its operations to be implemented.

     DOCS: https://spacy.io/api/kb

@@ -31,10 +32,13 @@ cdef class KnowledgeBase:
         self.entity_vector_length = entity_vector_length
         self.mem = Pool()

-    def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
+    def get_candidates_batch(
+        self, mentions: Iterable[Span]
+    ) -> Iterable[Iterable[Candidate]]:
         """
-        Return candidate entities for specified texts. Each candidate defines the entity, the original alias,
-        and the prior probability of that alias resolving to that entity.
+        Return candidate entities for specified texts. Each candidate defines
+        the entity, the original alias, and the prior probability of that
+        alias resolving to that entity.
         If no candidate is found for a given text, an empty list is returned.
         mentions (Iterable[Span]): Mentions for which to get candidates.
         RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.

@@ -43,14 +47,17 @@ cdef class KnowledgeBase:

     def get_candidates(self, mention: Span) -> Iterable[Candidate]:
         """
-        Return candidate entities for specified text. Each candidate defines the entity, the original alias,
+        Return candidate entities for specified text. Each candidate defines
+        the entity, the original alias,
         and the prior probability of that alias resolving to that entity.
         If the no candidate is found for a given text, an empty list is returned.
         mention (Span): Mention for which to get candidates.
         RETURNS (Iterable[Candidate]): Identified candidates.
         """
         raise NotImplementedError(
-            Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__)
+            Errors.E1045.format(
+                parent="KnowledgeBase", method="get_candidates", name=self.__name__
+            )
         )

     def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]:

@@ -68,7 +75,9 @@ cdef class KnowledgeBase:
         RETURNS (Iterable[float]): Vector for specified entity.
         """
         raise NotImplementedError(
-            Errors.E1045.format(parent="KnowledgeBase", method="get_vector", name=self.__name__)
+            Errors.E1045.format(
+                parent="KnowledgeBase", method="get_vector", name=self.__name__
+            )
         )

     def to_bytes(self, **kwargs) -> bytes:

@@ -76,7 +85,9 @@ cdef class KnowledgeBase:
         RETURNS (bytes): Current state as binary string.
         """
         raise NotImplementedError(
-            Errors.E1045.format(parent="KnowledgeBase", method="to_bytes", name=self.__name__)
+            Errors.E1045.format(
+                parent="KnowledgeBase", method="to_bytes", name=self.__name__
+            )
         )

     def from_bytes(self, bytes_data: bytes, *, exclude: Tuple[str] = tuple()):

@@ -85,25 +96,35 @@ cdef class KnowledgeBase:
         exclude (Tuple[str]): Properties to exclude when restoring KB.
         """
         raise NotImplementedError(
-            Errors.E1045.format(parent="KnowledgeBase", method="from_bytes", name=self.__name__)
+            Errors.E1045.format(
+                parent="KnowledgeBase", method="from_bytes", name=self.__name__
+            )
         )

-    def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
+    def to_disk(
+        self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()
+    ) -> None:
         """
         Write KnowledgeBase content to disk.
         path (Union[str, Path]): Target file path.
         exclude (Iterable[str]): List of components to exclude.
         """
         raise NotImplementedError(
-            Errors.E1045.format(parent="KnowledgeBase", method="to_disk", name=self.__name__)
+            Errors.E1045.format(
+                parent="KnowledgeBase", method="to_disk", name=self.__name__
+            )
         )

-    def from_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
+    def from_disk(
+        self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()
+    ) -> None:
         """
         Load KnowledgeBase content from disk.
         path (Union[str, Path]): Target file path.
         exclude (Iterable[str]): List of components to exclude.
         """
         raise NotImplementedError(
-            Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__)
+            Errors.E1045.format(
+                parent="KnowledgeBase", method="from_disk", name=self.__name__
+            )
         )
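KnowledgeBase stays an abstract base here: every operation raises Errors.E1045 until a subclass overrides it. A minimal sketch of that pattern; the subclass name and behaviour are illustrative:

```python
# Minimal sketch of the abstract-base pattern above: a custom KB overrides only
# what it supports, everything else keeps raising NotImplementedError (E1045).
from typing import Iterable

from spacy.kb import Candidate, KnowledgeBase
from spacy.tokens import Doc, Span
from spacy.vocab import Vocab


class EmptyKB(KnowledgeBase):
    def get_candidates(self, mention: Span) -> Iterable[Candidate]:
        # Never proposes any entity for a mention.
        return []


vocab = Vocab()
kb = EmptyKB(vocab, entity_vector_length=64)
doc = Doc(vocab, words=["Berlin"])
print(kb.get_candidates(doc[0:1]))  # -> []
```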
@@ -55,23 +55,28 @@ cdef class InMemoryLookupKB(KnowledgeBase):
     # optional data, we can let users configure a DB as the backend for this.
     cdef object _features_table


     cdef inline int64_t c_add_vector(self, vector[float] entity_vector) nogil:
         """Add an entity vector to the vectors table."""
         cdef int64_t new_index = self._vectors_table.size()
         self._vectors_table.push_back(entity_vector)
         return new_index

-    cdef inline int64_t c_add_entity(self, hash_t entity_hash, float freq,
-                                     int32_t vector_index, int feats_row) nogil:
+    cdef inline int64_t c_add_entity(
+        self,
+        hash_t entity_hash,
+        float freq,
+        int32_t vector_index,
+        int feats_row
+    ) nogil:
         """Add an entry to the vector of entries.
-        After calling this method, make sure to update also the _entry_index using the return value"""
+        After calling this method, make sure to update also the _entry_index
+        using the return value"""
         # This is what we'll map the entity hash key to. It's where the entry will sit
         # in the vector of entries, so we can get it later.
         cdef int64_t new_index = self._entries.size()

-        # Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642
+        # Avoid struct initializer to enable nogil, cf.
+        # https://github.com/cython/cython/issues/1642
         cdef KBEntryC entry
         entry.entity_hash = entity_hash
         entry.vector_index = vector_index

@@ -81,11 +86,17 @@ cdef class InMemoryLookupKB(KnowledgeBase):
         self._entries.push_back(entry)
         return new_index

-    cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs) nogil:
-        """Connect a mention to a list of potential entities with their prior probabilities .
-        After calling this method, make sure to update also the _alias_index using the return value"""
-        # This is what we'll map the alias hash key to. It's where the alias will be defined
-        # in the vector of aliases.
+    cdef inline int64_t c_add_aliases(
+        self,
+        hash_t alias_hash,
+        vector[int64_t] entry_indices,
+        vector[float] probs
+    ) nogil:
+        """Connect a mention to a list of potential entities with their prior
+        probabilities. After calling this method, make sure to update also the
+        _alias_index using the return value"""
+        # This is what we'll map the alias hash key to. It's where the alias will be
+        # defined in the vector of aliases.
         cdef int64_t new_index = self._aliases_table.size()

         # Avoid struct initializer to enable nogil

@@ -98,8 +109,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):

     cdef inline void _create_empty_vectors(self, hash_t dummy_hash) nogil:
         """
-        Initializing the vectors and making sure the first element of each vector is a dummy,
-        because the PreshMap maps pointing to indices in these vectors can not contain 0 as value
+        Initializing the vectors and making sure the first element of each vector is a
+        dummy, because the PreshMap maps pointing to indices in these vectors can not
+        contain 0 as value.
         cf. https://github.com/explosion/preshed/issues/17
         """
         cdef int32_t dummy_value = 0

@@ -130,12 +142,18 @@ cdef class InMemoryLookupKB(KnowledgeBase):
 cdef class Writer:
     cdef FILE* _fp

-    cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1
+    cdef int write_header(
+        self, int64_t nr_entries, int64_t entity_vector_length
+    ) except -1
     cdef int write_vector_element(self, float element) except -1
-    cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1
+    cdef int write_entry(
+        self, hash_t entry_hash, float entry_freq, int32_t vector_index
+    ) except -1

     cdef int write_alias_length(self, int64_t alias_length) except -1
-    cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1
+    cdef int write_alias_header(
+        self, hash_t alias_hash, int64_t candidate_length
+    ) except -1
     cdef int write_alias(self, int64_t entry_index, float prob) except -1

     cdef int _write(self, void* value, size_t size) except -1

@@ -143,12 +161,18 @@ cdef class Writer:
 cdef class Reader:
     cdef FILE* _fp

-    cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1
+    cdef int read_header(
+        self, int64_t* nr_entries, int64_t* entity_vector_length
+    ) except -1
     cdef int read_vector_element(self, float* element) except -1
-    cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1
+    cdef int read_entry(
+        self, hash_t* entity_hash, float* freq, int32_t* vector_index
+    ) except -1

     cdef int read_alias_length(self, int64_t* alias_length) except -1
-    cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1
+    cdef int read_alias_header(
+        self, hash_t* alias_hash, int64_t* candidate_length
+    ) except -1
     cdef int read_alias(self, int64_t* entry_index, float* prob) except -1

     cdef int _read(self, void* value, size_t size) except -1
@@ -1,5 +1,5 @@
 # cython: infer_types=True, profile=True
-from typing import Any, Callable, Dict, Iterable, Union
+from typing import Any, Callable, Dict, Iterable

 import srsly

@@ -27,8 +27,9 @@ from .candidate import Candidate as Candidate

 cdef class InMemoryLookupKB(KnowledgeBase):
-    """An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases,
-    to support entity linking of named entities to real-world concepts.
+    """An `InMemoryLookupKB` instance stores unique identifiers for entities
+    and their textual aliases, to support entity linking of named entities to
+    real-world concepts.

     DOCS: https://spacy.io/api/inmemorylookupkb
     """

@@ -71,7 +72,8 @@ cdef class InMemoryLookupKB(KnowledgeBase):

     def add_entity(self, str entity, float freq, vector[float] entity_vector):
         """
-        Add an entity to the KB, optionally specifying its log probability based on corpus frequency
+        Add an entity to the KB, optionally specifying its log probability
+        based on corpus frequency.
         Return the hash of the entity ID/name at the end.
         """
         cdef hash_t entity_hash = self.vocab.strings.add(entity)

@@ -83,14 +85,20 @@ cdef class InMemoryLookupKB(KnowledgeBase):

         # Raise an error if the provided entity vector is not of the correct length
         if len(entity_vector) != self.entity_vector_length:
-            raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
+            raise ValueError(
+                Errors.E141.format(
+                    found=len(entity_vector), required=self.entity_vector_length
+                )
+            )

         vector_index = self.c_add_vector(entity_vector=entity_vector)

-        new_index = self.c_add_entity(entity_hash=entity_hash,
-                                      freq=freq,
-                                      vector_index=vector_index,
-                                      feats_row=-1)  # Features table currently not implemented
+        new_index = self.c_add_entity(
+            entity_hash=entity_hash,
+            freq=freq,
+            vector_index=vector_index,
+            feats_row=-1
+        )  # Features table currently not implemented
         self._entry_index[entity_hash] = new_index

         return entity_hash

@@ -115,7 +123,12 @@ cdef class InMemoryLookupKB(KnowledgeBase):
         else:
             entity_vector = vector_list[i]
             if len(entity_vector) != self.entity_vector_length:
-                raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
+                raise ValueError(
+                    Errors.E141.format(
+                        found=len(entity_vector),
+                        required=self.entity_vector_length
+                    )
+                )

             entry.entity_hash = entity_hash
             entry.freq = freq_list[i]

@@ -149,11 +162,15 @@ cdef class InMemoryLookupKB(KnowledgeBase):
         previous_alias_nr = self.get_size_aliases()
         # Throw an error if the length of entities and probabilities are not the same
         if not len(entities) == len(probabilities):
-            raise ValueError(Errors.E132.format(alias=alias,
-                                                entities_length=len(entities),
-                                                probabilities_length=len(probabilities)))
+            raise ValueError(
+                Errors.E132.format(
+                    alias=alias,
+                    entities_length=len(entities),
+                    probabilities_length=len(probabilities))
+            )

-        # Throw an error if the probabilities sum up to more than 1 (allow for some rounding errors)
+        # Throw an error if the probabilities sum up to more than 1 (allow for
+        # some rounding errors)
         prob_sum = sum(probabilities)
         if prob_sum > 1.00001:
             raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum))
@@ -170,40 +187,47 @@ cdef class InMemoryLookupKB(KnowledgeBase):

         for entity, prob in zip(entities, probabilities):
             entity_hash = self.vocab.strings[entity]
-            if not entity_hash in self._entry_index:
+            if entity_hash not in self._entry_index:
                 raise ValueError(Errors.E134.format(entity=entity))

             entry_index = <int64_t>self._entry_index.get(entity_hash)
             entry_indices.push_back(int(entry_index))
             probs.push_back(float(prob))

-        new_index = self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs)
+        new_index = self.c_add_aliases(
+            alias_hash=alias_hash, entry_indices=entry_indices, probs=probs
+        )
         self._alias_index[alias_hash] = new_index

         if previous_alias_nr + 1 != self.get_size_aliases():
             raise RuntimeError(Errors.E891.format(alias=alias))
         return alias_hash

-    def append_alias(self, str alias, str entity, float prior_prob, ignore_warnings=False):
+    def append_alias(
+        self, str alias, str entity, float prior_prob, ignore_warnings=False
+    ):
         """
-        For an alias already existing in the KB, extend its potential entities with one more.
+        For an alias already existing in the KB, extend its potential entities
+        with one more.
         Throw a warning if either the alias or the entity is unknown,
         or when the combination is already previously recorded.
         Throw an error if this entity+prior prob would exceed the sum of 1.
-        For efficiency, it's best to use the method `add_alias` as much as possible instead of this one.
+        For efficiency, it's best to use the method `add_alias` as much as
+        possible instead of this one.
         """
         # Check if the alias exists in the KB
         cdef hash_t alias_hash = self.vocab.strings[alias]
-        if not alias_hash in self._alias_index:
+        if alias_hash not in self._alias_index:
             raise ValueError(Errors.E176.format(alias=alias))

         # Check if the entity exists in the KB
         cdef hash_t entity_hash = self.vocab.strings[entity]
-        if not entity_hash in self._entry_index:
+        if entity_hash not in self._entry_index:
             raise ValueError(Errors.E134.format(entity=entity))
         entry_index = <int64_t>self._entry_index.get(entity_hash)

-        # Throw an error if the prior probabilities (including the new one) sum up to more than 1
+        # Throw an error if the prior probabilities (including the new one)
+        # sum up to more than 1
         alias_index = <int64_t>self._alias_index.get(alias_hash)
         alias_entry = self._aliases_table[alias_index]
         current_sum = sum([p for p in alias_entry.probs])
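add_entity(), add_alias() and append_alias() reformatted above are the public API for populating an in-memory KB. A short usage sketch with toy identifiers and probabilities:

```python
# Usage sketch for the methods above; entity IDs, frequencies and probabilities
# are toy values.
from spacy.kb import InMemoryLookupKB
from spacy.vocab import Vocab

kb = InMemoryLookupKB(Vocab(), entity_vector_length=3)
kb.add_entity(entity="Q1004791", freq=12, entity_vector=[1.0, 2.0, 3.0])
kb.add_entity(entity="Q90", freq=5, entity_vector=[-1.0, 3.0, 7.0])
# One alias with two candidate entities; the probabilities may sum to at most
# ~1.0, otherwise add_alias() raises the E133 error handled above.
kb.add_alias(alias="Paris", entities=["Q1004791", "Q90"], probabilities=[0.2, 0.7])
print([c.entity_ for c in kb.get_alias_candidates("Paris")])
```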
@@ -236,12 +260,13 @@ cdef class InMemoryLookupKB(KnowledgeBase):

     def get_alias_candidates(self, str alias) -> Iterable[Candidate]:
         """
-        Return candidate entities for an alias. Each candidate defines the entity, the original alias,
-        and the prior probability of that alias resolving to that entity.
+        Return candidate entities for an alias. Each candidate defines the
+        entity, the original alias, and the prior probability of that alias
+        resolving to that entity.
         If the alias is not known in the KB, and empty list is returned.
         """
         cdef hash_t alias_hash = self.vocab.strings[alias]
-        if not alias_hash in self._alias_index:
+        if alias_hash not in self._alias_index:
             return []
         alias_index = <int64_t>self._alias_index.get(alias_hash)
         alias_entry = self._aliases_table[alias_index]

@@ -249,10 +274,14 @@ cdef class InMemoryLookupKB(KnowledgeBase):
         return [Candidate(kb=self,
                           entity_hash=self._entries[entry_index].entity_hash,
                           entity_freq=self._entries[entry_index].freq,
-                          entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
+                          entity_vector=self._vectors_table[
+                              self._entries[entry_index].vector_index
+                          ],
                           alias_hash=alias_hash,
                           prior_prob=prior_prob)
-                for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
+                for (entry_index, prior_prob) in zip(
+                    alias_entry.entry_indices, alias_entry.probs
+                )
                 if entry_index != 0]

     def get_vector(self, str entity):

@@ -266,8 +295,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
         return self._vectors_table[self._entries[entry_index].vector_index]

     def get_prior_prob(self, str entity, str alias):
-        """ Return the prior probability of a given alias being linked to a given entity,
-        or return 0.0 when this combination is not known in the knowledge base"""
+        """ Return the prior probability of a given alias being linked to a
+        given entity, or return 0.0 when this combination is not known in the
+        knowledge base."""
         cdef hash_t alias_hash = self.vocab.strings[alias]
         cdef hash_t entity_hash = self.vocab.strings[entity]

@@ -278,7 +308,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
         entry_index = self._entry_index[entity_hash]

         alias_entry = self._aliases_table[alias_index]
-        for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs):
+        for (entry_index, prior_prob) in zip(
+            alias_entry.entry_indices, alias_entry.probs
+        ):
             if self._entries[entry_index].entity_hash == entity_hash:
                 return prior_prob

@@ -288,13 +320,19 @@ cdef class InMemoryLookupKB(KnowledgeBase):
         """Serialize the current state to a binary string.
         """
         def serialize_header():
-            header = (self.get_size_entities(), self.get_size_aliases(), self.entity_vector_length)
+            header = (
+                self.get_size_entities(),
+                self.get_size_aliases(),
+                self.entity_vector_length
+            )
             return srsly.json_dumps(header)

         def serialize_entries():
             i = 1
             tuples = []
-            for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
+            for entry_hash, entry_index in sorted(
+                self._entry_index.items(), key=lambda x: x[1]
+            ):
                 entry = self._entries[entry_index]
                 assert entry.entity_hash == entry_hash
                 assert entry_index == i

@@ -307,7 +345,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
             headers = []
             indices_lists = []
             probs_lists = []
-            for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
+            for alias_hash, alias_index in sorted(
+                self._alias_index.items(), key=lambda x: x[1]
+            ):
                 alias = self._aliases_table[alias_index]
                 assert alias_index == i
                 candidate_length = len(alias.entry_indices)

@@ -365,7 +405,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
             indices = srsly.json_loads(all_data[1])
             probs = srsly.json_loads(all_data[2])
             for header, indices, probs in zip(headers, indices, probs):
-                alias_hash, candidate_length = header
+                alias_hash, _candidate_length = header
                 alias.entry_indices = indices
                 alias.probs = probs
                 self._aliases_table[i] = alias

@@ -414,10 +454,14 @@ cdef class InMemoryLookupKB(KnowledgeBase):
                     writer.write_vector_element(element)
                 i = i+1

-        # dumping the entry records in the order in which they are in the _entries vector.
-        # index 0 is a dummy object not stored in the _entry_index and can be ignored.
+        # dumping the entry records in the order in which they are in the
+        # _entries vector.
+        # index 0 is a dummy object not stored in the _entry_index and can
+        # be ignored.
         i = 1
-        for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
+        for entry_hash, entry_index in sorted(
|
||||||
|
self._entry_index.items(), key=lambda x: x[1]
|
||||||
|
):
|
||||||
entry = self._entries[entry_index]
|
entry = self._entries[entry_index]
|
||||||
assert entry.entity_hash == entry_hash
|
assert entry.entity_hash == entry_hash
|
||||||
assert entry_index == i
|
assert entry_index == i
|
||||||
|
@ -429,7 +473,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
# dumping the aliases in the order in which they are in the _alias_index vector.
|
# dumping the aliases in the order in which they are in the _alias_index vector.
|
||||||
# index 0 is a dummy object not stored in the _aliases_table and can be ignored.
|
# index 0 is a dummy object not stored in the _aliases_table and can be ignored.
|
||||||
i = 1
|
i = 1
|
||||||
for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
|
for alias_hash, alias_index in sorted(
|
||||||
|
self._alias_index.items(), key=lambda x: x[1]
|
||||||
|
):
|
||||||
alias = self._aliases_table[alias_index]
|
alias = self._aliases_table[alias_index]
|
||||||
assert alias_index == i
|
assert alias_index == i
|
||||||
|
|
||||||
|
@ -535,7 +581,8 @@ cdef class Writer:
|
||||||
def __init__(self, path):
|
def __init__(self, path):
|
||||||
assert isinstance(path, Path)
|
assert isinstance(path, Path)
|
||||||
content = bytes(path)
|
content = bytes(path)
|
||||||
cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
|
cdef bytes bytes_loc = content.encode('utf8') \
|
||||||
|
if type(content) == str else content
|
||||||
self._fp = fopen(<char*>bytes_loc, 'wb')
|
self._fp = fopen(<char*>bytes_loc, 'wb')
|
||||||
if not self._fp:
|
if not self._fp:
|
||||||
raise IOError(Errors.E146.format(path=path))
|
raise IOError(Errors.E146.format(path=path))
|
||||||
|
@ -545,14 +592,18 @@ cdef class Writer:
|
||||||
cdef size_t status = fclose(self._fp)
|
cdef size_t status = fclose(self._fp)
|
||||||
assert status == 0
|
assert status == 0
|
||||||
|
|
||||||
cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1:
|
cdef int write_header(
|
||||||
|
self, int64_t nr_entries, int64_t entity_vector_length
|
||||||
|
) except -1:
|
||||||
self._write(&nr_entries, sizeof(nr_entries))
|
self._write(&nr_entries, sizeof(nr_entries))
|
||||||
self._write(&entity_vector_length, sizeof(entity_vector_length))
|
self._write(&entity_vector_length, sizeof(entity_vector_length))
|
||||||
|
|
||||||
cdef int write_vector_element(self, float element) except -1:
|
cdef int write_vector_element(self, float element) except -1:
|
||||||
self._write(&element, sizeof(element))
|
self._write(&element, sizeof(element))
|
||||||
|
|
||||||
cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1:
|
cdef int write_entry(
|
||||||
|
self, hash_t entry_hash, float entry_freq, int32_t vector_index
|
||||||
|
) except -1:
|
||||||
self._write(&entry_hash, sizeof(entry_hash))
|
self._write(&entry_hash, sizeof(entry_hash))
|
||||||
self._write(&entry_freq, sizeof(entry_freq))
|
self._write(&entry_freq, sizeof(entry_freq))
|
||||||
self._write(&vector_index, sizeof(vector_index))
|
self._write(&vector_index, sizeof(vector_index))
|
||||||
|
@ -561,7 +612,9 @@ cdef class Writer:
|
||||||
cdef int write_alias_length(self, int64_t alias_length) except -1:
|
cdef int write_alias_length(self, int64_t alias_length) except -1:
|
||||||
self._write(&alias_length, sizeof(alias_length))
|
self._write(&alias_length, sizeof(alias_length))
|
||||||
|
|
||||||
cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1:
|
cdef int write_alias_header(
|
||||||
|
self, hash_t alias_hash, int64_t candidate_length
|
||||||
|
) except -1:
|
||||||
self._write(&alias_hash, sizeof(alias_hash))
|
self._write(&alias_hash, sizeof(alias_hash))
|
||||||
self._write(&candidate_length, sizeof(candidate_length))
|
self._write(&candidate_length, sizeof(candidate_length))
|
||||||
|
|
||||||
|
@ -577,16 +630,19 @@ cdef class Writer:
|
||||||
cdef class Reader:
|
cdef class Reader:
|
||||||
def __init__(self, path):
|
def __init__(self, path):
|
||||||
content = bytes(path)
|
content = bytes(path)
|
||||||
cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
|
cdef bytes bytes_loc = content.encode('utf8') \
|
||||||
|
if type(content) == str else content
|
||||||
self._fp = fopen(<char*>bytes_loc, 'rb')
|
self._fp = fopen(<char*>bytes_loc, 'rb')
|
||||||
if not self._fp:
|
if not self._fp:
|
||||||
PyErr_SetFromErrno(IOError)
|
PyErr_SetFromErrno(IOError)
|
||||||
status = fseek(self._fp, 0, 0) # this can be 0 if there is no header
|
fseek(self._fp, 0, 0) # this can be 0 if there is no header
|
||||||
|
|
||||||
def __dealloc__(self):
|
def __dealloc__(self):
|
||||||
fclose(self._fp)
|
fclose(self._fp)
|
||||||
|
|
||||||
cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1:
|
cdef int read_header(
|
||||||
|
self, int64_t* nr_entries, int64_t* entity_vector_length
|
||||||
|
) except -1:
|
||||||
status = self._read(nr_entries, sizeof(int64_t))
|
status = self._read(nr_entries, sizeof(int64_t))
|
||||||
if status < 1:
|
if status < 1:
|
||||||
if feof(self._fp):
|
if feof(self._fp):
|
||||||
|
@ -606,7 +662,9 @@ cdef class Reader:
|
||||||
return 0 # end of file
|
return 0 # end of file
|
||||||
raise IOError(Errors.E145.format(param="vector element"))
|
raise IOError(Errors.E145.format(param="vector element"))
|
||||||
|
|
||||||
cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1:
|
cdef int read_entry(
|
||||||
|
self, hash_t* entity_hash, float* freq, int32_t* vector_index
|
||||||
|
) except -1:
|
||||||
status = self._read(entity_hash, sizeof(hash_t))
|
status = self._read(entity_hash, sizeof(hash_t))
|
||||||
if status < 1:
|
if status < 1:
|
||||||
if feof(self._fp):
|
if feof(self._fp):
|
||||||
|
@ -637,7 +695,9 @@ cdef class Reader:
|
||||||
return 0 # end of file
|
return 0 # end of file
|
||||||
raise IOError(Errors.E145.format(param="alias length"))
|
raise IOError(Errors.E145.format(param="alias length"))
|
||||||
|
|
||||||
cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1:
|
cdef int read_alias_header(
|
||||||
|
self, hash_t* alias_hash, int64_t* candidate_length
|
||||||
|
) except -1:
|
||||||
status = self._read(alias_hash, sizeof(hash_t))
|
status = self._read(alias_hash, sizeof(hash_t))
|
||||||
if status < 1:
|
if status < 1:
|
||||||
if feof(self._fp):
|
if feof(self._fp):
|
||||||
|
|
|
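Not part of this diff: a minimal usage sketch of the lookups and binary serialization touched above, assuming a spaCy version that ships InMemoryLookupKB (v3.5+); the entity IDs, frequencies and vectors are made up.

    import tempfile
    from pathlib import Path

    import spacy
    from spacy.kb import InMemoryLookupKB

    nlp = spacy.blank("en")
    kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=3)
    kb.add_entity(entity="Q1004791", freq=6, entity_vector=[0, 3, 5])
    kb.add_entity(entity="Q42", freq=342, entity_vector=[1, 9, -3])
    kb.add_alias(alias="Douglas", entities=["Q1004791", "Q42"], probabilities=[0.6, 0.3])

    # Each candidate carries the entity, the original alias and the prior probability.
    for cand in kb.get_alias_candidates("Douglas"):
        print(cand.entity_, cand.prior_prob)
    print(kb.get_prior_prob(entity="Q42", alias="Douglas"))  # 0.3

    # The Writer/Reader pair above backs the binary on-disk round trip.
    kb_path = Path(tempfile.mkdtemp()) / "kb"
    kb.to_disk(kb_path)
    kb2 = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=3)
    kb2.from_disk(kb_path)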
@@ -739,6 +739,11 @@ class Language:
 )
 )
 pipe = source.get_pipe(source_name)
+# There is no actual solution here. Either the component has the right
+# name for the source pipeline or the component has the right name for
+# the current pipeline. This prioritizes the current pipeline.
+if hasattr(pipe, "name"):
+pipe.name = name
 # Make sure the source config is interpolated so we don't end up with
 # orphaned variables in our final config
 source_config = source.config.interpolate()

@@ -816,6 +821,7 @@ class Language:
 pipe_index = self._get_pipe_index(before, after, first, last)
 self._pipe_meta[name] = self.get_factory_meta(factory_name)
 self._components.insert(pipe_index, (name, pipe_component))
+self._link_components()
 return pipe_component

 def _get_pipe_index(

@@ -951,6 +957,7 @@ class Language:
 if old_name in self._config["initialize"]["components"]:
 init_cfg = self._config["initialize"]["components"].pop(old_name)
 self._config["initialize"]["components"][new_name] = init_cfg
+self._link_components()

 def remove_pipe(self, name: str) -> Tuple[str, PipeCallable]:
 """Remove a component from the pipeline.

@@ -974,6 +981,7 @@ class Language:
 # Make sure the name is also removed from the set of disabled components
 if name in self.disabled:
 self._disabled.remove(name)
+self._link_components()
 return removed

 def disable_pipe(self, name: str) -> None:

@@ -1702,8 +1710,16 @@ class Language:
 # The problem is we need to do it during deserialization...And the
 # components don't receive the pipeline then. So this does have to be
 # here :(
+# First, fix up all the internal component names in case they have
+# gotten out of sync due to sourcing components from different
+# pipelines, since find_listeners uses proc2.name for the listener
+# map.
+for name, proc in self.pipeline:
+if hasattr(proc, "name"):
+proc.name = name
 for i, (name1, proc1) in enumerate(self.pipeline):
 if isinstance(proc1, ty.ListenedToComponent):
+proc1.listener_map = {}
 for name2, proc2 in self.pipeline[i + 1 :]:
 proc1.find_listeners(proc2)

@@ -1809,7 +1825,6 @@ class Language:
 # Later we replace the component config with the raw config again.
 interpolated = filled.interpolate() if not filled.is_interpolated else filled
 pipeline = interpolated.get("components", {})
-sourced = util.get_sourced_components(interpolated)
 # If components are loaded from a source (existing models), we cache
 # them here so they're only loaded once
 source_nlps = {}

@@ -1837,6 +1852,7 @@ class Language:
 raw_config=raw_config,
 )
 else:
+assert "source" in pipe_cfg
 # We need the sourced components to reference the same
 # vocab without modifying the current vocab state **AND**
 # we still want to load the source model vectors to perform

@@ -1856,6 +1872,10 @@ class Language:
 source_name = pipe_cfg.get("component", pipe_name)
 listeners_replaced = False
 if "replace_listeners" in pipe_cfg:
+# Make sure that the listened-to component has the
+# state of the source pipeline listener map so that the
+# replace_listeners method below works as intended.
+source_nlps[model]._link_components()
 for name, proc in source_nlps[model].pipeline:
 if source_name in getattr(proc, "listening_components", []):
 source_nlps[model].replace_listeners(

@@ -1867,6 +1887,8 @@ class Language:
 nlp.add_pipe(
 source_name, source=source_nlps[model], name=pipe_name
 )
+# At this point after nlp.add_pipe, the listener map
+# corresponds to the new pipeline.
 if model not in source_nlp_vectors_hashes:
 source_nlp_vectors_hashes[model] = hash(
 source_nlps[model].vocab.vectors.to_bytes(

@@ -1921,27 +1943,6 @@ class Language:
 raise ValueError(
 Errors.E942.format(name="pipeline_creation", value=type(nlp))
 )
-# Detect components with listeners that are not frozen consistently
-for name, proc in nlp.pipeline:
-if isinstance(proc, ty.ListenedToComponent):
-# Remove listeners not in the pipeline
-listener_names = proc.listening_components
-unused_listener_names = [
-ll for ll in listener_names if ll not in nlp.pipe_names
-]
-for listener_name in unused_listener_names:
-for listener in proc.listener_map.get(listener_name, []):
-proc.remove_listener(listener, listener_name)
-
-for listener_name in proc.listening_components:
-# e.g. tok2vec/transformer
-# If it's a component sourced from another pipeline, we check if
-# the tok2vec listeners should be replaced with standalone tok2vec
-# models (e.g. so component can be frozen without its performance
-# degrading when other components/tok2vec are updated)
-paths = sourced.get(listener_name, {}).get("replace_listeners", [])
-if paths:
-nlp.replace_listeners(name, listener_name, paths)
 return nlp

 def replace_listeners(

@@ -1956,7 +1957,7 @@ class Language:
 useful when training a pipeline with components sourced from an existing
 pipeline: if multiple components (e.g. tagger, parser, NER) listen to
 the same tok2vec component, but some of them are frozen and not updated,
-their performance may degrade significally as the tok2vec component is
+their performance may degrade significantly as the tok2vec component is
 updated with new data. To prevent this, listeners can be replaced with
 a standalone tok2vec layer that is owned by the component and doesn't
 change if the component isn't updated.
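Not part of this diff: a sketch of the sourcing and listener-replacement flow these Language changes affect. It assumes a trained pipeline such as en_core_web_sm is installed; the component names are the usual ones but otherwise illustrative.

    import spacy

    source_nlp = spacy.load("en_core_web_sm")
    nlp = spacy.blank("en")

    # Sourcing copies components into the new pipeline; add_pipe/rename_pipe/
    # remove_pipe now call _link_components() so listener maps stay in sync.
    nlp.add_pipe("tok2vec", source=source_nlp)
    nlp.add_pipe("ner", source=source_nlp)

    # Give the sourced ner its own copy of the tok2vec layer, so it stops
    # listening to the shared component and can be frozen during training
    # without degrading as the shared tok2vec is updated.
    nlp.replace_listeners("tok2vec", "ner", ["model.tok2vec"])
    print(nlp.pipe_names)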
@@ -1,7 +1,6 @@
 # cython: embedsignature=True
 # Compiler crashes on memory view coercion without this. Should report bug.
 cimport numpy as np
-from cython.view cimport array as cvarray
 from libc.string cimport memset

 np.import_array()

@@ -35,7 +34,7 @@ from .typedefs cimport attr_t, flags_t
 from .attrs import intify_attrs
 from .errors import Errors, Warnings

 OOV_RANK = 0xffffffffffffffff  # UINT64_MAX
 memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
 EMPTY_LEXEME.id = OOV_RANK

@@ -105,7 +104,7 @@ cdef class Lexeme:
 if isinstance(value, float):
 continue
 elif isinstance(value, (int, long)):
 Lexeme.set_struct_attr(self.c, attr, value)
 else:
 Lexeme.set_struct_attr(self.c, attr, self.vocab.strings.add(value))

@@ -137,10 +136,12 @@ cdef class Lexeme:
 if hasattr(other, "orth"):
 if self.c.orth == other.orth:
 return 1.0
-elif hasattr(other, "__len__") and len(other) == 1 \
-and hasattr(other[0], "orth"):
-if self.c.orth == other[0].orth:
-return 1.0
+elif (
+hasattr(other, "__len__") and len(other) == 1
+and hasattr(other[0], "orth")
+and self.c.orth == other[0].orth
+):
+return 1.0
 if self.vector_norm == 0 or other.vector_norm == 0:
 warnings.warn(Warnings.W008.format(obj="Lexeme"))
 return 0.0

@@ -149,7 +150,7 @@ cdef class Lexeme:
 result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
 # ensure we get a scalar back (numpy does this automatically but cupy doesn't)
 return result.item()

 @property
 def has_vector(self):
 """RETURNS (bool): Whether a word vector is associated with the object.
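Not part of this diff: the reformatted Lexeme.similarity branch above short-circuits when the other object is a length-1 container holding the same orth. A quick sketch, assuming a vectors pipeline such as en_core_web_md is installed.

    import spacy

    nlp = spacy.load("en_core_web_md")
    apple = nlp.vocab["apple"]
    orange = nlp.vocab["orange"]
    print(apple.similarity(orange))        # cosine similarity of the two vectors
    print(apple.similarity(nlp("apple")))  # length-1 Doc with the same orth -> 1.0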
@@ -108,7 +108,7 @@ cdef class DependencyMatcher:
 key (str): The match ID.
 RETURNS (bool): Whether the matcher contains rules for this match ID.
 """
-return self.has_key(key)
+return self.has_key(key)  # no-cython-lint: W601

 def _validate_input(self, pattern, key):
 idx = 0

@@ -264,7 +264,7 @@ cdef class DependencyMatcher:
 def remove(self, key):
 key = self._normalize_key(key)
-if not key in self._patterns:
+if key not in self._patterns:
 raise ValueError(Errors.E175.format(key=key))
 self._patterns.pop(key)
 self._raw_patterns.pop(key)

@@ -382,7 +382,7 @@ cdef class DependencyMatcher:
 return []
 return [doc[node].head]

-def _gov(self,doc,node):
+def _gov(self, doc, node):
 return list(doc[node].children)

 def _dep_chain(self, doc, node):

@@ -443,7 +443,7 @@ cdef class DependencyMatcher:
 def _right_child(self, doc, node):
 return [child for child in doc[node].rights]

 def _left_child(self, doc, node):
 return [child for child in doc[node].lefts]

@@ -461,7 +461,7 @@ cdef class DependencyMatcher:
 if doc[node].head.i > node:
 return [doc[node].head]
 return []

 def _left_parent(self, doc, node):
 if doc[node].head.i < node:
 return [doc[node].head]
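Not part of this diff: a short DependencyMatcher sketch exercising the cleaned-up __contains__/remove paths, assuming a pipeline with a parser (e.g. en_core_web_sm) is installed.

    import spacy
    from spacy.matcher import DependencyMatcher

    nlp = spacy.load("en_core_web_sm")
    matcher = DependencyMatcher(nlp.vocab)
    pattern = [
        {"RIGHT_ID": "verb", "RIGHT_ATTRS": {"POS": "VERB"}},
        # ">" (immediate dependent) is one of the operators backed by the
        # traversal helpers (_gov, _dep_chain, ...) reformatted above.
        {"LEFT_ID": "verb", "REL_OP": ">", "RIGHT_ID": "subject",
         "RIGHT_ATTRS": {"DEP": "nsubj"}},
    ]
    matcher.add("VERB_SUBJECT", [pattern])
    doc = nlp("The quick brown fox jumps over the lazy dog.")
    for match_id, token_ids in matcher(doc):
        print([doc[i].text for i in token_ids])
    print("VERB_SUBJECT" in matcher)  # __contains__ -> has_key()
    matcher.remove("VERB_SUBJECT")    # unknown keys raise E175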
@@ -12,31 +12,18 @@ import warnings
 import srsly

-from ..attrs cimport (
-DEP,
-ENT_IOB,
-ID,
-LEMMA,
-MORPH,
-NULL_ATTR,
-ORTH,
-POS,
-TAG,
-attr_id_t,
-)
+from ..attrs cimport DEP, ENT_IOB, ID, LEMMA, MORPH, NULL_ATTR, POS, TAG
 from ..structs cimport TokenC
 from ..tokens.doc cimport Doc, get_token_attr_for_matcher
 from ..tokens.morphanalysis cimport MorphAnalysis
 from ..tokens.span cimport Span
 from ..tokens.token cimport Token
 from ..typedefs cimport attr_t
-from ..vocab cimport Vocab

 from ..attrs import IDS
 from ..errors import Errors, MatchPatternError, Warnings
 from ..schemas import validate_token_pattern
 from ..strings import get_string_id
-from ..util import registry
 from .levenshtein import levenshtein_compare

 DEF PADDING = 5

@@ -87,9 +74,9 @@ cdef class Matcher:
 key (str): The match ID.
 RETURNS (bool): Whether the matcher contains rules for this match ID.
 """
-return self.has_key(key)
+return self.has_key(key)  # no-cython-lint: W601

-def add(self, key, patterns, *, on_match=None, greedy: str=None):
+def add(self, key, patterns, *, on_match=None, greedy: str = None):
 """Add a match-rule to the matcher. A match-rule consists of: an ID
 key, an on_match callback, and one or more patterns.

@@ -143,8 +130,13 @@ cdef class Matcher:
 key = self._normalize_key(key)
 for pattern in patterns:
 try:
-specs = _preprocess_pattern(pattern, self.vocab,
-self._extensions, self._extra_predicates, self._fuzzy_compare)
+specs = _preprocess_pattern(
+pattern,
+self.vocab,
+self._extensions,
+self._extra_predicates,
+self._fuzzy_compare
+)
 self.patterns.push_back(init_pattern(self.mem, key, specs))
 for spec in specs:
 for attr, _ in spec[1]:

@@ -168,7 +160,7 @@ cdef class Matcher:
 key (str): The ID of the match rule.
 """
 norm_key = self._normalize_key(key)
-if not norm_key in self._patterns:
+if norm_key not in self._patterns:
 raise ValueError(Errors.E175.format(key=key))
 self._patterns.pop(norm_key)
 self._callbacks.pop(norm_key)

@@ -268,8 +260,15 @@ cdef class Matcher:
 if self.patterns.empty():
 matches = []
 else:
-matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
-extensions=self._extensions, predicates=self._extra_predicates, with_alignments=with_alignments)
+matches = find_matches(
+&self.patterns[0],
+self.patterns.size(),
+doclike,
+length,
+extensions=self._extensions,
+predicates=self._extra_predicates,
+with_alignments=with_alignments
+)
 final_matches = []
 pairs_by_id = {}
 # For each key, either add all matches, or only the filtered,

@@ -289,9 +288,9 @@ cdef class Matcher:
 memset(matched, 0, length * sizeof(matched[0]))
 span_filter = self._filter.get(key)
 if span_filter == "FIRST":
 sorted_pairs = sorted(pairs, key=lambda x: (x[0], -x[1]), reverse=False)  # sort by start
 elif span_filter == "LONGEST":
 sorted_pairs = sorted(pairs, key=lambda x: (x[1]-x[0], -x[0]), reverse=True)  # reverse sort by length
 else:
 raise ValueError(Errors.E947.format(expected=["FIRST", "LONGEST"], arg=span_filter))
 for match in sorted_pairs:

@@ -366,7 +365,6 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
 cdef vector[MatchC] matches
 cdef vector[vector[MatchAlignmentC]] align_states
 cdef vector[vector[MatchAlignmentC]] align_matches
-cdef PatternStateC state
 cdef int i, j, nr_extra_attr
 cdef Pool mem = Pool()
 output = []

@@ -388,14 +386,22 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
 value = token.vocab.strings[value]
 extra_attr_values[i * nr_extra_attr + index] = value
 # Main loop
-cdef int nr_predicate = len(predicates)
 for i in range(length):
 for j in range(n):
 states.push_back(PatternStateC(patterns[j], i, 0))
 if with_alignments != 0:
 align_states.resize(states.size())
-transition_states(states, matches, align_states, align_matches, predicate_cache,
-doclike[i], extra_attr_values, predicates, with_alignments)
+transition_states(
+states,
+matches,
+align_states,
+align_matches,
+predicate_cache,
+doclike[i],
+extra_attr_values,
+predicates,
+with_alignments
+)
 extra_attr_values += nr_extra_attr
 predicate_cache += len(predicates)
 # Handle matches that end in 0-width patterns

@@ -421,18 +427,28 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
 return output

-cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
-vector[vector[MatchAlignmentC]]& align_states, vector[vector[MatchAlignmentC]]& align_matches,
-int8_t* cached_py_predicates,
-Token token, const attr_t* extra_attrs, py_predicates, bint with_alignments) except *:
+cdef void transition_states(
+vector[PatternStateC]& states,
+vector[MatchC]& matches,
+vector[vector[MatchAlignmentC]]& align_states,
+vector[vector[MatchAlignmentC]]& align_matches,
+int8_t* cached_py_predicates,
+Token token,
+const attr_t* extra_attrs,
+py_predicates,
+bint with_alignments
+) except *:
 cdef int q = 0
 cdef vector[PatternStateC] new_states
 cdef vector[vector[MatchAlignmentC]] align_new_states
-cdef int nr_predicate = len(py_predicates)
 for i in range(states.size()):
 if states[i].pattern.nr_py >= 1:
-update_predicate_cache(cached_py_predicates,
-states[i].pattern, token, py_predicates)
+update_predicate_cache(
+cached_py_predicates,
+states[i].pattern,
+token,
+py_predicates
+)
 action = get_action(states[i], token.c, extra_attrs,
 cached_py_predicates)
 if action == REJECT:

@@ -468,8 +484,12 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
 align_new_states.push_back(align_states[q])
 states[q].pattern += 1
 if states[q].pattern.nr_py != 0:
-update_predicate_cache(cached_py_predicates,
-states[q].pattern, token, py_predicates)
+update_predicate_cache(
+cached_py_predicates,
+states[q].pattern,
+token,
+py_predicates
+)
 action = get_action(states[q], token.c, extra_attrs,
 cached_py_predicates)
 # Update alignment before the transition of current state

@@ -485,8 +505,12 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
 ent_id = get_ent_id(state.pattern)
 if action == MATCH:
 matches.push_back(
-MatchC(pattern_id=ent_id, start=state.start,
-length=state.length+1))
+MatchC(
+pattern_id=ent_id,
+start=state.start,
+length=state.length+1
+)
+)
 # `align_matches` always corresponds to `matches` 1:1
 if with_alignments != 0:
 align_matches.push_back(align_states[q])

@@ -494,23 +518,35 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
 # push match without last token if length > 0
 if state.length > 0:
 matches.push_back(
-MatchC(pattern_id=ent_id, start=state.start,
-length=state.length))
+MatchC(
+pattern_id=ent_id,
+start=state.start,
+length=state.length
+)
+)
 # MATCH_DOUBLE emits matches twice,
 # add one more to align_matches in order to keep 1:1 relationship
 if with_alignments != 0:
 align_matches.push_back(align_states[q])
 # push match with last token
 matches.push_back(
-MatchC(pattern_id=ent_id, start=state.start,
-length=state.length+1))
+MatchC(
+pattern_id=ent_id,
+start=state.start,
+length=state.length + 1
+)
+)
 # `align_matches` always corresponds to `matches` 1:1
 if with_alignments != 0:
 align_matches.push_back(align_states[q])
 elif action == MATCH_REJECT:
 matches.push_back(
-MatchC(pattern_id=ent_id, start=state.start,
-length=state.length))
+MatchC(
+pattern_id=ent_id,
+start=state.start,
+length=state.length
+)
+)
 # `align_matches` always corresponds to `matches` 1:1
 if with_alignments != 0:
 align_matches.push_back(align_states[q])

@@ -533,8 +569,12 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
 align_states.push_back(align_new_states[i])

-cdef int update_predicate_cache(int8_t* cache,
-const TokenPatternC* pattern, Token token, predicates) except -1:
+cdef int update_predicate_cache(
+int8_t* cache,
+const TokenPatternC* pattern,
+Token token,
+predicates
+) except -1:
 # If the state references any extra predicates, check whether they match.
 # These are cached, so that we don't call these potentially expensive
 # Python functions more than we need to.

@@ -580,10 +620,12 @@ cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states,
 else:
 state.pattern += 1

-cdef action_t get_action(PatternStateC state,
-const TokenC* token, const attr_t* extra_attrs,
-const int8_t* predicate_matches) nogil:
+cdef action_t get_action(
+PatternStateC state,
+const TokenC * token,
+const attr_t * extra_attrs,
+const int8_t * predicate_matches
+) nogil:
 """We need to consider:
 a) Does the token match the specification? [Yes, No]
 b) What's the quantifier? [1, 0+, ?]

@@ -649,53 +691,56 @@ cdef action_t get_action(PatternStateC state,
 is_match = not is_match
 quantifier = ONE
 if quantifier == ONE:
 if is_match and is_final:
 # Yes, final: 1000
 return MATCH
 elif is_match and not is_final:
 # Yes, non-final: 0100
 return ADVANCE
 elif not is_match and is_final:
 # No, final: 0000
 return REJECT
 else:
 return REJECT
 elif quantifier == ZERO_PLUS:
 if is_match and is_final:
 # Yes, final: 1001
 return MATCH_EXTEND
 elif is_match and not is_final:
 # Yes, non-final: 0011
 return RETRY_EXTEND
 elif not is_match and is_final:
 # No, final 2000 (note: Don't include last token!)
 return MATCH_REJECT
 else:
 # No, non-final 0010
 return RETRY
 elif quantifier == ZERO_ONE:
 if is_match and is_final:
 # Yes, final: 3000
 # To cater for a pattern ending in "?", we need to add
 # a match both with and without the last token
 return MATCH_DOUBLE
 elif is_match and not is_final:
 # Yes, non-final: 0110
 # We need both branches here, consider a pair like:
 # pattern: .?b string: b
 # If we 'ADVANCE' on the .?, we miss the match.
 return RETRY_ADVANCE
 elif not is_match and is_final:
 # No, final 2000 (note: Don't include last token!)
 return MATCH_REJECT
 else:
 # No, non-final 0010
 return RETRY

-cdef int8_t get_is_match(PatternStateC state,
-const TokenC* token, const attr_t* extra_attrs,
-const int8_t* predicate_matches) nogil:
+cdef int8_t get_is_match(
+PatternStateC state,
+const TokenC* token,
+const attr_t* extra_attrs,
+const int8_t* predicate_matches
+) nogil:
 for i in range(state.pattern.nr_py):
 if predicate_matches[state.pattern.py_predicates[i]] == -1:
 return 0

@@ -860,7 +905,7 @@ class _FuzzyPredicate:
 self.is_extension = is_extension
 if self.predicate not in self.operators:
 raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
 fuzz = self.predicate[len("FUZZY"):]  # number after prefix
 self.fuzzy = int(fuzz) if fuzz else -1
 self.fuzzy_compare = fuzzy_compare
 self.key = _predicate_cache_key(self.attr, self.predicate, value, fuzzy=self.fuzzy)

@@ -1082,7 +1127,7 @@ def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types,
 elif cls == _FuzzyPredicate:
 if isinstance(value, dict):
 # add predicates inside fuzzy operator
 fuzz = type_[len("FUZZY"):]  # number after prefix
 fuzzy_val = int(fuzz) if fuzz else -1
 output.extend(_get_extra_predicates_dict(attr, value, vocab, predicate_types,
 extra_predicates, seen_predicates,

@@ -1101,8 +1146,9 @@ def _get_extra_predicates_dict(attr, value_dict, vocab, predicate_types,
 return output

-def _get_extension_extra_predicates(spec, extra_predicates, predicate_types,
-seen_predicates):
+def _get_extension_extra_predicates(
+spec, extra_predicates, predicate_types, seen_predicates
+):
 output = []
 for attr, value in spec.items():
 if isinstance(value, dict):

@@ -1131,7 +1177,7 @@ def _get_operators(spec):
 return (ONE,)
 elif spec["OP"] in lookup:
 return lookup[spec["OP"]]
-#Min_max {n,m}
+# Min_max {n,m}
 elif spec["OP"].startswith("{") and spec["OP"].endswith("}"):
 # {n} --> {n,n} exactly n ONE,(n)
 # {n,m}--> {n,m} min of n, max of m ONE,(n),ZERO_ONE,(m)

@@ -1142,8 +1188,8 @@ def _get_operators(spec):
 min_max = min_max if "," in min_max else f"{min_max},{min_max}"
 n, m = min_max.split(",")

-#1. Either n or m is a blank string and the other is numeric -->isdigit
-#2. Both are numeric and n <= m
+# 1. Either n or m is a blank string and the other is numeric -->isdigit
+# 2. Both are numeric and n <= m
 if (not n.isdecimal() and not m.isdecimal()) or (n.isdecimal() and m.isdecimal() and int(n) > int(m)):
 keys = ", ".join(lookup.keys()) + ", {n}, {n,m}, {n,}, {,m} where n and m are integers and n <= m "
 raise ValueError(Errors.E011.format(op=spec["OP"], opts=keys))
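Not part of this diff: a Matcher sketch touching the pieces reformatted above: the greedy span filter, the "?" and "{n,m}" operators expanded by _get_operators, and a FUZZY predicate (spaCy v3.5+ is assumed).

    import spacy
    from spacy.matcher import Matcher

    nlp = spacy.blank("en")
    matcher = Matcher(nlp.vocab)
    # greedy="LONGEST" applies the LONGEST span filter to overlapping matches.
    matcher.add("HELLO_WORLD", [[{"LOWER": "hello"}, {"LOWER": "world", "OP": "?"}]],
                greedy="LONGEST")
    # {1,3} is expanded into ONE/ZERO_ONE quantifiers by _get_operators().
    matcher.add("VERY_GOOD", [[{"LOWER": "very", "OP": "{1,3}"}, {"LOWER": "good"}]])
    # FUZZY predicates compare tokens with levenshtein_compare.
    matcher.add("FUZZY_HELLO", [[{"LOWER": {"FUZZY": "hello"}}]])

    doc = nlp("hello world, a very very good helo example")
    for match_id, start, end in matcher(doc):
        print(nlp.vocab.strings[match_id], "->", doc[start:end].text)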
@@ -1,14 +1,12 @@
 # cython: infer_types=True, profile=True
-from libc.stdint cimport uintptr_t
 from preshed.maps cimport map_clear, map_get, map_init, map_iter, map_set

 import warnings

-from ..attrs cimport DEP, LEMMA, MORPH, ORTH, POS, TAG
+from ..attrs cimport DEP, LEMMA, MORPH, POS, TAG

 from ..attrs import IDS

-from ..structs cimport TokenC
 from ..tokens.span cimport Span
 from ..tokens.token cimport Token
 from ..typedefs cimport attr_t
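Not part of this diff: the import cleanup above is in the PhraseMatcher module; a minimal usage sketch for reference.

    import spacy
    from spacy.matcher import PhraseMatcher

    nlp = spacy.blank("en")
    matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    terms = ["machine learning", "natural language processing"]
    matcher.add("TECH_TERMS", [nlp.make_doc(term) for term in terms])
    doc = nlp.make_doc("Natural language processing powers machine learning apps.")
    for match_id, start, end in matcher(doc):
        print(doc[start:end].text)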
@@ -40,11 +40,16 @@ cdef ActivationsC alloc_activations(SizesC n) nogil

 cdef void free_activations(const ActivationsC* A) nogil

-cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
-const WeightsC* W, SizesC n) nogil
+cdef void predict_states(
+CBlas cblas, ActivationsC* A, StateC** states, const WeightsC* W, SizesC n
+) nogil

 cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil

-cdef void cpu_log_loss(float* d_scores,
-const float* costs, const int* is_valid, const float* scores, int O) nogil
+cdef void cpu_log_loss(
+float* d_scores,
+const float* costs,
+const int* is_valid,
+const float* scores,
+int O
+) nogil
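Not part of this diff: a rough NumPy sketch of the gradient the cpu_log_loss declaration above refers to. This is an approximation for intuition (softmax over all classes minus a softmax restricted to the zero-cost classes), not the compiled implementation.

    import numpy as np

    def log_loss_gradient(scores, costs, is_valid):
        # Assumed simplification of cpu_log_loss: d_scores = softmax(scores)
        # minus a softmax over the "gold" (zero-cost, valid) classes only.
        scores = np.asarray(scores, dtype="float64")
        gold = (np.asarray(costs) <= 0) & np.asarray(is_valid, dtype=bool)
        exp_all = np.exp(scores - scores.max())
        exp_gold = np.where(gold, np.exp(scores - scores[gold].max()), 0.0)
        return exp_all / exp_all.sum() - exp_gold / exp_gold.sum()

    d = log_loss_gradient([1.0, 2.0, 0.5], costs=[1, 0, 1], is_valid=[1, 1, 1])
    print(d, d.sum())  # gradient components roughly sum to zero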
@@ -8,13 +8,13 @@ from thinc.backends.linalg cimport Vec, VecVec

 import numpy
 import numpy.random
-from thinc.api import CupyOps, Model, NumpyOps, get_ops
+from thinc.api import CupyOps, Model, NumpyOps

 from .. import util
 from ..errors import Errors

 from ..pipeline._parser_internals.stateclass cimport StateClass
-from ..typedefs cimport class_t, hash_t, weight_t
+from ..typedefs cimport weight_t

 cdef WeightsC get_c_weights(model) except *:

@@ -78,33 +78,48 @@ cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
 A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
 A._max_size = n.states
 else:
-A.token_ids = <int*>realloc(A.token_ids,
-n.states * n.feats * sizeof(A.token_ids[0]))
-A.scores = <float*>realloc(A.scores,
-n.states * n.classes * sizeof(A.scores[0]))
-A.unmaxed = <float*>realloc(A.unmaxed,
-n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
-A.hiddens = <float*>realloc(A.hiddens,
-n.states * n.hiddens * sizeof(A.hiddens[0]))
-A.is_valid = <int*>realloc(A.is_valid,
-n.states * n.classes * sizeof(A.is_valid[0]))
+A.token_ids = <int*>realloc(
+A.token_ids, n.states * n.feats * sizeof(A.token_ids[0])
+)
+A.scores = <float*>realloc(
+A.scores, n.states * n.classes * sizeof(A.scores[0])
+)
+A.unmaxed = <float*>realloc(
+A.unmaxed, n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0])
+)
+A.hiddens = <float*>realloc(
+A.hiddens, n.states * n.hiddens * sizeof(A.hiddens[0])
+)
+A.is_valid = <int*>realloc(
+A.is_valid, n.states * n.classes * sizeof(A.is_valid[0])
+)
 A._max_size = n.states
 A._curr_size = n.states

-cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
-const WeightsC* W, SizesC n) nogil:
-cdef double one = 1.0
+cdef void predict_states(
+CBlas cblas, ActivationsC* A, StateC** states, const WeightsC* W, SizesC n
+) nogil:
 resize_activations(A, n)
 for i in range(n.states):
 states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
 memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
 memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
-sum_state_features(cblas, A.unmaxed,
-W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces)
+sum_state_features(
+cblas,
+A.unmaxed,
+W.feat_weights,
+A.token_ids,
+n.states,
+n.feats,
+n.hiddens * n.pieces
+)
 for i in range(n.states):
-VecVec.add_i(&A.unmaxed[i*n.hiddens*n.pieces],
-W.feat_bias, 1., n.hiddens * n.pieces)
+VecVec.add_i(
+&A.unmaxed[i*n.hiddens*n.pieces],
+W.feat_bias, 1.,
+n.hiddens * n.pieces
+)
 for j in range(n.hiddens):
 index = i * n.hiddens * n.pieces + j * n.pieces
 which = Vec.arg_max(&A.unmaxed[index], n.pieces)

@@ -114,14 +129,15 @@ cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
 memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
 else:
 # Compute hidden-to-output
-sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
+sgemm(cblas)(
+False, True, n.states, n.classes, n.hiddens,
 1.0, <const float *>A.hiddens, n.hiddens,
 <const float *>W.hidden_weights, n.hiddens,
-0.0, A.scores, n.classes)
+0.0, A.scores, n.classes
+)
 # Add bias
 for i in range(n.states):
-VecVec.add_i(&A.scores[i*n.classes],
-W.hidden_bias, 1., n.classes)
+VecVec.add_i(&A.scores[i*n.classes], W.hidden_bias, 1., n.classes)
 # Set unseen classes to minimum value
 i = 0
 min_ = A.scores[0]

@@ -134,9 +150,16 @@ cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
 A.scores[i*n.classes+j] = min_

-cdef void sum_state_features(CBlas cblas, float* output,
-const float* cached, const int* token_ids, int B, int F, int O) nogil:
-cdef int idx, b, f, i
+cdef void sum_state_features(
+CBlas cblas,
+float* output,
+const float* cached,
+const int* token_ids,
+int B,
+int F,
+int O
+) nogil:
+cdef int idx, b, f
 cdef const float* feature
 padding = cached
 cached += F * O

@@ -153,9 +176,13 @@ cdef void sum_state_features(CBlas cblas, float* output,
 token_ids += F

-cdef void cpu_log_loss(float* d_scores,
-const float* costs, const int* is_valid, const float* scores,
-int O) nogil:
+cdef void cpu_log_loss(
+float* d_scores,
+const float* costs,
+const int* is_valid,
+const float* scores,
+int O
+) nogil:
 """Do multi-label log loss"""
 cdef double max_, gmax, Z, gZ
 best = arg_max_if_gold(scores, costs, is_valid, O)

@@ -179,8 +206,9 @@ cdef void cpu_log_loss(
 d_scores[i] = exp(scores[i]-max_) / Z

-cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
-const int* is_valid, int n) nogil:
+cdef int arg_max_if_gold(
+const weight_t* scores, const weight_t* costs, const int* is_valid, int n
+) nogil:
 # Find minimum cost
 cdef float cost = 1
 for i in range(n):

@@ -204,10 +232,17 @@ cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) no
 return best

 class ParserStepModel(Model):
-def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True,
-dropout=0.1):
+def __init__(
+self,
+docs,
+layers,
+*,
+has_upper,
+unseen_classes=None,
+train=True,
+dropout=0.1
+):
 Model.__init__(self, name="parser_step_model", forward=step_forward)
 self.attrs["has_upper"] = has_upper
 self.attrs["dropout_rate"] = dropout

@@ -268,8 +303,10 @@ class ParserStepModel(Model):
 return ids

 def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
-if isinstance(self.state2vec.ops, CupyOps) \
-and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
+if (
+isinstance(self.state2vec.ops, CupyOps)
+and not isinstance(token_ids, self.state2vec.ops.xp.ndarray)
+):
 # Move token_ids and d_vector to GPU, asynchronously
 self.backprops.append((
 util.get_async(self.cuda_stream, token_ids),

@@ -279,7 +316,6 @@ class ParserStepModel(Model):
 else:
 self.backprops.append((token_ids, d_vector, get_d_tokvecs))

 def finish_steps(self, golds):
 # Add a padding vector to the d_tokvecs gradient, so that missing
 # values don't affect the real gradient.

@@ -292,14 +328,15 @@ class ParserStepModel(Model):
 ids = ids.flatten()
 d_state_features = d_state_features.reshape(
 (ids.size, d_state_features.shape[2]))
-self.ops.scatter_add(d_tokvecs, ids,
-d_state_features)
+self.ops.scatter_add(d_tokvecs, ids, d_state_features)
 # Padded -- see update()
 self.bp_tokvecs(d_tokvecs[:-1])
 return d_tokvecs

 NUMPY_OPS = NumpyOps()

 def step_forward(model: ParserStepModel, states, is_train):
 token_ids = model.get_token_ids(states)
 vector, get_d_tokvecs = model.state2vec(token_ids, is_train)

@@ -312,7 +349,7 @@ def step_forward(model: ParserStepModel, states, is_train):
 scores, get_d_vector = model.vec2scores(vector, is_train)
 else:
 scores = NumpyOps().asarray(vector)
-get_d_vector = lambda d_scores: d_scores
+get_d_vector = lambda d_scores: d_scores  # no-cython-lint: E731
 # If the class is unseen, make sure its score is minimum
 scores[:, model._class_mask == 0] = numpy.nanmin(scores)

@@ -448,9 +485,11 @@ cdef class precompute_hiddens:
 feat_weights = self.get_feat_weights()
 cdef int[:, ::1] ids = token_ids
-sum_state_features(cblas, <float*>state_vector.data,
-feat_weights, &ids[0,0],
-token_ids.shape[0], self.nF, self.nO*self.nP)
+sum_state_features(
+cblas, <float*>state_vector.data,
+feat_weights, &ids[0, 0],
+token_ids.shape[0], self.nF, self.nO*self.nP
|
||||||
|
)
|
||||||
state_vector += self.bias
|
state_vector += self.bias
|
||||||
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
|
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
|
||||||
|
|
||||||
|
@ -475,7 +514,7 @@ cdef class precompute_hiddens:
|
||||||
|
|
||||||
def backprop_maxout(d_best):
|
def backprop_maxout(d_best):
|
||||||
return self.ops.backprop_maxout(d_best, mask, self.nP)
|
return self.ops.backprop_maxout(d_best, mask, self.nP)
|
||||||
|
|
||||||
return state_vector, backprop_maxout
|
return state_vector, backprop_maxout
|
||||||
|
|
||||||
def _relu_nonlinearity(self, state_vector):
|
def _relu_nonlinearity(self, state_vector):
|
||||||
|
@ -489,5 +528,5 @@ cdef class precompute_hiddens:
|
||||||
def backprop_relu(d_best):
|
def backprop_relu(d_best):
|
||||||
d_best *= mask
|
d_best *= mask
|
||||||
return d_best.reshape((d_best.shape + (1,)))
|
return d_best.reshape((d_best.shape + (1,)))
|
||||||
|
|
||||||
return state_vector, backprop_relu
|
return state_vector, backprop_relu
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
import warnings
|
||||||
from typing import Callable, List, Optional, Sequence, Tuple, cast
|
from typing import Callable, List, Optional, Sequence, Tuple, cast
|
||||||
|
|
||||||
from thinc.api import Model, Ops, registry
|
from thinc.api import Model, Ops, registry
|
||||||
|
@ -5,7 +6,8 @@ from thinc.initializers import glorot_uniform_init
|
||||||
from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
|
from thinc.types import Floats1d, Floats2d, Ints1d, Ragged
|
||||||
from thinc.util import partial
|
from thinc.util import partial
|
||||||
|
|
||||||
from ..errors import Errors
|
from ..attrs import ORTH
|
||||||
|
from ..errors import Errors, Warnings
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc
|
||||||
from ..vectors import Mode
|
from ..vectors import Mode
|
||||||
from ..vocab import Vocab
|
from ..vocab import Vocab
|
||||||
|
@ -24,6 +26,8 @@ def StaticVectors(
|
||||||
linear projection to control the dimensionality. If a dropout rate is
|
linear projection to control the dimensionality. If a dropout rate is
|
||||||
specified, the dropout is applied per dimension over the whole batch.
|
specified, the dropout is applied per dimension over the whole batch.
|
||||||
"""
|
"""
|
||||||
|
if key_attr != "ORTH":
|
||||||
|
warnings.warn(Warnings.W125, DeprecationWarning)
|
||||||
return Model(
|
return Model(
|
||||||
"static_vectors",
|
"static_vectors",
|
||||||
forward,
|
forward,
|
||||||
|
@ -40,9 +44,9 @@ def forward(
|
||||||
token_count = sum(len(doc) for doc in docs)
|
token_count = sum(len(doc) for doc in docs)
|
||||||
if not token_count:
|
if not token_count:
|
||||||
return _handle_empty(model.ops, model.get_dim("nO"))
|
return _handle_empty(model.ops, model.get_dim("nO"))
|
||||||
key_attr: int = model.attrs["key_attr"]
|
|
||||||
keys = model.ops.flatten([cast(Ints1d, doc.to_array(key_attr)) for doc in docs])
|
|
||||||
vocab: Vocab = docs[0].vocab
|
vocab: Vocab = docs[0].vocab
|
||||||
|
key_attr: int = getattr(vocab.vectors, "attr", ORTH)
|
||||||
|
keys = model.ops.flatten([cast(Ints1d, doc.to_array(key_attr)) for doc in docs])
|
||||||
W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
|
W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
|
||||||
if vocab.vectors.mode == Mode.default:
|
if vocab.vectors.mode == Mode.default:
|
||||||
V = model.ops.asarray(vocab.vectors.data)
|
V = model.ops.asarray(vocab.vectors.data)
|
||||||
|
|
|
@ -11,7 +11,7 @@ from .typedefs cimport attr_t, hash_t
|
||||||
cdef class Morphology:
|
cdef class Morphology:
|
||||||
cdef readonly Pool mem
|
cdef readonly Pool mem
|
||||||
cdef readonly StringStore strings
|
cdef readonly StringStore strings
|
||||||
cdef PreshMap tags # Keyed by hash, value is pointer to tag
|
cdef PreshMap tags # Keyed by hash, value is pointer to tag
|
||||||
|
|
||||||
cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *
|
cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *
|
||||||
cdef int insert(self, MorphAnalysisC tag) except -1
|
cdef int insert(self, MorphAnalysisC tag) except -1
|
||||||
|
@ -20,4 +20,8 @@ cdef class Morphology:
|
||||||
cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil
|
cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil
|
||||||
cdef list list_features(const MorphAnalysisC* morph)
|
cdef list list_features(const MorphAnalysisC* morph)
|
||||||
cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field)
|
cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field)
|
||||||
cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t field) nogil
|
cdef int get_n_by_field(
|
||||||
|
attr_t* results,
|
||||||
|
const MorphAnalysisC* morph,
|
||||||
|
attr_t field,
|
||||||
|
) nogil
|
||||||
|
|
|
@ -83,10 +83,11 @@ cdef class Morphology:
|
||||||
features = self.normalize_attrs(features)
|
features = self.normalize_attrs(features)
|
||||||
string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
|
string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
|
||||||
# normalized UFEATS string with sorted fields and values
|
# normalized UFEATS string with sorted fields and values
|
||||||
norm_feats_string = self.FEATURE_SEP.join(sorted([
|
norm_feats_string = self.FEATURE_SEP.join(
|
||||||
self.FIELD_SEP.join([field, values])
|
sorted(
|
||||||
for field, values in string_features.items()
|
[self.FIELD_SEP.join([field, values]) for field, values in string_features.items()]
|
||||||
]))
|
)
|
||||||
|
)
|
||||||
return norm_feats_string or self.EMPTY_MORPH
|
return norm_feats_string or self.EMPTY_MORPH
|
||||||
|
|
||||||
def normalize_attrs(self, attrs):
|
def normalize_attrs(self, attrs):
|
||||||
|
@ -192,6 +193,7 @@ cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t fie
|
||||||
n_results += 1
|
n_results += 1
|
||||||
return n_results
|
return n_results
|
||||||
|
|
||||||
|
|
||||||
def unpickle_morphology(strings, tags):
|
def unpickle_morphology(strings, tags):
|
||||||
cdef Morphology morphology = Morphology(strings)
|
cdef Morphology morphology = Morphology(strings)
|
||||||
for tag in tags:
|
for tag in tags:
|
||||||
|
|
|
@ -8,7 +8,7 @@ cpdef enum univ_pos_t:
|
||||||
ADV
|
ADV
|
||||||
AUX
|
AUX
|
||||||
CONJ
|
CONJ
|
||||||
CCONJ # U20
|
CCONJ # U20
|
||||||
DET
|
DET
|
||||||
INTJ
|
INTJ
|
||||||
NOUN
|
NOUN
|
||||||
|
|
|
@ -46,11 +46,18 @@ cdef struct EditTreeC:
|
||||||
bint is_match_node
|
bint is_match_node
|
||||||
NodeC inner
|
NodeC inner
|
||||||
|
|
||||||
cdef inline EditTreeC edittree_new_match(len_t prefix_len, len_t suffix_len,
|
cdef inline EditTreeC edittree_new_match(
|
||||||
uint32_t prefix_tree, uint32_t suffix_tree):
|
len_t prefix_len,
|
||||||
cdef MatchNodeC match_node = MatchNodeC(prefix_len=prefix_len,
|
len_t suffix_len,
|
||||||
suffix_len=suffix_len, prefix_tree=prefix_tree,
|
uint32_t prefix_tree,
|
||||||
suffix_tree=suffix_tree)
|
uint32_t suffix_tree
|
||||||
|
):
|
||||||
|
cdef MatchNodeC match_node = MatchNodeC(
|
||||||
|
prefix_len=prefix_len,
|
||||||
|
suffix_len=suffix_len,
|
||||||
|
prefix_tree=prefix_tree,
|
||||||
|
suffix_tree=suffix_tree
|
||||||
|
)
|
||||||
cdef NodeC inner = NodeC(match_node=match_node)
|
cdef NodeC inner = NodeC(match_node=match_node)
|
||||||
return EditTreeC(is_match_node=True, inner=inner)
|
return EditTreeC(is_match_node=True, inner=inner)
|
||||||
|
|
||||||
|
|
|
@ -5,8 +5,6 @@ from libc.string cimport memset
|
||||||
from libcpp.pair cimport pair
|
from libcpp.pair cimport pair
|
||||||
from libcpp.vector cimport vector
|
from libcpp.vector cimport vector
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from ...typedefs cimport hash_t
|
from ...typedefs cimport hash_t
|
||||||
|
|
||||||
from ... import util
|
from ... import util
|
||||||
|
@ -25,17 +23,16 @@ cdef LCS find_lcs(str source, str target):
|
||||||
target (str): The second string.
|
target (str): The second string.
|
||||||
RETURNS (LCS): The spans of the longest common subsequences.
|
RETURNS (LCS): The spans of the longest common subsequences.
|
||||||
"""
|
"""
|
||||||
cdef Py_ssize_t source_len = len(source)
|
|
||||||
cdef Py_ssize_t target_len = len(target)
|
cdef Py_ssize_t target_len = len(target)
|
||||||
cdef size_t longest_align = 0;
|
cdef size_t longest_align = 0
|
||||||
cdef int source_idx, target_idx
|
cdef int source_idx, target_idx
|
||||||
cdef LCS lcs
|
cdef LCS lcs
|
||||||
cdef Py_UCS4 source_cp, target_cp
|
cdef Py_UCS4 source_cp, target_cp
|
||||||
|
|
||||||
memset(&lcs, 0, sizeof(lcs))
|
memset(&lcs, 0, sizeof(lcs))
|
||||||
|
|
||||||
cdef vector[size_t] prev_aligns = vector[size_t](target_len);
|
cdef vector[size_t] prev_aligns = vector[size_t](target_len)
|
||||||
cdef vector[size_t] cur_aligns = vector[size_t](target_len);
|
cdef vector[size_t] cur_aligns = vector[size_t](target_len)
|
||||||
|
|
||||||
for (source_idx, source_cp) in enumerate(source):
|
for (source_idx, source_cp) in enumerate(source):
|
||||||
for (target_idx, target_cp) in enumerate(target):
|
for (target_idx, target_cp) in enumerate(target):
|
||||||
|
@ -89,7 +86,7 @@ cdef class EditTrees:
|
||||||
cdef LCS lcs = find_lcs(form, lemma)
|
cdef LCS lcs = find_lcs(form, lemma)
|
||||||
|
|
||||||
cdef EditTreeC tree
|
cdef EditTreeC tree
|
||||||
cdef uint32_t tree_id, prefix_tree, suffix_tree
|
cdef uint32_t prefix_tree, suffix_tree
|
||||||
if lcs_is_empty(lcs):
|
if lcs_is_empty(lcs):
|
||||||
tree = edittree_new_subst(self.strings.add(form), self.strings.add(lemma))
|
tree = edittree_new_subst(self.strings.add(form), self.strings.add(lemma))
|
||||||
else:
|
else:
|
||||||
|
@ -108,7 +105,7 @@ cdef class EditTrees:
|
||||||
return self._tree_id(tree)
|
return self._tree_id(tree)
|
||||||
|
|
||||||
cdef uint32_t _tree_id(self, EditTreeC tree):
|
cdef uint32_t _tree_id(self, EditTreeC tree):
|
||||||
# If this tree has been constructed before, return its identifier.
|
# If this tree has been constructed before, return its identifier.
|
||||||
cdef hash_t hash = edittree_hash(tree)
|
cdef hash_t hash = edittree_hash(tree)
|
||||||
cdef unordered_map[hash_t, uint32_t].iterator iter = self.map.find(hash)
|
cdef unordered_map[hash_t, uint32_t].iterator iter = self.map.find(hash)
|
||||||
if iter != self.map.end():
|
if iter != self.map.end():
|
||||||
|
@ -289,6 +286,7 @@ def _tree2dict(tree):
|
||||||
tree = tree["inner"]["subst_node"]
|
tree = tree["inner"]["subst_node"]
|
||||||
return(dict(tree))
|
return(dict(tree))
|
||||||
|
|
||||||
|
|
||||||
def _dict2tree(tree):
|
def _dict2tree(tree):
|
||||||
errors = validate_edit_tree(tree)
|
errors = validate_edit_tree(tree)
|
||||||
if errors:
|
if errors:
|
||||||
|
|
|
@ -1,17 +1,14 @@
|
||||||
# cython: infer_types=True
|
# cython: infer_types=True
|
||||||
# cython: profile=True
|
# cython: profile=True
|
||||||
cimport numpy as np
|
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
|
|
||||||
from cpython.ref cimport Py_XDECREF, PyObject
|
|
||||||
from thinc.extra.search cimport Beam
|
from thinc.extra.search cimport Beam
|
||||||
|
|
||||||
from thinc.extra.search import MaxViolation
|
from thinc.extra.search import MaxViolation
|
||||||
|
|
||||||
from thinc.extra.search cimport MaxViolation
|
from thinc.extra.search cimport MaxViolation
|
||||||
|
|
||||||
from ...typedefs cimport class_t, hash_t
|
from ...typedefs cimport class_t
|
||||||
from .transition_system cimport Transition, TransitionSystem
|
from .transition_system cimport Transition, TransitionSystem
|
||||||
|
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
|
@ -146,7 +143,6 @@ def update_beam(TransitionSystem moves, states, golds, model, int width, beam_de
|
||||||
cdef MaxViolation violn
|
cdef MaxViolation violn
|
||||||
pbeam = BeamBatch(moves, states, golds, width=width, density=beam_density)
|
pbeam = BeamBatch(moves, states, golds, width=width, density=beam_density)
|
||||||
gbeam = BeamBatch(moves, states, golds, width=width, density=0.0)
|
gbeam = BeamBatch(moves, states, golds, width=width, density=0.0)
|
||||||
cdef StateClass state
|
|
||||||
beam_maps = []
|
beam_maps = []
|
||||||
backprops = []
|
backprops = []
|
||||||
violns = [MaxViolation() for _ in range(len(states))]
|
violns = [MaxViolation() for _ in range(len(states))]
|
||||||
|
|
|
@ -277,7 +277,6 @@ cdef cppclass StateC:
|
||||||
|
|
||||||
return n
|
return n
|
||||||
|
|
||||||
|
|
||||||
int n_L(int head) nogil const:
|
int n_L(int head) nogil const:
|
||||||
return n_arcs(this._left_arcs, head)
|
return n_arcs(this._left_arcs, head)
|
||||||
|
|
||||||
|
|
|
@ -9,7 +9,7 @@ from ...strings cimport hash_string
|
||||||
from ...structs cimport TokenC
|
from ...structs cimport TokenC
|
||||||
from ...tokens.doc cimport Doc, set_children_from_heads
|
from ...tokens.doc cimport Doc, set_children_from_heads
|
||||||
from ...tokens.token cimport MISSING_DEP
|
from ...tokens.token cimport MISSING_DEP
|
||||||
from ...typedefs cimport attr_t, hash_t
|
from ...typedefs cimport attr_t
|
||||||
|
|
||||||
from ...training import split_bilu_label
|
from ...training import split_bilu_label
|
||||||
|
|
||||||
|
@ -68,8 +68,9 @@ cdef struct GoldParseStateC:
|
||||||
weight_t pop_cost
|
weight_t pop_cost
|
||||||
|
|
||||||
|
|
||||||
cdef GoldParseStateC create_gold_state(Pool mem, const StateC* state,
|
cdef GoldParseStateC create_gold_state(
|
||||||
heads, labels, sent_starts) except *:
|
Pool mem, const StateC* state, heads, labels, sent_starts
|
||||||
|
) except *:
|
||||||
cdef GoldParseStateC gs
|
cdef GoldParseStateC gs
|
||||||
gs.length = len(heads)
|
gs.length = len(heads)
|
||||||
gs.stride = 1
|
gs.stride = 1
|
||||||
|
@ -82,7 +83,7 @@ cdef GoldParseStateC create_gold_state(Pool mem, const StateC* state,
|
||||||
gs.n_kids_in_stack = <int32_t*>mem.alloc(gs.length, sizeof(gs.n_kids_in_stack[0]))
|
gs.n_kids_in_stack = <int32_t*>mem.alloc(gs.length, sizeof(gs.n_kids_in_stack[0]))
|
||||||
|
|
||||||
for i, is_sent_start in enumerate(sent_starts):
|
for i, is_sent_start in enumerate(sent_starts):
|
||||||
if is_sent_start == True:
|
if is_sent_start is True:
|
||||||
gs.state_bits[i] = set_state_flag(
|
gs.state_bits[i] = set_state_flag(
|
||||||
gs.state_bits[i],
|
gs.state_bits[i],
|
||||||
IS_SENT_START,
|
IS_SENT_START,
|
||||||
|
@ -210,6 +211,7 @@ cdef class ArcEagerGold:
|
||||||
def update(self, StateClass stcls):
|
def update(self, StateClass stcls):
|
||||||
update_gold_state(&self.c, stcls.c)
|
update_gold_state(&self.c, stcls.c)
|
||||||
|
|
||||||
|
|
||||||
def _get_aligned_sent_starts(example):
|
def _get_aligned_sent_starts(example):
|
||||||
"""Get list of SENT_START attributes aligned to the predicted tokenization.
|
"""Get list of SENT_START attributes aligned to the predicted tokenization.
|
||||||
If the reference has no sentence starts, return a list of None values.
|
If the reference has no sentence starts, return a list of None values.
|
||||||
|
@ -524,7 +526,6 @@ cdef class Break:
|
||||||
"""
|
"""
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
cdef bint is_valid(const StateC* st, attr_t label) nogil:
|
||||||
cdef int i
|
|
||||||
if st.buffer_length() < 2:
|
if st.buffer_length() < 2:
|
||||||
return False
|
return False
|
||||||
elif st.B(1) != st.B(0) + 1:
|
elif st.B(1) != st.B(0) + 1:
|
||||||
|
@ -556,8 +557,8 @@ cdef class Break:
|
||||||
cost -= 1
|
cost -= 1
|
||||||
if gold.heads[si] == b0:
|
if gold.heads[si] == b0:
|
||||||
cost -= 1
|
cost -= 1
|
||||||
if not is_sent_start(gold, state.B(1)) \
|
if not is_sent_start(gold, state.B(1)) and\
|
||||||
and not is_sent_start_unknown(gold, state.B(1)):
|
not is_sent_start_unknown(gold, state.B(1)):
|
||||||
cost += 1
|
cost += 1
|
||||||
return cost
|
return cost
|
||||||
|
|
||||||
|
@ -803,7 +804,6 @@ cdef class ArcEager(TransitionSystem):
|
||||||
raise TypeError(Errors.E909.format(name="ArcEagerGold"))
|
raise TypeError(Errors.E909.format(name="ArcEagerGold"))
|
||||||
cdef ArcEagerGold gold_ = gold
|
cdef ArcEagerGold gold_ = gold
|
||||||
gold_state = gold_.c
|
gold_state = gold_.c
|
||||||
n_gold = 0
|
|
||||||
if self.c[i].is_valid(stcls.c, self.c[i].label):
|
if self.c[i].is_valid(stcls.c, self.c[i].label):
|
||||||
cost = self.c[i].get_cost(stcls.c, &gold_state, self.c[i].label)
|
cost = self.c[i].get_cost(stcls.c, &gold_state, self.c[i].label)
|
||||||
else:
|
else:
|
||||||
|
@ -875,7 +875,7 @@ cdef class ArcEager(TransitionSystem):
|
||||||
print("Gold")
|
print("Gold")
|
||||||
for token in example.y:
|
for token in example.y:
|
||||||
print(token.i, token.text, token.dep_, token.head.text)
|
print(token.i, token.text, token.dep_, token.head.text)
|
||||||
aligned_heads, aligned_labels = example.get_aligned_parse()
|
aligned_heads, _aligned_labels = example.get_aligned_parse()
|
||||||
print("Aligned heads")
|
print("Aligned heads")
|
||||||
for i, head in enumerate(aligned_heads):
|
for i, head in enumerate(aligned_heads):
|
||||||
print(example.x[i], example.x[head] if head is not None else "__")
|
print(example.x[i], example.x[head] if head is not None else "__")
|
||||||
|
|
|
@ -1,6 +1,3 @@
|
||||||
import os
|
|
||||||
import random
|
|
||||||
|
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from libc.stdint cimport int32_t
|
from libc.stdint cimport int32_t
|
||||||
|
|
||||||
|
@ -14,7 +11,7 @@ from ...tokens.span import Span
|
||||||
|
|
||||||
from ...attrs cimport IS_SPACE
|
from ...attrs cimport IS_SPACE
|
||||||
from ...lexeme cimport Lexeme
|
from ...lexeme cimport Lexeme
|
||||||
from ...structs cimport SpanC, TokenC
|
from ...structs cimport SpanC
|
||||||
from ...tokens.span cimport Span
|
from ...tokens.span cimport Span
|
||||||
from ...typedefs cimport attr_t, weight_t
|
from ...typedefs cimport attr_t, weight_t
|
||||||
|
|
||||||
|
@ -141,11 +138,10 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
OUT: Counter()
|
OUT: Counter()
|
||||||
}
|
}
|
||||||
actions[OUT][''] = 1 # Represents a token predicted to be outside of any entity
|
actions[OUT][''] = 1 # Represents a token predicted to be outside of any entity
|
||||||
actions[UNIT][''] = 1 # Represents a token prohibited to be in an entity
|
actions[UNIT][''] = 1 # Represents a token prohibited to be in an entity
|
||||||
for entity_type in kwargs.get('entity_types', []):
|
for entity_type in kwargs.get('entity_types', []):
|
||||||
for action in (BEGIN, IN, LAST, UNIT):
|
for action in (BEGIN, IN, LAST, UNIT):
|
||||||
actions[action][entity_type] = 1
|
actions[action][entity_type] = 1
|
||||||
moves = ('M', 'B', 'I', 'L', 'U')
|
|
||||||
for example in kwargs.get('examples', []):
|
for example in kwargs.get('examples', []):
|
||||||
for token in example.y:
|
for token in example.y:
|
||||||
ent_type = token.ent_type_
|
ent_type = token.ent_type_
|
||||||
|
@ -164,7 +160,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
if token.ent_type:
|
if token.ent_type:
|
||||||
labels.add(token.ent_type_)
|
labels.add(token.ent_type_)
|
||||||
return labels
|
return labels
|
||||||
|
|
||||||
def move_name(self, int move, attr_t label):
|
def move_name(self, int move, attr_t label):
|
||||||
if move == OUT:
|
if move == OUT:
|
||||||
return 'O'
|
return 'O'
|
||||||
|
@ -325,7 +321,6 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
raise TypeError(Errors.E909.format(name="BiluoGold"))
|
raise TypeError(Errors.E909.format(name="BiluoGold"))
|
||||||
cdef BiluoGold gold_ = gold
|
cdef BiluoGold gold_ = gold
|
||||||
gold_state = gold_.c
|
gold_state = gold_.c
|
||||||
n_gold = 0
|
|
||||||
if self.c[i].is_valid(stcls.c, self.c[i].label):
|
if self.c[i].is_valid(stcls.c, self.c[i].label):
|
||||||
cost = self.c[i].get_cost(stcls.c, &gold_state, self.c[i].label)
|
cost = self.c[i].get_cost(stcls.c, &gold_state, self.c[i].label)
|
||||||
else:
|
else:
|
||||||
|
@ -486,10 +481,8 @@ cdef class In:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
|
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
|
||||||
gold = <GoldNERStateC*>_gold
|
gold = <GoldNERStateC*>_gold
|
||||||
move = IN
|
|
||||||
cdef int next_act = gold.ner[s.B(1)].move if s.B(1) >= 0 else OUT
|
cdef int next_act = gold.ner[s.B(1)].move if s.B(1) >= 0 else OUT
|
||||||
cdef int g_act = gold.ner[s.B(0)].move
|
cdef int g_act = gold.ner[s.B(0)].move
|
||||||
cdef attr_t g_tag = gold.ner[s.B(0)].label
|
|
||||||
cdef bint is_sunk = _entity_is_sunk(s, gold.ner)
|
cdef bint is_sunk = _entity_is_sunk(s, gold.ner)
|
||||||
|
|
||||||
if g_act == MISSING:
|
if g_act == MISSING:
|
||||||
|
@ -549,12 +542,10 @@ cdef class Last:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
|
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
|
||||||
gold = <GoldNERStateC*>_gold
|
gold = <GoldNERStateC*>_gold
|
||||||
move = LAST
|
|
||||||
b0 = s.B(0)
|
b0 = s.B(0)
|
||||||
ent_start = s.E(0)
|
ent_start = s.E(0)
|
||||||
|
|
||||||
cdef int g_act = gold.ner[b0].move
|
cdef int g_act = gold.ner[b0].move
|
||||||
cdef attr_t g_tag = gold.ner[b0].label
|
|
||||||
|
|
||||||
cdef int cost = 0
|
cdef int cost = 0
|
||||||
|
|
||||||
|
@ -650,7 +641,6 @@ cdef class Unit:
|
||||||
cost += 1
|
cost += 1
|
||||||
break
|
break
|
||||||
return cost
|
return cost
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Out:
|
cdef class Out:
|
||||||
|
@ -675,7 +665,6 @@ cdef class Out:
|
||||||
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
|
cdef weight_t cost(const StateC* s, const void* _gold, attr_t label) nogil:
|
||||||
gold = <GoldNERStateC*>_gold
|
gold = <GoldNERStateC*>_gold
|
||||||
cdef int g_act = gold.ner[s.B(0)].move
|
cdef int g_act = gold.ner[s.B(0)].move
|
||||||
cdef attr_t g_tag = gold.ner[s.B(0)].label
|
|
||||||
cdef weight_t cost = 0
|
cdef weight_t cost = 0
|
||||||
if g_act == MISSING:
|
if g_act == MISSING:
|
||||||
pass
|
pass
|
||||||
|
|
|
@ -125,14 +125,17 @@ def decompose(label):
|
||||||
def is_decorated(label):
|
def is_decorated(label):
|
||||||
return DELIMITER in label
|
return DELIMITER in label
|
||||||
|
|
||||||
|
|
||||||
def count_decorated_labels(gold_data):
|
def count_decorated_labels(gold_data):
|
||||||
freqs = {}
|
freqs = {}
|
||||||
for example in gold_data:
|
for example in gold_data:
|
||||||
proj_heads, deco_deps = projectivize(example.get_aligned("HEAD"),
|
proj_heads, deco_deps = projectivize(example.get_aligned("HEAD"),
|
||||||
example.get_aligned("DEP"))
|
example.get_aligned("DEP"))
|
||||||
# set the label to ROOT for each root dependent
|
# set the label to ROOT for each root dependent
|
||||||
deco_deps = ['ROOT' if head == i else deco_deps[i]
|
deco_deps = [
|
||||||
for i, head in enumerate(proj_heads)]
|
'ROOT' if head == i else deco_deps[i]
|
||||||
|
for i, head in enumerate(proj_heads)
|
||||||
|
]
|
||||||
# count label frequencies
|
# count label frequencies
|
||||||
for label in deco_deps:
|
for label in deco_deps:
|
||||||
if is_decorated(label):
|
if is_decorated(label):
|
||||||
|
@ -160,9 +163,9 @@ def projectivize(heads, labels):
|
||||||
|
|
||||||
|
|
||||||
cdef vector[int] _heads_to_c(heads):
|
cdef vector[int] _heads_to_c(heads):
|
||||||
cdef vector[int] c_heads;
|
cdef vector[int] c_heads
|
||||||
for head in heads:
|
for head in heads:
|
||||||
if head == None:
|
if head is None:
|
||||||
c_heads.push_back(-1)
|
c_heads.push_back(-1)
|
||||||
else:
|
else:
|
||||||
assert head < len(heads)
|
assert head < len(heads)
|
||||||
|
@ -199,6 +202,7 @@ def _decorate(heads, proj_heads, labels):
|
||||||
deco_labels.append(labels[tokenid])
|
deco_labels.append(labels[tokenid])
|
||||||
return deco_labels
|
return deco_labels
|
||||||
|
|
||||||
|
|
||||||
def get_smallest_nonproj_arc_slow(heads):
|
def get_smallest_nonproj_arc_slow(heads):
|
||||||
cdef vector[int] c_heads = _heads_to_c(heads)
|
cdef vector[int] c_heads = _heads_to_c(heads)
|
||||||
return _get_smallest_nonproj_arc(c_heads)
|
return _get_smallest_nonproj_arc(c_heads)
|
||||||
|
|
|
@ -1,6 +1,4 @@
|
||||||
# cython: infer_types=True
|
# cython: infer_types=True
|
||||||
import numpy
|
|
||||||
|
|
||||||
from libcpp.vector cimport vector
|
from libcpp.vector cimport vector
|
||||||
|
|
||||||
from ...tokens.doc cimport Doc
|
from ...tokens.doc cimport Doc
|
||||||
|
@ -38,11 +36,11 @@ cdef class StateClass:
|
||||||
cdef vector[ArcC] arcs
|
cdef vector[ArcC] arcs
|
||||||
self.c.get_arcs(&arcs)
|
self.c.get_arcs(&arcs)
|
||||||
return list(arcs)
|
return list(arcs)
|
||||||
#py_arcs = []
|
# py_arcs = []
|
||||||
#for arc in arcs:
|
# for arc in arcs:
|
||||||
# if arc.head != -1 and arc.child != -1:
|
# if arc.head != -1 and arc.child != -1:
|
||||||
# py_arcs.append((arc.head, arc.child, arc.label))
|
# py_arcs.append((arc.head, arc.child, arc.label))
|
||||||
#return arcs
|
# return arcs
|
||||||
|
|
||||||
def add_arc(self, int head, int child, int label):
|
def add_arc(self, int head, int child, int label):
|
||||||
self.c.add_arc(head, child, label)
|
self.c.add_arc(head, child, label)
|
||||||
|
@ -52,10 +50,10 @@ cdef class StateClass:
|
||||||
|
|
||||||
def H(self, int child):
|
def H(self, int child):
|
||||||
return self.c.H(child)
|
return self.c.H(child)
|
||||||
|
|
||||||
def L(self, int head, int idx):
|
def L(self, int head, int idx):
|
||||||
return self.c.L(head, idx)
|
return self.c.L(head, idx)
|
||||||
|
|
||||||
def R(self, int head, int idx):
|
def R(self, int head, int idx):
|
||||||
return self.c.R(head, idx)
|
return self.c.R(head, idx)
|
||||||
|
|
||||||
|
@ -98,7 +96,7 @@ cdef class StateClass:
|
||||||
|
|
||||||
def H(self, int i):
|
def H(self, int i):
|
||||||
return self.c.H(i)
|
return self.c.H(i)
|
||||||
|
|
||||||
def E(self, int i):
|
def E(self, int i):
|
||||||
return self.c.E(i)
|
return self.c.E(i)
|
||||||
|
|
||||||
|
@ -116,7 +114,7 @@ cdef class StateClass:
|
||||||
|
|
||||||
def H_(self, int i):
|
def H_(self, int i):
|
||||||
return self.doc[self.c.H(i)]
|
return self.doc[self.c.H(i)]
|
||||||
|
|
||||||
def E_(self, int i):
|
def E_(self, int i):
|
||||||
return self.doc[self.c.E(i)]
|
return self.doc[self.c.E(i)]
|
||||||
|
|
||||||
|
@ -125,7 +123,7 @@ cdef class StateClass:
|
||||||
|
|
||||||
def R_(self, int i, int idx):
|
def R_(self, int i, int idx):
|
||||||
return self.doc[self.c.R(i, idx)]
|
return self.doc[self.c.R(i, idx)]
|
||||||
|
|
||||||
def empty(self):
|
def empty(self):
|
||||||
return self.c.empty()
|
return self.c.empty()
|
||||||
|
|
||||||
|
@ -134,7 +132,7 @@ cdef class StateClass:
|
||||||
|
|
||||||
def at_break(self):
|
def at_break(self):
|
||||||
return False
|
return False
|
||||||
#return self.c.at_break()
|
# return self.c.at_break()
|
||||||
|
|
||||||
def has_head(self, int i):
|
def has_head(self, int i):
|
||||||
return self.c.has_head(i)
|
return self.c.has_head(i)
|
||||||
|
|
|
@ -20,11 +20,15 @@ cdef struct Transition:
|
||||||
int (*do)(StateC* state, attr_t label) nogil
|
int (*do)(StateC* state, attr_t label) nogil
|
||||||
|
|
||||||
|
|
||||||
ctypedef weight_t (*get_cost_func_t)(const StateC* state, const void* gold,
|
ctypedef weight_t (*get_cost_func_t)(
|
||||||
attr_t label) nogil
|
const StateC* state, const void* gold, attr_t label
|
||||||
ctypedef weight_t (*move_cost_func_t)(const StateC* state, const void* gold) nogil
|
) nogil
|
||||||
ctypedef weight_t (*label_cost_func_t)(const StateC* state, const void*
|
ctypedef weight_t (*move_cost_func_t)(
|
||||||
gold, attr_t label) nogil
|
const StateC* state, const void* gold
|
||||||
|
) nogil
|
||||||
|
ctypedef weight_t (*label_cost_func_t)(
|
||||||
|
const StateC* state, const void* gold, attr_t label
|
||||||
|
) nogil
|
||||||
|
|
||||||
ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil
|
ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil
|
||||||
|
|
||||||
|
|
|
@ -8,9 +8,7 @@ from collections import Counter
|
||||||
import srsly
|
import srsly
|
||||||
|
|
||||||
from ...structs cimport TokenC
|
from ...structs cimport TokenC
|
||||||
from ...tokens.doc cimport Doc
|
|
||||||
from ...typedefs cimport attr_t, weight_t
|
from ...typedefs cimport attr_t, weight_t
|
||||||
from . cimport _beam_utils
|
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
|
|
||||||
from ... import util
|
from ... import util
|
||||||
|
@ -231,7 +229,6 @@ cdef class TransitionSystem:
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_bytes(self, exclude=tuple()):
|
def to_bytes(self, exclude=tuple()):
|
||||||
transitions = []
|
|
||||||
serializers = {
|
serializers = {
|
||||||
'moves': lambda: srsly.json_dumps(self.labels),
|
'moves': lambda: srsly.json_dumps(self.labels),
|
||||||
'strings': lambda: self.strings.to_bytes(),
|
'strings': lambda: self.strings.to_bytes(),
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
# cython: infer_types=True, profile=True, binding=True
|
# cython: infer_types=True, profile=True, binding=True
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from typing import Callable, Iterable, Optional
|
from typing import Callable, Optional
|
||||||
|
|
||||||
from thinc.api import Config, Model
|
from thinc.api import Config, Model
|
||||||
|
|
||||||
|
@ -124,6 +124,7 @@ def make_parser(
|
||||||
scorer=scorer,
|
scorer=scorer,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@Language.factory(
|
@Language.factory(
|
||||||
"beam_parser",
|
"beam_parser",
|
||||||
assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
|
assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
|
||||||
|
|
|
@ -2,7 +2,6 @@
|
||||||
from itertools import islice
|
from itertools import islice
|
||||||
from typing import Callable, Dict, Optional, Union
|
from typing import Callable, Dict, Optional, Union
|
||||||
|
|
||||||
import srsly
|
|
||||||
from thinc.api import Config, Model, SequenceCategoricalCrossentropy
|
from thinc.api import Config, Model, SequenceCategoricalCrossentropy
|
||||||
|
|
||||||
from ..morphology cimport Morphology
|
from ..morphology cimport Morphology
|
||||||
|
@ -14,10 +13,8 @@ from ..errors import Errors
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..parts_of_speech import IDS as POS_IDS
|
from ..parts_of_speech import IDS as POS_IDS
|
||||||
from ..scorer import Scorer
|
from ..scorer import Scorer
|
||||||
from ..symbols import POS
|
|
||||||
from ..training import validate_examples, validate_get_examples
|
from ..training import validate_examples, validate_get_examples
|
||||||
from ..util import registry
|
from ..util import registry
|
||||||
from .pipe import deserialize_config
|
|
||||||
from .tagger import Tagger
|
from .tagger import Tagger
|
||||||
|
|
||||||
# See #9050
|
# See #9050
|
||||||
|
@ -76,8 +73,11 @@ def morphologizer_score(examples, **kwargs):
|
||||||
results = {}
|
results = {}
|
||||||
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
|
results.update(Scorer.score_token_attr(examples, "pos", **kwargs))
|
||||||
results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
|
results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs))
|
||||||
results.update(Scorer.score_token_attr_per_feat(examples,
|
results.update(
|
||||||
"morph", getter=morph_key_getter, **kwargs))
|
Scorer.score_token_attr_per_feat(
|
||||||
|
examples, "morph", getter=morph_key_getter, **kwargs
|
||||||
|
)
|
||||||
|
)
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
@ -233,7 +233,6 @@ class Morphologizer(Tagger):
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
cdef Vocab vocab = self.vocab
|
|
||||||
cdef bint overwrite = self.cfg["overwrite"]
|
cdef bint overwrite = self.cfg["overwrite"]
|
||||||
cdef bint extend = self.cfg["extend"]
|
cdef bint extend = self.cfg["extend"]
|
||||||
labels = self.labels
|
labels = self.labels
|
||||||
|
|
|
@ -4,13 +4,10 @@ from typing import Optional
|
||||||
import numpy
|
import numpy
|
||||||
from thinc.api import Config, CosineDistance, Model, set_dropout_rate, to_categorical
|
from thinc.api import Config, CosineDistance, Model, set_dropout_rate, to_categorical
|
||||||
|
|
||||||
from ..tokens.doc cimport Doc
|
from ..attrs import ID
|
||||||
|
|
||||||
from ..attrs import ID, POS
|
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..training import validate_examples
|
from ..training import validate_examples
|
||||||
from ._parser_internals import nonproj
|
|
||||||
from .tagger import Tagger
|
from .tagger import Tagger
|
||||||
from .trainable_pipe import TrainablePipe
|
from .trainable_pipe import TrainablePipe
|
||||||
|
|
||||||
|
@ -103,10 +100,9 @@ class MultitaskObjective(Tagger):
|
||||||
cdef int idx = 0
|
cdef int idx = 0
|
||||||
correct = numpy.zeros((scores.shape[0],), dtype="i")
|
correct = numpy.zeros((scores.shape[0],), dtype="i")
|
||||||
guesses = scores.argmax(axis=1)
|
guesses = scores.argmax(axis=1)
|
||||||
docs = [eg.predicted for eg in examples]
|
|
||||||
for i, eg in enumerate(examples):
|
for i, eg in enumerate(examples):
|
||||||
# Handles alignment for tokenization differences
|
# Handles alignment for tokenization differences
|
||||||
doc_annots = eg.get_aligned() # TODO
|
_doc_annots = eg.get_aligned() # TODO
|
||||||
for j in range(len(eg.predicted)):
|
for j in range(len(eg.predicted)):
|
||||||
tok_annots = {key: values[j] for key, values in tok_annots.items()}
|
tok_annots = {key: values[j] for key, values in tok_annots.items()}
|
||||||
label = self.make_label(j, tok_annots)
|
label = self.make_label(j, tok_annots)
|
||||||
|
@ -206,7 +202,6 @@ class ClozeMultitask(TrainablePipe):
|
||||||
losses[self.name] = 0.
|
losses[self.name] = 0.
|
||||||
set_dropout_rate(self.model, drop)
|
set_dropout_rate(self.model, drop)
|
||||||
validate_examples(examples, "ClozeMultitask.rehearse")
|
validate_examples(examples, "ClozeMultitask.rehearse")
|
||||||
docs = [eg.predicted for eg in examples]
|
|
||||||
predictions, bp_predictions = self.model.begin_update()
|
predictions, bp_predictions = self.model.begin_update()
|
||||||
loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
|
loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
|
||||||
bp_predictions(d_predictions)
|
bp_predictions(d_predictions)
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
# cython: infer_types=True, profile=True, binding=True
|
# cython: infer_types=True, profile=True, binding=True
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from typing import Callable, Iterable, Optional
|
from typing import Callable, Optional
|
||||||
|
|
||||||
from thinc.api import Config, Model
|
from thinc.api import Config, Model
|
||||||
|
|
||||||
|
@ -10,7 +10,7 @@ from ._parser_internals.ner cimport BiluoPushDown
|
||||||
from .transition_parser cimport Parser
|
from .transition_parser cimport Parser
|
||||||
|
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..scorer import PRFScore, get_ner_prf
|
from ..scorer import get_ner_prf
|
||||||
from ..training import remove_bilu_prefix
|
from ..training import remove_bilu_prefix
|
||||||
from ..util import registry
|
from ..util import registry
|
||||||
|
|
||||||
|
@ -100,6 +100,7 @@ def make_ner(
|
||||||
scorer=scorer,
|
scorer=scorer,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@Language.factory(
|
@Language.factory(
|
||||||
"beam_ner",
|
"beam_ner",
|
||||||
assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
|
assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
# cython: infer_types=True, profile=True, binding=True
|
# cython: infer_types=True, profile=True, binding=True
|
||||||
import warnings
|
import warnings
|
||||||
from typing import Callable, Dict, Iterable, Iterator, Optional, Tuple, Union
|
from typing import Callable, Dict, Iterable, Iterator, Tuple, Union
|
||||||
|
|
||||||
import srsly
|
import srsly
|
||||||
|
|
||||||
|
@ -40,7 +40,7 @@ cdef class Pipe:
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError(Errors.E931.format(parent="Pipe", method="__call__", name=self.name))
|
raise NotImplementedError(Errors.E931.format(parent="Pipe", method="__call__", name=self.name))
|
||||||
|
|
||||||
def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]:
|
def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
|
||||||
"""Apply the pipe to a stream of documents. This usually happens under
|
"""Apply the pipe to a stream of documents. This usually happens under
|
||||||
the hood when the nlp object is called on a text and all components are
|
the hood when the nlp object is called on a text and all components are
|
||||||
applied to the Doc.
|
applied to the Doc.
|
||||||
|
@ -59,7 +59,7 @@ cdef class Pipe:
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_handler(self.name, self, [doc], e)
|
error_handler(self.name, self, [doc], e)
|
||||||
|
|
||||||
def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language=None):
|
def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language = None):
|
||||||
"""Initialize the pipe. For non-trainable components, this method
|
"""Initialize the pipe. For non-trainable components, this method
|
||||||
is optional. For trainable components, which should inherit
|
is optional. For trainable components, which should inherit
|
||||||
from the subclass TrainablePipe, the provided data examples
|
from the subclass TrainablePipe, the provided data examples
|
||||||
|
|
|
@ -7,13 +7,13 @@ from ..tokens.doc cimport Doc
|
||||||
|
|
||||||
from .. import util
|
from .. import util
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..scorer import Scorer
|
|
||||||
from .pipe import Pipe
|
from .pipe import Pipe
|
||||||
from .senter import senter_score
|
from .senter import senter_score
|
||||||
|
|
||||||
# see #9050
|
# see #9050
|
||||||
BACKWARD_OVERWRITE = False
|
BACKWARD_OVERWRITE = False
|
||||||
|
|
||||||
|
|
||||||
@Language.factory(
|
@Language.factory(
|
||||||
"sentencizer",
|
"sentencizer",
|
||||||
assigns=["token.is_sent_start", "doc.sents"],
|
assigns=["token.is_sent_start", "doc.sents"],
|
||||||
|
@ -36,17 +36,19 @@ class Sentencizer(Pipe):
|
||||||
DOCS: https://spacy.io/api/sentencizer
|
DOCS: https://spacy.io/api/sentencizer
|
||||||
"""
|
"""
|
||||||
|
|
||||||
default_punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹',
|
default_punct_chars = [
|
||||||
'।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄',
|
'!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹',
|
||||||
'᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿',
|
'।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄',
|
||||||
'‼', '‽', '⁇', '⁈', '⁉', '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶',
|
'᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿',
|
||||||
'꡷', '꣎', '꣏', '꤯', '꧈', '꧉', '꩝', '꩞', '꩟', '꫰', '꫱', '꯫', '﹒',
|
'‼', '‽', '⁇', '⁈', '⁉', '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶',
|
||||||
'﹖', '﹗', '!', '.', '?', '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀',
|
'꡷', '꣎', '꣏', '꤯', '꧈', '꧉', '꩝', '꩞', '꩟', '꫰', '꫱', '꯫', '﹒',
|
||||||
'𑃁', '𑅁', '𑅂', '𑅃', '𑇅', '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼',
|
'﹖', '﹗', '!', '.', '?', '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀',
|
||||||
'𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐',
|
'𑃁', '𑅁', '𑅂', '𑅃', '𑇅', '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼',
|
||||||
'𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂',
|
'𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐',
|
||||||
'𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈',
|
'𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂',
|
||||||
'。', '。']
|
'𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈',
|
||||||
|
'。', '。'
|
||||||
|
]
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
@ -128,7 +130,6 @@ class Sentencizer(Pipe):
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
cdef int idx = 0
|
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
doc_tag_ids = batch_tag_ids[i]
|
doc_tag_ids = batch_tag_ids[i]
|
||||||
for j, tag_id in enumerate(doc_tag_ids):
|
for j, tag_id in enumerate(doc_tag_ids):
|
||||||
|
@ -169,7 +170,6 @@ class Sentencizer(Pipe):
|
||||||
path = path.with_suffix(".json")
|
path = path.with_suffix(".json")
|
||||||
srsly.write_json(path, {"punct_chars": list(self.punct_chars), "overwrite": self.overwrite})
|
srsly.write_json(path, {"punct_chars": list(self.punct_chars), "overwrite": self.overwrite})
|
||||||
|
|
||||||
|
|
||||||
def from_disk(self, path, *, exclude=tuple()):
|
def from_disk(self, path, *, exclude=tuple()):
|
||||||
"""Load the sentencizer from disk.
|
"""Load the sentencizer from disk.
|
||||||
|
|
||||||
|
|
|
@ -2,7 +2,6 @@
|
||||||
from itertools import islice
|
from itertools import islice
|
||||||
from typing import Callable, Optional
|
from typing import Callable, Optional
|
||||||
|
|
||||||
import srsly
|
|
||||||
from thinc.api import Config, Model, SequenceCategoricalCrossentropy
|
from thinc.api import Config, Model, SequenceCategoricalCrossentropy
|
||||||
|
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
|
|
|
@ -48,14 +48,14 @@ DEFAULT_SPAN_FINDER_MODEL = Config().from_str(span_finder_default_config)["model
|
||||||
"threshold": 0.5,
|
"threshold": 0.5,
|
||||||
"model": DEFAULT_SPAN_FINDER_MODEL,
|
"model": DEFAULT_SPAN_FINDER_MODEL,
|
||||||
"spans_key": DEFAULT_SPANS_KEY,
|
"spans_key": DEFAULT_SPANS_KEY,
|
||||||
"max_length": None,
|
"max_length": 25,
|
||||||
"min_length": None,
|
"min_length": None,
|
||||||
"scorer": {"@scorers": "spacy.span_finder_scorer.v1"},
|
"scorer": {"@scorers": "spacy.span_finder_scorer.v1"},
|
||||||
},
|
},
|
||||||
default_score_weights={
|
default_score_weights={
|
||||||
f"span_finder_{DEFAULT_SPANS_KEY}_f": 1.0,
|
f"spans_{DEFAULT_SPANS_KEY}_f": 1.0,
|
||||||
f"span_finder_{DEFAULT_SPANS_KEY}_p": 0.0,
|
f"spans_{DEFAULT_SPANS_KEY}_p": 0.0,
|
||||||
f"span_finder_{DEFAULT_SPANS_KEY}_r": 0.0,
|
f"spans_{DEFAULT_SPANS_KEY}_r": 0.0,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
def make_span_finder(
|
def make_span_finder(
|
||||||
|
@ -104,7 +104,7 @@ def make_span_finder_scorer():
|
||||||
|
|
||||||
def span_finder_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
def span_finder_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
||||||
kwargs = dict(kwargs)
|
kwargs = dict(kwargs)
|
||||||
attr_prefix = "span_finder_"
|
attr_prefix = "spans_"
|
||||||
key = kwargs["spans_key"]
|
key = kwargs["spans_key"]
|
||||||
kwargs.setdefault("attr", f"{attr_prefix}{key}")
|
kwargs.setdefault("attr", f"{attr_prefix}{key}")
|
||||||
kwargs.setdefault(
|
kwargs.setdefault(
|
||||||
|
|
|
@ -1,26 +1,18 @@
|
||||||
# cython: infer_types=True, profile=True, binding=True
|
# cython: infer_types=True, profile=True, binding=True
|
||||||
import warnings
|
|
||||||
from itertools import islice
|
from itertools import islice
|
||||||
from typing import Callable, Optional
|
from typing import Callable, Optional
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
import srsly
|
|
||||||
from thinc.api import Config, Model, SequenceCategoricalCrossentropy, set_dropout_rate
|
from thinc.api import Config, Model, SequenceCategoricalCrossentropy, set_dropout_rate
|
||||||
from thinc.types import Floats2d
|
|
||||||
|
|
||||||
from ..morphology cimport Morphology
|
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
from ..vocab cimport Vocab
|
|
||||||
|
|
||||||
from .. import util
|
from .. import util
|
||||||
from ..attrs import ID, POS
|
from ..errors import Errors
|
||||||
from ..errors import Errors, Warnings
|
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..parts_of_speech import X
|
|
||||||
from ..scorer import Scorer
|
from ..scorer import Scorer
|
||||||
from ..training import validate_examples, validate_get_examples
|
from ..training import validate_examples, validate_get_examples
|
||||||
from ..util import registry
|
from ..util import registry
|
||||||
from .pipe import deserialize_config
|
|
||||||
from .trainable_pipe import TrainablePipe
|
from .trainable_pipe import TrainablePipe
|
||||||
|
|
||||||
# See #9050
|
# See #9050
|
||||||
|
@ -169,7 +161,6 @@ class Tagger(TrainablePipe):
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
cdef Vocab vocab = self.vocab
|
|
||||||
cdef bint overwrite = self.cfg["overwrite"]
|
cdef bint overwrite = self.cfg["overwrite"]
|
||||||
labels = self.labels
|
labels = self.labels
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
|
|
|
@ -55,7 +55,7 @@ cdef class TrainablePipe(Pipe):
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_handler(self.name, self, [doc], e)
|
error_handler(self.name, self, [doc], e)
|
||||||
|
|
||||||
def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]:
|
def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
|
||||||
"""Apply the pipe to a stream of documents. This usually happens under
|
"""Apply the pipe to a stream of documents. This usually happens under
|
||||||
the hood when the nlp object is called on a text and all components are
|
the hood when the nlp object is called on a text and all components are
|
||||||
applied to the Doc.
|
applied to the Doc.
|
||||||
|
@ -102,9 +102,9 @@ cdef class TrainablePipe(Pipe):
|
||||||
def update(self,
|
def update(self,
|
||||||
examples: Iterable["Example"],
|
examples: Iterable["Example"],
|
||||||
*,
|
*,
|
||||||
drop: float=0.0,
|
drop: float = 0.0,
|
||||||
sgd: Optimizer=None,
|
sgd: Optimizer = None,
|
||||||
losses: Optional[Dict[str, float]]=None) -> Dict[str, float]:
|
losses: Optional[Dict[str, float]] = None) -> Dict[str, float]:
|
||||||
"""Learn from a batch of documents and gold-standard information,
|
"""Learn from a batch of documents and gold-standard information,
|
||||||
updating the pipe's model. Delegates to predict and get_loss.
|
updating the pipe's model. Delegates to predict and get_loss.
|
||||||
|
|
||||||
|
@ -138,8 +138,8 @@ cdef class TrainablePipe(Pipe):
|
||||||
def rehearse(self,
|
def rehearse(self,
|
||||||
examples: Iterable[Example],
|
examples: Iterable[Example],
|
||||||
*,
|
*,
|
||||||
sgd: Optimizer=None,
|
sgd: Optimizer = None,
|
||||||
losses: Dict[str, float]=None,
|
losses: Dict[str, float] = None,
|
||||||
**config) -> Dict[str, float]:
|
**config) -> Dict[str, float]:
|
||||||
"""Perform a "rehearsal" update from a batch of data. Rehearsal updates
|
"""Perform a "rehearsal" update from a batch of data. Rehearsal updates
|
||||||
teach the current model to make predictions similar to an initial model,
|
teach the current model to make predictions similar to an initial model,
|
||||||
|
@ -177,7 +177,7 @@ cdef class TrainablePipe(Pipe):
|
||||||
"""
|
"""
|
||||||
return util.create_default_optimizer()
|
return util.create_default_optimizer()
|
||||||
|
|
||||||
def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language=None):
|
def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language = None):
|
||||||
"""Initialize the pipe for training, using data examples if available.
|
"""Initialize the pipe for training, using data examples if available.
|
||||||
This method needs to be implemented by each TrainablePipe component,
|
This method needs to be implemented by each TrainablePipe component,
|
||||||
ensuring the internal model (if available) is initialized properly
|
ensuring the internal model (if available) is initialized properly
|
||||||
|
|
|
@ -13,8 +13,18 @@ cdef class Parser(TrainablePipe):
|
||||||
cdef readonly TransitionSystem moves
|
cdef readonly TransitionSystem moves
|
||||||
cdef public object _multitasks
|
cdef public object _multitasks
|
||||||
|
|
||||||
cdef void _parseC(self, CBlas cblas, StateC** states,
|
cdef void _parseC(
|
||||||
WeightsC weights, SizesC sizes) nogil
|
self,
|
||||||
|
CBlas cblas,
|
||||||
|
StateC** states,
|
||||||
|
WeightsC weights,
|
||||||
|
SizesC sizes
|
||||||
|
) nogil
|
||||||
|
|
||||||
cdef void c_transition_batch(self, StateC** states, const float* scores,
|
cdef void c_transition_batch(
|
||||||
int nr_class, int batch_size) nogil
|
self,
|
||||||
|
StateC** states,
|
||||||
|
const float* scores,
|
||||||
|
int nr_class,
|
||||||
|
int batch_size
|
||||||
|
) nogil
|
||||||
|
|
|
@ -7,20 +7,15 @@ from cymem.cymem cimport Pool
|
||||||
from itertools import islice
|
from itertools import islice
|
||||||
|
|
||||||
from libc.stdlib cimport calloc, free
|
from libc.stdlib cimport calloc, free
|
||||||
from libc.string cimport memcpy, memset
|
from libc.string cimport memset
|
||||||
from libcpp.vector cimport vector
|
from libcpp.vector cimport vector
|
||||||
|
|
||||||
import random
|
import random
|
||||||
|
|
||||||
import srsly
|
|
||||||
from thinc.api import CupyOps, NumpyOps, get_ops, set_dropout_rate
|
|
||||||
|
|
||||||
from thinc.extra.search cimport Beam
|
|
||||||
|
|
||||||
import warnings
|
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
import numpy.random
|
import numpy.random
|
||||||
|
import srsly
|
||||||
|
from thinc.api import CupyOps, NumpyOps, set_dropout_rate
|
||||||
|
|
||||||
from ..ml.parser_model cimport (
|
from ..ml.parser_model cimport (
|
||||||
ActivationsC,
|
ActivationsC,
|
||||||
|
@ -42,7 +37,7 @@ from .trainable_pipe import TrainablePipe
|
||||||
from ._parser_internals cimport _beam_utils
|
from ._parser_internals cimport _beam_utils
|
||||||
|
|
||||||
from .. import util
|
from .. import util
|
||||||
from ..errors import Errors, Warnings
|
from ..errors import Errors
|
||||||
from ..training import validate_examples, validate_get_examples
|
from ..training import validate_examples, validate_get_examples
|
||||||
from ._parser_internals import _beam_utils
|
from ._parser_internals import _beam_utils
|
||||||
|
|
||||||
|
@ -258,7 +253,6 @@ cdef class Parser(TrainablePipe):
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_handler(self.name, self, batch_in_order, e)
|
error_handler(self.name, self, batch_in_order, e)
|
||||||
|
|
||||||
|
|
||||||
def predict(self, docs):
|
def predict(self, docs):
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
|
@ -300,8 +294,6 @@ cdef class Parser(TrainablePipe):
|
||||||
return batch
|
return batch
|
||||||
|
|
||||||
def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.):
|
def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.):
|
||||||
cdef Beam beam
|
|
||||||
cdef Doc doc
|
|
||||||
self._ensure_labels_are_added(docs)
|
self._ensure_labels_are_added(docs)
|
||||||
batch = _beam_utils.BeamBatch(
|
batch = _beam_utils.BeamBatch(
|
||||||
self.moves,
|
self.moves,
|
||||||
|
@@ -321,16 +313,18 @@ cdef class Parser(TrainablePipe):
         del model
         return list(batch)
 
-    cdef void _parseC(self, CBlas cblas, StateC** states,
-                      WeightsC weights, SizesC sizes) nogil:
-        cdef int i, j
+    cdef void _parseC(
+        self, CBlas cblas, StateC** states, WeightsC weights, SizesC sizes
+    ) nogil:
+        cdef int i
         cdef vector[StateC*] unfinished
         cdef ActivationsC activations = alloc_activations(sizes)
         while sizes.states >= 1:
             predict_states(cblas, &activations, states, &weights, sizes)
             # Validate actions, argmax, take action.
-            self.c_transition_batch(states,
-                activations.scores, sizes.classes, sizes.states)
+            self.c_transition_batch(
+                states, activations.scores, sizes.classes, sizes.states
+            )
             for i in range(sizes.states):
                 if not states[i].is_final():
                     unfinished.push_back(states[i])
@@ -342,7 +336,6 @@ cdef class Parser(TrainablePipe):
 
     def set_annotations(self, docs, states_or_beams):
         cdef StateClass state
-        cdef Beam beam
         cdef Doc doc
         states = _beam_utils.collect_states(states_or_beams, docs)
         for i, (state, doc) in enumerate(zip(states, docs)):
@@ -359,8 +352,13 @@ cdef class Parser(TrainablePipe):
             self.c_transition_batch(&c_states[0], c_scores, scores.shape[1], scores.shape[0])
         return [state for state in states if not state.c.is_final()]
 
-    cdef void c_transition_batch(self, StateC** states, const float* scores,
-                                 int nr_class, int batch_size) nogil:
+    cdef void c_transition_batch(
+        self,
+        StateC** states,
+        const float* scores,
+        int nr_class,
+        int batch_size
+    ) nogil:
         # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
         with gil:
             assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
@@ -380,7 +378,6 @@ cdef class Parser(TrainablePipe):
         free(is_valid)
 
     def update(self, examples, *, drop=0., sgd=None, losses=None):
-        cdef StateClass state
         if losses is None:
             losses = {}
         losses.setdefault(self.name, 0.)
@@ -419,8 +416,7 @@ cdef class Parser(TrainablePipe):
         if not states:
             return losses
         model, backprop_tok2vec = self.model.begin_update([eg.x for eg in examples])
 
-        all_states = list(states)
         states_golds = list(zip(states, golds))
         n_moves = 0
         while states_golds:
@@ -500,8 +496,16 @@ cdef class Parser(TrainablePipe):
             del tutor
         return losses
 
-    def update_beam(self, examples, *, beam_width,
-                    drop=0., sgd=None, losses=None, beam_density=0.0):
+    def update_beam(
+        self,
+        examples,
+        *,
+        beam_width,
+        drop=0.,
+        sgd=None,
+        losses=None,
+        beam_density=0.0
+    ):
         states, golds, _ = self.moves.init_gold_batch(examples)
         if not states:
             return losses
@@ -531,8 +535,9 @@ cdef class Parser(TrainablePipe):
 
         is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
         costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
-        cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
-                                               dtype='f', order='C')
+        cdef np.ndarray d_scores = numpy.zeros(
+            (len(states), self.moves.n_moves), dtype='f', order='C'
+        )
         c_d_scores = <float*>d_scores.data
         unseen_classes = self.model.attrs["unseen_classes"]
         for i, (state, gold) in enumerate(zip(states, golds)):
@@ -542,8 +547,9 @@ cdef class Parser(TrainablePipe):
             for j in range(self.moves.n_moves):
                 if costs[j] <= 0.0 and j in unseen_classes:
                     unseen_classes.remove(j)
-            cpu_log_loss(c_d_scores,
-                         costs, is_valid, &scores[i, 0], d_scores.shape[1])
+            cpu_log_loss(
+                c_d_scores, costs, is_valid, &scores[i, 0], d_scores.shape[1]
+            )
             c_d_scores += d_scores.shape[1]
         # Note that we don't normalize this. See comment in update() for why.
         if losses is not None:
@@ -2,7 +2,6 @@
 cimport cython
 from libc.stdint cimport uint32_t
 from libc.string cimport memcpy
-from libcpp.set cimport set
 from murmurhash.mrmr cimport hash32, hash64
 
 import srsly
@@ -20,9 +19,10 @@ cdef inline bint _try_coerce_to_hash(object key, hash_t* out_hash):
     try:
         out_hash[0] = key
        return True
-    except:
+    except:  # no-cython-lint
        return False
 
+
 def get_string_id(key):
     """Get a string ID, handling the reserved symbols correctly. If the key is
     already an ID, return it.
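For context on the helper being linted here, a minimal sketch of how `get_string_id` behaves for strings and for values that are already hash IDs:

```python
from spacy.strings import get_string_id

# Strings are coerced to 64-bit hash IDs; integers that already look like
# IDs are passed through unchanged, which is what _try_coerce_to_hash guards.
apple_id = get_string_id("apple")
print(apple_id)
print(get_string_id(apple_id))  # same value back
```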
@ -87,7 +87,6 @@ cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) e
|
||||||
cdef int n_length_bytes
|
cdef int n_length_bytes
|
||||||
cdef int i
|
cdef int i
|
||||||
cdef Utf8Str* string = <Utf8Str*>mem.alloc(1, sizeof(Utf8Str))
|
cdef Utf8Str* string = <Utf8Str*>mem.alloc(1, sizeof(Utf8Str))
|
||||||
cdef uint32_t ulength = length
|
|
||||||
if length < sizeof(string.s):
|
if length < sizeof(string.s):
|
||||||
string.s[0] = <unsigned char>length
|
string.s[0] = <unsigned char>length
|
||||||
memcpy(&string.s[1], chars, length)
|
memcpy(&string.s[1], chars, length)
|
||||||
|
|
|
@ -52,7 +52,7 @@ cdef struct TokenC:
|
||||||
|
|
||||||
int sent_start
|
int sent_start
|
||||||
int ent_iob
|
int ent_iob
|
||||||
attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
|
attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
|
||||||
attr_t ent_kb_id
|
attr_t ent_kb_id
|
||||||
hash_t ent_id
|
hash_t ent_id
|
||||||
|
|
||||||
|
|
|
@ -92,7 +92,7 @@ cdef enum symbol_t:
|
||||||
ADV
|
ADV
|
||||||
AUX
|
AUX
|
||||||
CONJ
|
CONJ
|
||||||
CCONJ # U20
|
CCONJ # U20
|
||||||
DET
|
DET
|
||||||
INTJ
|
INTJ
|
||||||
NOUN
|
NOUN
|
||||||
|
@ -418,7 +418,7 @@ cdef enum symbol_t:
|
||||||
ccomp
|
ccomp
|
||||||
complm
|
complm
|
||||||
conj
|
conj
|
||||||
cop # U20
|
cop # U20
|
||||||
csubj
|
csubj
|
||||||
csubjpass
|
csubjpass
|
||||||
dep
|
dep
|
||||||
|
@ -441,8 +441,8 @@ cdef enum symbol_t:
|
||||||
num
|
num
|
||||||
number
|
number
|
||||||
oprd
|
oprd
|
||||||
obj # U20
|
obj # U20
|
||||||
obl # U20
|
obl # U20
|
||||||
parataxis
|
parataxis
|
||||||
partmod
|
partmod
|
||||||
pcomp
|
pcomp
|
||||||
|
|
|
@ -96,7 +96,7 @@ IDS = {
|
||||||
"ADV": ADV,
|
"ADV": ADV,
|
||||||
"AUX": AUX,
|
"AUX": AUX,
|
||||||
"CONJ": CONJ,
|
"CONJ": CONJ,
|
||||||
"CCONJ": CCONJ, # U20
|
"CCONJ": CCONJ, # U20
|
||||||
"DET": DET,
|
"DET": DET,
|
||||||
"INTJ": INTJ,
|
"INTJ": INTJ,
|
||||||
"NOUN": NOUN,
|
"NOUN": NOUN,
|
||||||
|
@ -421,7 +421,7 @@ IDS = {
|
||||||
"ccomp": ccomp,
|
"ccomp": ccomp,
|
||||||
"complm": complm,
|
"complm": complm,
|
||||||
"conj": conj,
|
"conj": conj,
|
||||||
"cop": cop, # U20
|
"cop": cop, # U20
|
||||||
"csubj": csubj,
|
"csubj": csubj,
|
||||||
"csubjpass": csubjpass,
|
"csubjpass": csubjpass,
|
||||||
"dep": dep,
|
"dep": dep,
|
||||||
|
@ -444,8 +444,8 @@ IDS = {
|
||||||
"num": num,
|
"num": num,
|
||||||
"number": number,
|
"number": number,
|
||||||
"oprd": oprd,
|
"oprd": oprd,
|
||||||
"obj": obj, # U20
|
"obj": obj, # U20
|
||||||
"obl": obl, # U20
|
"obl": obl, # U20
|
||||||
"parataxis": parataxis,
|
"parataxis": parataxis,
|
||||||
"partmod": partmod,
|
"partmod": partmod,
|
||||||
"pcomp": pcomp,
|
"pcomp": pcomp,
|
||||||
|
|
|
@@ -52,7 +52,8 @@ TEST_PATTERNS = [
 
 
 @pytest.mark.parametrize(
-    "pattern", [[{"XX": "y"}, {"LENGTH": "2"}, {"TEXT": {"IN": 5}}]]
+    "pattern",
+    [[{"XX": "y"}], [{"LENGTH": "2"}], [{"TEXT": {"IN": 5}}], [{"text": {"in": 6}}]],
 )
 def test_matcher_pattern_validation(en_vocab, pattern):
     matcher = Matcher(en_vocab, validate=True)
@@ -12,6 +12,7 @@ def test_build_dependencies():
         "flake8",
         "hypothesis",
         "pre-commit",
+        "cython-lint",
         "black",
         "isort",
         "mypy",
@@ -230,10 +230,10 @@ def test_overfitting_IO():
 
     # Test scoring
     scores = nlp.evaluate(train_examples)
-    assert f"span_finder_{SPANS_KEY}_f" in scores
+    assert f"spans_{SPANS_KEY}_f" in scores
     # It's not perfect 1.0 F1 because it's designed to overgenerate for now.
-    assert scores[f"span_finder_{SPANS_KEY}_p"] == 0.75
-    assert scores[f"span_finder_{SPANS_KEY}_r"] == 1.0
+    assert scores[f"spans_{SPANS_KEY}_p"] == 0.75
+    assert scores[f"spans_{SPANS_KEY}_r"] == 1.0
 
     # also test that the spancat works for just a single entity in a sentence
     doc = nlp("London")
@@ -192,8 +192,7 @@ def test_tok2vec_listener(with_vectors):
         for tag in t[1]["tags"]:
             tagger.add_label(tag)
 
-    # Check that the Tok2Vec component finds it listeners
-    assert tok2vec.listeners == []
+    # Check that the Tok2Vec component finds its listeners
     optimizer = nlp.initialize(lambda: train_examples)
     assert tok2vec.listeners == [tagger_tok2vec]
 
@@ -221,7 +220,6 @@ def test_tok2vec_listener_callback():
     assert nlp.pipe_names == ["tok2vec", "tagger"]
     tagger = nlp.get_pipe("tagger")
     tok2vec = nlp.get_pipe("tok2vec")
-    nlp._link_components()
     docs = [nlp.make_doc("A random sentence")]
     tok2vec.model.initialize(X=docs)
     gold_array = [[1.0 for tag in ["V", "Z"]] for word in docs]
@@ -430,29 +428,46 @@ def test_replace_listeners_from_config():
         nlp.to_disk(dir_path)
         base_model = str(dir_path)
         new_config = {
-            "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger", "ner"]},
+            "nlp": {
+                "lang": "en",
+                "pipeline": ["tok2vec", "tagger2", "ner3", "tagger4"],
+            },
             "components": {
                 "tok2vec": {"source": base_model},
-                "tagger": {
+                "tagger2": {
                     "source": base_model,
+                    "component": "tagger",
                     "replace_listeners": ["model.tok2vec"],
                 },
-                "ner": {"source": base_model},
+                "ner3": {
+                    "source": base_model,
+                    "component": "ner",
+                },
+                "tagger4": {
+                    "source": base_model,
+                    "component": "tagger",
+                },
             },
         }
         new_nlp = util.load_model_from_config(new_config, auto_fill=True)
         new_nlp.initialize(lambda: examples)
         tok2vec = new_nlp.get_pipe("tok2vec")
-        tagger = new_nlp.get_pipe("tagger")
-        ner = new_nlp.get_pipe("ner")
-        assert tok2vec.listening_components == ["ner"]
+        tagger = new_nlp.get_pipe("tagger2")
+        ner = new_nlp.get_pipe("ner3")
+        assert "ner" not in new_nlp.pipe_names
+        assert "tagger" not in new_nlp.pipe_names
+        assert tok2vec.listening_components == ["ner3", "tagger4"]
         assert any(isinstance(node, Tok2VecListener) for node in ner.model.walk())
         assert not any(isinstance(node, Tok2VecListener) for node in tagger.model.walk())
         t2v_cfg = new_nlp.config["components"]["tok2vec"]["model"]
         assert t2v_cfg["@architectures"] == "spacy.Tok2Vec.v2"
-        assert new_nlp.config["components"]["tagger"]["model"]["tok2vec"] == t2v_cfg
+        assert new_nlp.config["components"]["tagger2"]["model"]["tok2vec"] == t2v_cfg
         assert (
-            new_nlp.config["components"]["ner"]["model"]["tok2vec"]["@architectures"]
+            new_nlp.config["components"]["ner3"]["model"]["tok2vec"]["@architectures"]
+            == "spacy.Tok2VecListener.v1"
+        )
+        assert (
+            new_nlp.config["components"]["tagger4"]["model"]["tok2vec"]["@architectures"]
             == "spacy.Tok2VecListener.v1"
         )
 
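The reworked test above leans on the `component` setting when sourcing a pipe under a new name. A rough standalone sketch of the same mechanism, assuming a previously trained pipeline saved at a hypothetical `./base_model` path that contains a `tagger`:

```python
from spacy import util

# Hypothetical path to a saved pipeline that contains a "tagger" component.
base_model = "./base_model"

config = {
    "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger2"]},
    "components": {
        "tok2vec": {"source": base_model},
        # Source the existing "tagger" but register it as "tagger2";
        # "component" names the pipe inside the source pipeline.
        "tagger2": {"source": base_model, "component": "tagger"},
    },
}
nlp = util.load_model_from_config(config, auto_fill=True)
print(nlp.pipe_names)  # ['tok2vec', 'tagger2']
```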
@@ -544,3 +559,57 @@ def test_tok2vec_listeners_textcat():
     assert cats1["imperative"] < 0.9
     assert [t.tag_ for t in docs[0]] == ["V", "J", "N"]
     assert [t.tag_ for t in docs[1]] == ["N", "V", "J", "N"]
+
+
+def test_tok2vec_listener_source_link_name():
+    """The component's internal name and the tok2vec listener map correspond
+    to the most recently modified pipeline.
+    """
+    orig_config = Config().from_str(cfg_string_multi)
+    nlp1 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+    assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
+
+    nlp2 = English()
+    nlp2.add_pipe("tok2vec", source=nlp1)
+    nlp2.add_pipe("tagger", name="tagger2", source=nlp1)
+
+    # there is no way to have the component have the right name for both
+    # pipelines, right now the most recently modified pipeline is prioritized
+    assert nlp1.get_pipe("tagger").name == nlp2.get_pipe("tagger2").name == "tagger2"
+
+    # there is no way to have the tok2vec have the right listener map for both
+    # pipelines, right now the most recently modified pipeline is prioritized
+    assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2"]
+    nlp2.add_pipe("ner", name="ner3", source=nlp1)
+    assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2", "ner3"]
+    nlp2.remove_pipe("ner3")
+    assert nlp2.get_pipe("tok2vec").listening_components == ["tagger2"]
+    nlp2.remove_pipe("tagger2")
+    assert nlp2.get_pipe("tok2vec").listening_components == []
+
+    # at this point the tok2vec component corresponds to nlp2
+    assert nlp1.get_pipe("tok2vec").listening_components == []
+
+    # modifying the nlp1 pipeline syncs the tok2vec listener map back to nlp1
+    nlp1.add_pipe("sentencizer")
+    assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
+
+    # modifying nlp2 syncs it back to nlp2
+    nlp2.add_pipe("sentencizer")
+    assert nlp1.get_pipe("tok2vec").listening_components == []
+
+
+def test_tok2vec_listener_source_replace_listeners():
+    orig_config = Config().from_str(cfg_string_multi)
+    nlp1 = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+    assert nlp1.get_pipe("tok2vec").listening_components == ["tagger", "ner"]
+    nlp1.replace_listeners("tok2vec", "tagger", ["model.tok2vec"])
+    assert nlp1.get_pipe("tok2vec").listening_components == ["ner"]
+
+    nlp2 = English()
+    nlp2.add_pipe("tok2vec", source=nlp1)
+    assert nlp2.get_pipe("tok2vec").listening_components == []
+    nlp2.add_pipe("tagger", source=nlp1)
+    assert nlp2.get_pipe("tok2vec").listening_components == []
+    nlp2.add_pipe("ner", name="ner2", source=nlp1)
+    assert nlp2.get_pipe("tok2vec").listening_components == ["ner2"]
@@ -13,6 +13,7 @@ from spacy.ml.models import (
     build_Tok2Vec_model,
 )
 from spacy.schemas import ConfigSchema, ConfigSchemaPretrain
+from spacy.training import Example
 from spacy.util import (
     load_config,
     load_config_from_str,
@@ -422,6 +423,55 @@ def test_config_overrides():
     assert nlp.pipe_names == ["tok2vec", "tagger"]
 
 
+@pytest.mark.filterwarnings("ignore:\\[W036")
+def test_config_overrides_registered_functions():
+    nlp = spacy.blank("en")
+    nlp.add_pipe("attribute_ruler")
+    with make_tempdir() as d:
+        nlp.to_disk(d)
+        nlp_re1 = spacy.load(
+            d,
+            config={
+                "components": {
+                    "attribute_ruler": {
+                        "scorer": {"@scorers": "spacy.tagger_scorer.v1"}
+                    }
+                }
+            },
+        )
+        assert (
+            nlp_re1.config["components"]["attribute_ruler"]["scorer"]["@scorers"]
+            == "spacy.tagger_scorer.v1"
+        )
+
+        @registry.misc("test_some_other_key")
+        def misc_some_other_key():
+            return "some_other_key"
+
+        nlp_re2 = spacy.load(
+            d,
+            config={
+                "components": {
+                    "attribute_ruler": {
+                        "scorer": {
+                            "@scorers": "spacy.overlapping_labeled_spans_scorer.v1",
+                            "spans_key": {"@misc": "test_some_other_key"},
+                        }
+                    }
+                }
+            },
+        )
+        assert nlp_re2.config["components"]["attribute_ruler"]["scorer"][
+            "spans_key"
+        ] == {"@misc": "test_some_other_key"}
+        # run dummy evaluation (will return None scores) in order to test that
+        # the spans_key value in the nested override is working as intended in
+        # the config
+        example = Example.from_dict(nlp_re2.make_doc("a b c"), {})
+        scores = nlp_re2.evaluate([example])
+        assert "spans_some_other_key_f" in scores
+
+
 def test_config_interpolation():
     config = Config().from_str(nlp_config_string, interpolate=False)
     assert config["corpora"]["train"]["path"] == "${paths.train}"
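The new test drives this through the `config` argument of `spacy.load`; a minimal sketch of the same override outside pytest, using a hypothetical `./my_pipeline` output directory:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("attribute_ruler")
nlp.to_disk("./my_pipeline")  # hypothetical output directory

# Swap the component's registered scorer when loading the pipeline back.
nlp_re = spacy.load(
    "./my_pipeline",
    config={
        "components": {
            "attribute_ruler": {"scorer": {"@scorers": "spacy.tagger_scorer.v1"}}
        }
    },
)
print(nlp_re.config["components"]["attribute_ruler"]["scorer"])
```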
@@ -697,7 +697,6 @@ def test_string_to_list_intify(value):
     assert string_to_list(value, intify=True) == [1, 2, 3]
 
 
-@pytest.mark.skip(reason="Temporarily skip before models are published")
 def test_download_compatibility():
     spec = SpecifierSet("==" + about.__version__)
     spec.prereleases = False
@@ -708,7 +707,6 @@ def test_download_compatibility():
     assert get_minor_version(about.__version__) == get_minor_version(version)
 
 
-@pytest.mark.skip(reason="Temporarily skip before models are published")
 def test_validate_compatibility_table():
     spec = SpecifierSet("==" + about.__version__)
     spec.prereleases = False
@@ -377,3 +377,22 @@ def test_displacy_manual_sorted_entities():
 
     html = displacy.render(doc, style="ent", manual=True)
     assert html.find("FIRST") < html.find("SECOND")
+
+
+@pytest.mark.issue(12816)
+def test_issue12816(en_vocab) -> None:
+    """Test that displaCy's span visualizer escapes annotated HTML tags correctly."""
+    # Create a doc containing an annotated word and an unannotated HTML tag
+    doc = Doc(en_vocab, words=["test", "<TEST>"])
+    doc.spans["sc"] = [Span(doc, 0, 1, label="test")]
+
+    # Verify that the HTML tag is escaped when unannotated
+    html = displacy.render(doc, style="span")
+    assert "&lt;TEST&gt;" in html
+
+    # Annotate the HTML tag
+    doc.spans["sc"].append(Span(doc, 1, 2, label="test"))
+
+    # Verify that the HTML tag is still escaped
+    html = displacy.render(doc, style="span")
+    assert "&lt;TEST&gt;" in html
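For convenience, the escaping behaviour pinned down by the new test can be reproduced with a blank pipeline instead of the `en_vocab` fixture; a minimal sketch:

```python
import spacy
from spacy import displacy
from spacy.tokens import Doc, Span

nlp = spacy.blank("en")
doc = Doc(nlp.vocab, words=["test", "<TEST>"])
doc.spans["sc"] = [Span(doc, 0, 1, label="test")]

# The span visualizer escapes token text, so the raw tag only appears
# in its HTML-escaped form in the rendered markup.
html = displacy.render(doc, style="span")
print("&lt;TEST&gt;" in html)  # True
```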
@@ -252,6 +252,10 @@ def test_minor_version(a1, a2, b1, b2, is_match):
             {"training.batch_size": 128, "training.optimizer.learn_rate": 0.01},
             {"training": {"batch_size": 128, "optimizer": {"learn_rate": 0.01}}},
         ),
+        (
+            {"attribute_ruler.scorer.@scorers": "spacy.tagger_scorer.v1"},
+            {"attribute_ruler": {"scorer": {"@scorers": "spacy.tagger_scorer.v1"}}},
+        ),
     ],
 )
 def test_dot_to_dict(dot_notation, expected):
@@ -260,6 +264,29 @@ def test_dot_to_dict(dot_notation, expected):
     assert util.dict_to_dot(result) == dot_notation
 
 
+@pytest.mark.parametrize(
+    "dot_notation,expected",
+    [
+        (
+            {"token.pos": True, "token._.xyz": True},
+            {"token": {"pos": True, "_": {"xyz": True}}},
+        ),
+        (
+            {"training.batch_size": 128, "training.optimizer.learn_rate": 0.01},
+            {"training": {"batch_size": 128, "optimizer": {"learn_rate": 0.01}}},
+        ),
+        (
+            {"attribute_ruler.scorer": {"@scorers": "spacy.tagger_scorer.v1"}},
+            {"attribute_ruler": {"scorer": {"@scorers": "spacy.tagger_scorer.v1"}}},
+        ),
+    ],
+)
+def test_dot_to_dict_overrides(dot_notation, expected):
+    result = util.dot_to_dict(dot_notation)
+    assert result == expected
+    assert util.dict_to_dot(result, for_overrides=True) == dot_notation
+
+
 def test_set_dot_to_object():
     config = {"foo": {"bar": 1, "baz": {"x": "y"}}, "test": {"a": {"b": "c"}}}
     with pytest.raises(KeyError):
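The `for_overrides` flag exercised above can be tried directly; a small sketch using the same values as the parametrized cases:

```python
from spacy import util

dot = {"attribute_ruler.scorer": {"@scorers": "spacy.tagger_scorer.v1"}}
nested = util.dot_to_dict(dot)
print(nested)
# {'attribute_ruler': {'scorer': {'@scorers': 'spacy.tagger_scorer.v1'}}}

# The earlier parametrized case shows that without for_overrides the dotted
# form descends all the way to "attribute_ruler.scorer.@scorers"; with
# for_overrides=True the registry block is kept whole, so the round trip
# returns the original override keys.
assert util.dict_to_dot(nested, for_overrides=True) == dot
```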
@@ -402,6 +402,7 @@ def test_vectors_serialize():
     row_r = v_r.add("D", vector=OPS.asarray([10, 20, 30, 40], dtype="f"))
     assert row == row_r
     assert_equal(OPS.to_numpy(v.data), OPS.to_numpy(v_r.data))
+    assert v.attr == v_r.attr
 
 
 def test_vector_is_oov():
@@ -646,3 +647,32 @@ def test_equality():
     vectors1.resize((5, 9))
     vectors2.resize((5, 9))
     assert vectors1 == vectors2
+
+
+def test_vectors_attr():
+    data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f")
+    # default ORTH
+    nlp = English()
+    nlp.vocab.vectors = Vectors(data=data, keys=["A", "B", "C"])
+    assert nlp.vocab.strings["A"] in nlp.vocab.vectors.key2row
+    assert nlp.vocab.strings["a"] not in nlp.vocab.vectors.key2row
+    assert nlp.vocab["A"].has_vector is True
+    assert nlp.vocab["a"].has_vector is False
+    assert nlp("A")[0].has_vector is True
+    assert nlp("a")[0].has_vector is False
+
+    # custom LOWER
+    nlp = English()
+    nlp.vocab.vectors = Vectors(data=data, keys=["a", "b", "c"], attr="LOWER")
+    assert nlp.vocab.strings["A"] not in nlp.vocab.vectors.key2row
+    assert nlp.vocab.strings["a"] in nlp.vocab.vectors.key2row
+    assert nlp.vocab["A"].has_vector is True
+    assert nlp.vocab["a"].has_vector is True
+    assert nlp("A")[0].has_vector is True
+    assert nlp("a")[0].has_vector is True
+    # add a new vectors entry
+    assert nlp.vocab["D"].has_vector is False
+    assert nlp.vocab["d"].has_vector is False
+    nlp.vocab.set_vector("D", numpy.asarray([4, 5, 6]))
+    assert nlp.vocab["D"].has_vector is True
+    assert nlp.vocab["d"].has_vector is True
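The `attr` behaviour of `Vectors` shown in this test can be condensed into a short sketch:

```python
import numpy
from spacy.lang.en import English
from spacy.vectors import Vectors

data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f")

nlp = English()
# Keys are matched on the lowercase form instead of the default ORTH.
nlp.vocab.vectors = Vectors(data=data, keys=["a", "b", "c"], attr="LOWER")

print(nlp.vocab["A"].has_vector, nlp.vocab["a"].has_vector)  # True True
```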
@@ -31,24 +31,58 @@ cdef class Tokenizer:
 
     cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
     cdef int _apply_special_cases(self, Doc doc) except -1
-    cdef void _filter_special_spans(self, vector[SpanC] &original,
-                                    vector[SpanC] &filtered, int doc_len) nogil
-    cdef object _prepare_special_spans(self, Doc doc,
-                                       vector[SpanC] &filtered)
-    cdef int _retokenize_special_spans(self, Doc doc, TokenC* tokens,
-                                       object span_data)
-    cdef int _try_specials_and_cache(self, hash_t key, Doc tokens,
-                                     int* has_special,
-                                     bint with_special_cases) except -1
-    cdef int _tokenize(self, Doc tokens, str span, hash_t key,
-                       int* has_special, bint with_special_cases) except -1
-    cdef str _split_affixes(self, Pool mem, str string,
-                            vector[LexemeC*] *prefixes,
-                            vector[LexemeC*] *suffixes, int* has_special,
-                            bint with_special_cases)
-    cdef int _attach_tokens(self, Doc tokens, str string,
-                            vector[LexemeC*] *prefixes,
-                            vector[LexemeC*] *suffixes, int* has_special,
-                            bint with_special_cases) except -1
-    cdef int _save_cached(self, const TokenC* tokens, hash_t key,
-                          int* has_special, int n) except -1
+    cdef void _filter_special_spans(
+        self,
+        vector[SpanC] &original,
+        vector[SpanC] &filtered,
+        int doc_len,
+    ) nogil
+    cdef object _prepare_special_spans(
+        self,
+        Doc doc,
+        vector[SpanC] &filtered,
+    )
+    cdef int _retokenize_special_spans(
+        self,
+        Doc doc,
+        TokenC* tokens,
+        object span_data,
+    )
+    cdef int _try_specials_and_cache(
+        self,
+        hash_t key,
+        Doc tokens,
+        int* has_special,
+        bint with_special_cases,
+    ) except -1
+    cdef int _tokenize(
+        self,
+        Doc tokens,
+        str span,
+        hash_t key,
+        int* has_special,
+        bint with_special_cases,
+    ) except -1
+    cdef str _split_affixes(
+        self,
+        Pool mem,
+        str string,
+        vector[LexemeC*] *prefixes,
+        vector[LexemeC*] *suffixes, int* has_special,
+        bint with_special_cases,
+    )
+    cdef int _attach_tokens(
+        self,
+        Doc tokens,
+        str string,
+        vector[LexemeC*] *prefixes,
+        vector[LexemeC*] *suffixes, int* has_special,
+        bint with_special_cases,
+    ) except -1
+    cdef int _save_cached(
+        self,
+        const TokenC* tokens,
+        hash_t key,
+        int* has_special,
+        int n,
+    ) except -1
@@ -8,20 +8,18 @@ from libcpp.set cimport set as stdset
 from preshed.maps cimport PreshMap
 
 import re
-import warnings
 
 from .lexeme cimport EMPTY_LEXEME
 from .strings cimport hash_string
 from .tokens.doc cimport Doc
 
 from . import util
 from .attrs import intify_attrs
-from .errors import Errors, Warnings
+from .errors import Errors
 from .scorer import Scorer
 from .symbols import NORM, ORTH
 from .tokens import Span
 from .training import validate_examples
-from .util import get_words_and_spaces, registry
+from .util import get_words_and_spaces
 
-
 cdef class Tokenizer:
@ -324,7 +322,7 @@ cdef class Tokenizer:
|
||||||
cdef int span_start
|
cdef int span_start
|
||||||
cdef int span_end
|
cdef int span_end
|
||||||
while i < doc.length:
|
while i < doc.length:
|
||||||
if not i in span_data:
|
if i not in span_data:
|
||||||
tokens[i + offset] = doc.c[i]
|
tokens[i + offset] = doc.c[i]
|
||||||
i += 1
|
i += 1
|
||||||
else:
|
else:
|
||||||
|
@ -395,12 +393,15 @@ cdef class Tokenizer:
|
||||||
self._save_cached(&tokens.c[orig_size], orig_key, has_special,
|
self._save_cached(&tokens.c[orig_size], orig_key, has_special,
|
||||||
tokens.length - orig_size)
|
tokens.length - orig_size)
|
||||||
|
|
||||||
cdef str _split_affixes(self, Pool mem, str string,
|
cdef str _split_affixes(
|
||||||
vector[const LexemeC*] *prefixes,
|
self,
|
||||||
vector[const LexemeC*] *suffixes,
|
Pool mem,
|
||||||
int* has_special,
|
str string,
|
||||||
bint with_special_cases):
|
vector[const LexemeC*] *prefixes,
|
||||||
cdef size_t i
|
vector[const LexemeC*] *suffixes,
|
||||||
|
int* has_special,
|
||||||
|
bint with_special_cases
|
||||||
|
):
|
||||||
cdef str prefix
|
cdef str prefix
|
||||||
cdef str suffix
|
cdef str suffix
|
||||||
cdef str minus_pre
|
cdef str minus_pre
|
||||||
|
@ -445,10 +446,6 @@ cdef class Tokenizer:
|
||||||
vector[const LexemeC*] *suffixes,
|
vector[const LexemeC*] *suffixes,
|
||||||
int* has_special,
|
int* has_special,
|
||||||
bint with_special_cases) except -1:
|
bint with_special_cases) except -1:
|
||||||
cdef bint specials_hit = 0
|
|
||||||
cdef bint cache_hit = 0
|
|
||||||
cdef int split, end
|
|
||||||
cdef const LexemeC* const* lexemes
|
|
||||||
cdef const LexemeC* lexeme
|
cdef const LexemeC* lexeme
|
||||||
cdef str span
|
cdef str span
|
||||||
cdef int i
|
cdef int i
|
||||||
|
@@ -458,9 +455,11 @@ cdef class Tokenizer:
         if string:
             if self._try_specials_and_cache(hash_string(string), tokens, has_special, with_special_cases):
                 pass
-            elif (self.token_match and self.token_match(string)) or \
-                    (self.url_match and \
-                    self.url_match(string)):
+            elif (
+                (self.token_match and self.token_match(string)) or
+                (self.url_match and self.url_match(string))
+            ):
+
                 # We're always saying 'no' to spaces here -- the caller will
                 # fix up the outermost one, with reference to the original.
                 # See Issue #859
@ -821,7 +820,7 @@ cdef class Tokenizer:
|
||||||
self.infix_finditer = None
|
self.infix_finditer = None
|
||||||
self.token_match = None
|
self.token_match = None
|
||||||
self.url_match = None
|
self.url_match = None
|
||||||
msg = util.from_bytes(bytes_data, deserializers, exclude)
|
util.from_bytes(bytes_data, deserializers, exclude)
|
||||||
if "prefix_search" in data and isinstance(data["prefix_search"], str):
|
if "prefix_search" in data and isinstance(data["prefix_search"], str):
|
||||||
self.prefix_search = re.compile(data["prefix_search"]).search
|
self.prefix_search = re.compile(data["prefix_search"]).search
|
||||||
if "suffix_search" in data and isinstance(data["suffix_search"], str):
|
if "suffix_search" in data and isinstance(data["suffix_search"], str):
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
# cython: infer_types=True, bounds_check=False, profile=True
|
# cython: infer_types=True, bounds_check=False, profile=True
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from libc.stdlib cimport free, malloc
|
from libc.string cimport memset
|
||||||
from libc.string cimport memcpy, memset
|
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
from thinc.api import get_array_module
|
from thinc.api import get_array_module
|
||||||
|
@ -10,7 +9,7 @@ from ..attrs cimport MORPH, NORM
|
||||||
from ..lexeme cimport EMPTY_LEXEME, Lexeme
|
from ..lexeme cimport EMPTY_LEXEME, Lexeme
|
||||||
from ..structs cimport LexemeC, TokenC
|
from ..structs cimport LexemeC, TokenC
|
||||||
from ..vocab cimport Vocab
|
from ..vocab cimport Vocab
|
||||||
from .doc cimport Doc, set_children_from_heads, token_by_end, token_by_start
|
from .doc cimport Doc, set_children_from_heads, token_by_start
|
||||||
from .span cimport Span
|
from .span cimport Span
|
||||||
from .token cimport Token
|
from .token cimport Token
|
||||||
|
|
||||||
|
@ -147,7 +146,7 @@ def _merge(Doc doc, merges):
|
||||||
syntactic root of the span.
|
syntactic root of the span.
|
||||||
RETURNS (Token): The first newly merged token.
|
RETURNS (Token): The first newly merged token.
|
||||||
"""
|
"""
|
||||||
cdef int i, merge_index, start, end, token_index, current_span_index, current_offset, offset, span_index
|
cdef int i, merge_index, start, token_index, current_span_index, current_offset, offset, span_index
|
||||||
cdef Span span
|
cdef Span span
|
||||||
cdef const LexemeC* lex
|
cdef const LexemeC* lex
|
||||||
cdef TokenC* token
|
cdef TokenC* token
|
||||||
|
@ -165,7 +164,6 @@ def _merge(Doc doc, merges):
|
||||||
merges.sort(key=_get_start)
|
merges.sort(key=_get_start)
|
||||||
for merge_index, (span, attributes) in enumerate(merges):
|
for merge_index, (span, attributes) in enumerate(merges):
|
||||||
start = span.start
|
start = span.start
|
||||||
end = span.end
|
|
||||||
spans.append(span)
|
spans.append(span)
|
||||||
# House the new merged token where it starts
|
# House the new merged token where it starts
|
||||||
token = &doc.c[start]
|
token = &doc.c[start]
|
||||||
|
@ -203,8 +201,9 @@ def _merge(Doc doc, merges):
|
||||||
# for the merged region. To do this, we create a boolean array indicating
|
# for the merged region. To do this, we create a boolean array indicating
|
||||||
# whether the row is to be deleted, then use numpy.delete
|
# whether the row is to be deleted, then use numpy.delete
|
||||||
if doc.tensor is not None and doc.tensor.size != 0:
|
if doc.tensor is not None and doc.tensor.size != 0:
|
||||||
doc.tensor = _resize_tensor(doc.tensor,
|
doc.tensor = _resize_tensor(
|
||||||
[(m[0].start, m[0].end) for m in merges])
|
doc.tensor, [(m[0].start, m[0].end) for m in merges]
|
||||||
|
)
|
||||||
# Memorize span roots and sets dependencies of the newly merged
|
# Memorize span roots and sets dependencies of the newly merged
|
||||||
# tokens to the dependencies of their roots.
|
# tokens to the dependencies of their roots.
|
||||||
span_roots = []
|
span_roots = []
|
||||||
|
@ -267,11 +266,11 @@ def _merge(Doc doc, merges):
|
||||||
span_index += 1
|
span_index += 1
|
||||||
if span_index < len(spans) and i == spans[span_index].start:
|
if span_index < len(spans) and i == spans[span_index].start:
|
||||||
# First token in a span
|
# First token in a span
|
||||||
doc.c[i - offset] = doc.c[i] # move token to its place
|
doc.c[i - offset] = doc.c[i] # move token to its place
|
||||||
offset += (spans[span_index].end - spans[span_index].start) - 1
|
offset += (spans[span_index].end - spans[span_index].start) - 1
|
||||||
in_span = True
|
in_span = True
|
||||||
if not in_span:
|
if not in_span:
|
||||||
doc.c[i - offset] = doc.c[i] # move token to its place
|
doc.c[i - offset] = doc.c[i] # move token to its place
|
||||||
|
|
||||||
for i in range(doc.length - offset, doc.length):
|
for i in range(doc.length - offset, doc.length):
|
||||||
memset(&doc.c[i], 0, sizeof(TokenC))
|
memset(&doc.c[i], 0, sizeof(TokenC))
|
||||||
|
@ -345,7 +344,11 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
|
||||||
if to_process_tensor:
|
if to_process_tensor:
|
||||||
xp = get_array_module(doc.tensor)
|
xp = get_array_module(doc.tensor)
|
||||||
if xp is numpy:
|
if xp is numpy:
|
||||||
doc.tensor = xp.append(doc.tensor, xp.zeros((nb_subtokens,doc.tensor.shape[1]), dtype="float32"), axis=0)
|
doc.tensor = xp.append(
|
||||||
|
doc.tensor,
|
||||||
|
xp.zeros((nb_subtokens, doc.tensor.shape[1]), dtype="float32"),
|
||||||
|
axis=0
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
shape = (doc.tensor.shape[0] + nb_subtokens, doc.tensor.shape[1])
|
shape = (doc.tensor.shape[0] + nb_subtokens, doc.tensor.shape[1])
|
||||||
resized_array = xp.zeros(shape, dtype="float32")
|
resized_array = xp.zeros(shape, dtype="float32")
|
||||||
|
@ -367,7 +370,8 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
|
||||||
token.norm = 0 # reset norm
|
token.norm = 0 # reset norm
|
||||||
if to_process_tensor:
|
if to_process_tensor:
|
||||||
# setting the tensors of the split tokens to array of zeros
|
# setting the tensors of the split tokens to array of zeros
|
||||||
doc.tensor[token_index + i:token_index + i + 1] = xp.zeros((1,doc.tensor.shape[1]), dtype="float32")
|
doc.tensor[token_index + i:token_index + i + 1] = \
|
||||||
|
xp.zeros((1, doc.tensor.shape[1]), dtype="float32")
|
||||||
# Update the character offset of the subtokens
|
# Update the character offset of the subtokens
|
||||||
if i != 0:
|
if i != 0:
|
||||||
token.idx = orig_token.idx + idx_offset
|
token.idx = orig_token.idx + idx_offset
|
||||||
|
@ -455,7 +459,6 @@ def normalize_token_attrs(Vocab vocab, attrs):
|
||||||
def set_token_attrs(Token py_token, attrs):
|
def set_token_attrs(Token py_token, attrs):
|
||||||
cdef TokenC* token = py_token.c
|
cdef TokenC* token = py_token.c
|
||||||
cdef const LexemeC* lex = token.lex
|
cdef const LexemeC* lex = token.lex
|
||||||
cdef Doc doc = py_token.doc
|
|
||||||
# Assign attributes
|
# Assign attributes
|
||||||
for attr_name, attr_value in attrs.items():
|
for attr_name, attr_value in attrs.items():
|
||||||
if attr_name == "_": # Set extension attributes
|
if attr_name == "_": # Set extension attributes
|
||||||
|
|
|
@ -31,7 +31,7 @@ cdef int token_by_start(const TokenC* tokens, int length, int start_char) except
|
||||||
cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
|
cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2
|
||||||
|
|
||||||
|
|
||||||
cdef int [:,:] _get_lca_matrix(Doc, int start, int end)
|
cdef int [:, :] _get_lca_matrix(Doc, int start, int end)
|
||||||
|
|
||||||
|
|
||||||
cdef class Doc:
|
cdef class Doc:
|
||||||
|
@ -61,7 +61,6 @@ cdef class Doc:
|
||||||
cdef int length
|
cdef int length
|
||||||
cdef int max_length
|
cdef int max_length
|
||||||
|
|
||||||
|
|
||||||
cdef public object noun_chunks_iterator
|
cdef public object noun_chunks_iterator
|
||||||
|
|
||||||
cdef object __weakref__
|
cdef object __weakref__
|
||||||
|
|
|
@ -35,6 +35,7 @@ from ..attrs cimport (
|
||||||
LENGTH,
|
LENGTH,
|
||||||
MORPH,
|
MORPH,
|
||||||
NORM,
|
NORM,
|
||||||
|
ORTH,
|
||||||
POS,
|
POS,
|
||||||
SENT_START,
|
SENT_START,
|
||||||
SPACY,
|
SPACY,
|
||||||
|
@ -42,14 +43,13 @@ from ..attrs cimport (
|
||||||
attr_id_t,
|
attr_id_t,
|
||||||
)
|
)
|
||||||
from ..lexeme cimport EMPTY_LEXEME, Lexeme
|
from ..lexeme cimport EMPTY_LEXEME, Lexeme
|
||||||
from ..typedefs cimport attr_t, flags_t
|
from ..typedefs cimport attr_t
|
||||||
from .token cimport Token
|
from .token cimport Token
|
||||||
|
|
||||||
from .. import parts_of_speech, schemas, util
|
from .. import parts_of_speech, schemas, util
|
||||||
from ..attrs import IDS, intify_attr
|
from ..attrs import IDS, intify_attr
|
||||||
from ..compat import copy_reg, pickle
|
from ..compat import copy_reg
|
||||||
from ..errors import Errors, Warnings
|
from ..errors import Errors, Warnings
|
||||||
from ..morphology import Morphology
|
|
||||||
from ..util import get_words_and_spaces
|
from ..util import get_words_and_spaces
|
||||||
from ._retokenize import Retokenizer
|
from ._retokenize import Retokenizer
|
||||||
from .underscore import Underscore, get_ext_args
|
from .underscore import Underscore, get_ext_args
|
||||||
|
@@ -613,13 +613,26 @@ cdef class Doc:
         """
         if "similarity" in self.user_hooks:
             return self.user_hooks["similarity"](self, other)
-        if isinstance(other, (Lexeme, Token)) and self.length == 1:
-            if self.c[0].lex.orth == other.orth:
+        attr = getattr(self.vocab.vectors, "attr", ORTH)
+        cdef Token this_token
+        cdef Token other_token
+        cdef Lexeme other_lex
+        if len(self) == 1 and isinstance(other, Token):
+            this_token = self[0]
+            other_token = other
+            if Token.get_struct_attr(this_token.c, attr) == Token.get_struct_attr(other_token.c, attr):
                 return 1.0
-        elif isinstance(other, (Span, Doc)) and len(self) == len(other):
+        elif len(self) == 1 and isinstance(other, Lexeme):
+            this_token = self[0]
+            other_lex = other
+            if Token.get_struct_attr(this_token.c, attr) == Lexeme.get_struct_attr(other_lex.c, attr):
+                return 1.0
+        elif isinstance(other, (Doc, Span)) and len(self) == len(other):
             similar = True
-            for i in range(self.length):
-                if self[i].orth != other[i].orth:
+            for i in range(len(self)):
+                this_token = self[i]
+                other_token = other[i]
+                if Token.get_struct_attr(this_token.c, attr) != Token.get_struct_attr(other_token.c, attr):
                     similar = False
                     break
             if similar:
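A minimal sketch of the rewritten single-token fast path, assuming the default `ORTH` vectors attribute and no loaded vectors:

```python
import spacy

nlp = spacy.blank("en")
doc1 = nlp("apple")
doc2 = nlp("apple")

# A length-one Doc compared to a Token that matches on the vectors attr
# (ORTH by default) short-circuits to 1.0 before any vectors are consulted.
print(doc1.similarity(doc2[0]))  # 1.0
```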
@ -770,7 +783,7 @@ cdef class Doc:
|
||||||
# TODO:
|
# TODO:
|
||||||
# 1. Test basic data-driven ORTH gazetteer
|
# 1. Test basic data-driven ORTH gazetteer
|
||||||
# 2. Test more nuanced date and currency regex
|
# 2. Test more nuanced date and currency regex
|
||||||
cdef attr_t entity_type, kb_id, ent_id
|
cdef attr_t kb_id, ent_id
|
||||||
cdef int ent_start, ent_end
|
cdef int ent_start, ent_end
|
||||||
ent_spans = []
|
ent_spans = []
|
||||||
for ent_info in ents:
|
for ent_info in ents:
|
||||||
|
@ -973,7 +986,6 @@ cdef class Doc:
|
||||||
>>> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
|
>>> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
|
||||||
"""
|
"""
|
||||||
cdef int i, j
|
cdef int i, j
|
||||||
cdef attr_id_t feature
|
|
||||||
cdef np.ndarray[attr_t, ndim=2] output
|
cdef np.ndarray[attr_t, ndim=2] output
|
||||||
# Handle scalar/list inputs of strings/ints for py_attr_ids
|
# Handle scalar/list inputs of strings/ints for py_attr_ids
|
||||||
# See also #3064
|
# See also #3064
|
||||||
|
@ -985,8 +997,10 @@ cdef class Doc:
|
||||||
py_attr_ids = [py_attr_ids]
|
py_attr_ids = [py_attr_ids]
|
||||||
# Allow strings, e.g. 'lemma' or 'LEMMA'
|
# Allow strings, e.g. 'lemma' or 'LEMMA'
|
||||||
try:
|
try:
|
||||||
py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
|
py_attr_ids = [
|
||||||
for id_ in py_attr_ids]
|
(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
|
||||||
|
for id_ in py_attr_ids
|
||||||
|
]
|
||||||
except KeyError as msg:
|
except KeyError as msg:
|
||||||
keys = [k for k in IDS.keys() if not k.startswith("FLAG")]
|
keys = [k for k in IDS.keys() if not k.startswith("FLAG")]
|
||||||
raise KeyError(Errors.E983.format(dict="IDS", key=msg, keys=keys)) from None
|
raise KeyError(Errors.E983.format(dict="IDS", key=msg, keys=keys)) from None
|
||||||
|
@ -1016,8 +1030,6 @@ cdef class Doc:
|
||||||
DOCS: https://spacy.io/api/doc#count_by
|
DOCS: https://spacy.io/api/doc#count_by
|
||||||
"""
|
"""
|
||||||
cdef int i
|
cdef int i
|
||||||
cdef attr_t attr
|
|
||||||
cdef size_t count
|
|
||||||
|
|
||||||
if counts is None:
|
if counts is None:
|
||||||
counts = Counter()
|
counts = Counter()
|
||||||
|
@ -1079,7 +1091,6 @@ cdef class Doc:
|
||||||
cdef int i, col
|
cdef int i, col
|
||||||
cdef int32_t abs_head_index
|
cdef int32_t abs_head_index
|
||||||
cdef attr_id_t attr_id
|
cdef attr_id_t attr_id
|
||||||
cdef TokenC* tokens = self.c
|
|
||||||
cdef int length = len(array)
|
cdef int length = len(array)
|
||||||
if length != len(self):
|
if length != len(self):
|
||||||
raise ValueError(Errors.E971.format(array_length=length, doc_length=len(self)))
|
raise ValueError(Errors.E971.format(array_length=length, doc_length=len(self)))
|
||||||
|
@ -1211,7 +1222,7 @@ cdef class Doc:
|
||||||
span.label,
|
span.label,
|
||||||
span.kb_id,
|
span.kb_id,
|
||||||
span.id,
|
span.id,
|
||||||
span.text, # included as a check
|
span.text, # included as a check
|
||||||
))
|
))
|
||||||
char_offset += len(doc.text)
|
char_offset += len(doc.text)
|
||||||
if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space and not bool(doc[-1].whitespace_):
|
if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space and not bool(doc[-1].whitespace_):
|
||||||
|
@ -1494,7 +1505,6 @@ cdef class Doc:
|
||||||
attributes are inherited from the syntactic root of the span.
|
attributes are inherited from the syntactic root of the span.
|
||||||
RETURNS (Token): The first newly merged token.
|
RETURNS (Token): The first newly merged token.
|
||||||
"""
|
"""
|
||||||
cdef str tag, lemma, ent_type
|
|
||||||
attr_len = len(attributes)
|
attr_len = len(attributes)
|
||||||
span_len = len(spans)
|
span_len = len(spans)
|
||||||
if not attr_len == span_len:
|
if not attr_len == span_len:
|
||||||
|
@ -1610,7 +1620,6 @@ cdef class Doc:
|
||||||
for token in char_span[1:]:
|
for token in char_span[1:]:
|
||||||
token.is_sent_start = False
|
token.is_sent_start = False
|
||||||
|
|
||||||
|
|
||||||
for span_group in doc_json.get("spans", {}):
|
for span_group in doc_json.get("spans", {}):
|
||||||
spans = []
|
spans = []
|
||||||
for span in doc_json["spans"][span_group]:
|
for span in doc_json["spans"][span_group]:
|
||||||
|
@ -1642,7 +1651,7 @@ cdef class Doc:
|
||||||
start = token_by_char(self.c, self.length, token_data["start"])
|
start = token_by_char(self.c, self.length, token_data["start"])
|
||||||
value = token_data["value"]
|
value = token_data["value"]
|
||||||
self[start]._.set(token_attr, value)
|
self[start]._.set(token_attr, value)
|
||||||
|
|
||||||
for span_attr in doc_json.get("underscore_span", {}):
|
for span_attr in doc_json.get("underscore_span", {}):
|
||||||
if not Span.has_extension(span_attr):
|
if not Span.has_extension(span_attr):
|
||||||
Span.set_extension(span_attr)
|
Span.set_extension(span_attr)
|
||||||
|
@ -1684,7 +1693,7 @@ cdef class Doc:
|
||||||
token_data["dep"] = token.dep_
|
token_data["dep"] = token.dep_
|
||||||
token_data["head"] = token.head.i
|
token_data["head"] = token.head.i
|
||||||
data["tokens"].append(token_data)
|
data["tokens"].append(token_data)
|
||||||
|
|
||||||
if self.spans:
|
if self.spans:
|
||||||
data["spans"] = {}
|
data["spans"] = {}
|
||||||
for span_group in self.spans:
|
for span_group in self.spans:
|
||||||
|
@ -1755,7 +1764,6 @@ cdef class Doc:
|
||||||
output.fill(255)
|
output.fill(255)
|
||||||
cdef int i, j, start_idx, end_idx
|
cdef int i, j, start_idx, end_idx
|
||||||
cdef bytes byte_string
|
cdef bytes byte_string
|
||||||
cdef unsigned char utf8_char
|
|
||||||
for i, byte_string in enumerate(byte_strings):
|
for i, byte_string in enumerate(byte_strings):
|
||||||
j = 0
|
j = 0
|
||||||
start_idx = 0
|
start_idx = 0
|
||||||
|
@ -1808,8 +1816,6 @@ cdef int token_by_char(const TokenC* tokens, int length, int char_idx) except -2
|
||||||
|
|
||||||
cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1:
|
cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1:
|
||||||
# note: end is exclusive
|
# note: end is exclusive
|
||||||
cdef TokenC* head
|
|
||||||
cdef TokenC* child
|
|
||||||
cdef int i
|
cdef int i
|
||||||
# Set number of left/right children to 0. We'll increment it in the loops.
|
# Set number of left/right children to 0. We'll increment it in the loops.
|
||||||
for i in range(start, end):
|
for i in range(start, end):
|
||||||
|
@ -1909,7 +1915,7 @@ cdef int _get_tokens_lca(Token token_j, Token token_k):
|
||||||
return -1
|
return -1
|
||||||
|
|
||||||
|
|
||||||
cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
|
cdef int [:, :] _get_lca_matrix(Doc doc, int start, int end):
|
||||||
"""Given a doc and a start and end position defining a set of contiguous
|
"""Given a doc and a start and end position defining a set of contiguous
|
||||||
tokens within it, returns a matrix of Lowest Common Ancestors (LCA), where
|
tokens within it, returns a matrix of Lowest Common Ancestors (LCA), where
|
||||||
LCA[i, j] is the index of the lowest common ancestor among token i and j.
|
LCA[i, j] is the index of the lowest common ancestor among token i and j.
|
||||||
|
@ -1922,7 +1928,7 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
|
||||||
RETURNS (int [:, :]): memoryview of numpy.array[ndim=2, dtype=numpy.int32],
|
RETURNS (int [:, :]): memoryview of numpy.array[ndim=2, dtype=numpy.int32],
|
||||||
with shape (n, n), where n = len(doc).
|
with shape (n, n), where n = len(doc).
|
||||||
"""
|
"""
|
||||||
cdef int [:,:] lca_matrix
|
cdef int [:, :] lca_matrix
|
||||||
cdef int j, k
|
cdef int j, k
|
||||||
n_tokens= end - start
|
n_tokens= end - start
|
||||||
lca_mat = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32)
|
lca_mat = numpy.empty((n_tokens, n_tokens), dtype=numpy.int32)
|
||||||
|
|
|
@ -3,7 +3,7 @@ from typing import Generator, List, Tuple
|
||||||
|
|
||||||
cimport cython
|
cimport cython
|
||||||
from cython.operator cimport dereference
|
from cython.operator cimport dereference
|
||||||
from libc.stdint cimport int32_t, int64_t
|
from libc.stdint cimport int32_t
|
||||||
from libcpp.pair cimport pair
|
from libcpp.pair cimport pair
|
||||||
from libcpp.unordered_map cimport unordered_map
|
from libcpp.unordered_map cimport unordered_map
|
||||||
from libcpp.unordered_set cimport unordered_set
|
from libcpp.unordered_set cimport unordered_set
|
||||||
|
@ -11,7 +11,6 @@ from libcpp.unordered_set cimport unordered_set
|
||||||
import weakref
|
import weakref
|
||||||
|
|
||||||
 from murmurhash.mrmr cimport hash64
-from preshed.maps cimport map_get_unless_missing

 from .. import Errors

@@ -28,7 +27,7 @@ from .token import Token
 cdef class Edge:
 cdef readonly Graph graph
 cdef readonly int i

 def __init__(self, Graph graph, int i):
 self.graph = graph
 self.i = i
@@ -44,7 +43,7 @@ cdef class Edge:
 @property
 def head(self) -> "Node":
 return Node(self.graph, self.graph.c.edges[self.i].head)

 @property
 def tail(self) -> "Tail":
 return Node(self.graph, self.graph.c.edges[self.i].tail)
@@ -70,7 +69,7 @@ cdef class Node:
 def __init__(self, Graph graph, int i):
 """A reference to a node of an annotation graph. Each node is made up of
 an ordered set of zero or more token indices.

 Node references are usually created by the Graph object itself, or from
 the Node or Edge objects. You usually won't need to instantiate this
 class yourself.
@@ -109,13 +108,13 @@ cdef class Node:
 @property
 def is_none(self) -> bool:
 """Whether the node is a special value, indicating 'none'.

 The NoneNode type is returned by the Graph, Edge and Node objects when
 there is no match to a query. It has the same API as Node, but it always
 returns NoneNode, NoneEdge or empty lists for its queries.
 """
 return False

 @property
 def doc(self) -> "Doc":
 """The Doc object that the graph refers to."""
@@ -130,19 +129,19 @@ cdef class Node:
 def head(self, i=None, label=None) -> "Node":
 """Get the head of the first matching edge, searching by index, label,
 both or neither.

 For instance, `node.head(i=1)` will get the head of the second edge that
 this node is a tail of. `node.head(i=1, label="ARG0")` will further
 check that the second edge has the label `"ARG0"`.

 If no matching node can be found, the graph's NoneNode is returned.
 """
 return self.headed(i=i, label=label)

 def tail(self, i=None, label=None) -> "Node":
 """Get the tail of the first matching edge, searching by index, label,
 both or neither.

 If no matching node can be found, the graph's NoneNode is returned.
 """
 return self.tailed(i=i, label=label).tail
@@ -171,7 +170,7 @@ cdef class Node:
 cdef vector[int] edge_indices
 self._find_edges(edge_indices, "head", label)
 return [Node(self.graph, self.graph.c.edges[i].head) for i in edge_indices]

 def tails(self, label=None) -> List["Node"]:
 """Find all matching tails of this node."""
 cdef vector[int] edge_indices
@@ -200,7 +199,7 @@ cdef class Node:
 return NoneEdge(self.graph)
 else:
 return Edge(self.graph, idx)

 def tailed(self, i=None, label=None) -> Edge:
 """Find the first matching edge tailed by this node.
 If no matching edge can be found, the graph's NoneEdge is returned.
@@ -283,7 +282,7 @@ cdef class NoneEdge(Edge):
 def __init__(self, graph):
 self.graph = graph
 self.i = -1

 @property
 def doc(self) -> "Doc":
 return self.graph.doc
@@ -291,7 +290,7 @@ cdef class NoneEdge(Edge):
 @property
 def head(self) -> "NoneNode":
 return NoneNode(self.graph)

 @property
 def tail(self) -> "NoneNode":
 return NoneNode(self.graph)
@@ -319,7 +318,7 @@ cdef class NoneNode(Node):

 def __len__(self):
 return 0

 @property
 def is_none(self):
 return -1
@@ -340,14 +339,14 @@ cdef class NoneNode(Node):

 def walk_heads(self):
 yield from []

 def walk_tails(self):
 yield from []


 cdef class Graph:
 """A set of directed labelled relationships between sets of tokens.

 EXAMPLE:
 Construction 1
 >>> graph = Graph(doc, name="srl")
@@ -372,7 +371,9 @@ cdef class Graph:
 >>> assert graph.has_node((0,))
 >>> assert graph.has_edge((0,), (1,3), label="agent")
 """
-def __init__(self, doc, *, name="", nodes=[], edges=[], labels=None, weights=None):
+def __init__(
+self, doc, *, name="", nodes=[], edges=[], labels=None, weights=None  # no-cython-lint
+):
 """Create a Graph object.

 doc (Doc): The Doc object the graph will refer to.
@@ -438,13 +439,11 @@ cdef class Graph:

 def add_edge(self, head, tail, *, label="", weight=None) -> Edge:
 """Add an edge to the graph, connecting two groups of tokens.

 If there is already an edge for the (head, tail, label) triple, it will
 be returned, and no new edge will be created. The weight of the edge
 will be updated if a weight is specified.
 """
-label_hash = self.doc.vocab.strings.as_int(label)
-weight_float = weight if weight is not None else 0.0
 edge_index = add_edge(
 &self.c,
 EdgeC(
@@ -478,11 +477,11 @@ cdef class Graph:
 def has_edge(self, head, tail, label) -> bool:
 """Check whether a (head, tail, label) triple is an edge in the graph."""
 return not self.get_edge(head, tail, label=label).is_none

 def add_node(self, indices) -> Node:
 """Add a node to the graph and return it. Nodes refer to ordered sets
 of token indices.

 This method is idempotent: if there is already a node for the given
 indices, it is returned without a new node being created.
 """
@@ -510,7 +509,7 @@ cdef class Graph:
 return NoneNode(self)
 else:
 return Node(self, node_index)

 def has_node(self, tuple indices) -> bool:
 """Check whether the graph has a node for the given indices."""
 return not self.get_node(indices).is_none
@@ -570,7 +569,7 @@ cdef int add_node(GraphC* graph, vector[int32_t]& node) nogil:
 graph.roots.insert(index)
 graph.node_map.insert(pair[hash_t, int](key, index))
 return index


 cdef int get_node(const GraphC* graph, vector[int32_t] node) nogil:
 key = hash64(&node[0], node.size() * sizeof(node[0]), 0)
@@ -89,4 +89,3 @@ cdef class MorphAnalysis:

 def __repr__(self):
 return self.to_json()
-
@@ -1,5 +1,4 @@
 cimport numpy as np
-from libc.math cimport sqrt

 import copy
 import warnings
@@ -8,13 +7,13 @@ import numpy
 from thinc.api import get_array_module

 from ..attrs cimport *
-from ..attrs cimport attr_id_t
+from ..attrs cimport ORTH, attr_id_t
 from ..lexeme cimport Lexeme
-from ..parts_of_speech cimport univ_pos_t
-from ..structs cimport LexemeC, TokenC
+from ..structs cimport TokenC
 from ..symbols cimport dep
-from ..typedefs cimport attr_t, flags_t, hash_t
-from .doc cimport _get_lca_matrix, get_token_attr, token_by_end, token_by_start
+from ..typedefs cimport attr_t, hash_t
+from .doc cimport _get_lca_matrix, get_token_attr
+from .token cimport Token

 from ..errors import Errors, Warnings
 from ..util import normalize_slice
@@ -341,13 +340,26 @@ cdef class Span:
 """
 if "similarity" in self.doc.user_span_hooks:
 return self.doc.user_span_hooks["similarity"](self, other)
-if len(self) == 1 and hasattr(other, "orth"):
-if self[0].orth == other.orth:
+attr = getattr(self.doc.vocab.vectors, "attr", ORTH)
+cdef Token this_token
+cdef Token other_token
+cdef Lexeme other_lex
+if len(self) == 1 and isinstance(other, Token):
+this_token = self[0]
+other_token = other
+if Token.get_struct_attr(this_token.c, attr) == Token.get_struct_attr(other_token.c, attr):
+return 1.0
+elif len(self) == 1 and isinstance(other, Lexeme):
+this_token = self[0]
+other_lex = other
+if Token.get_struct_attr(this_token.c, attr) == Lexeme.get_struct_attr(other_lex.c, attr):
 return 1.0
 elif isinstance(other, (Doc, Span)) and len(self) == len(other):
 similar = True
 for i in range(len(self)):
-if self[i].orth != getattr(other[i], "orth", None):
+this_token = self[i]
+other_token = other[i]
+if Token.get_struct_attr(this_token.c, attr) != Token.get_struct_attr(other_token.c, attr):
 similar = False
 break
 if similar:
@@ -581,7 +593,6 @@ cdef class Span:
 """
 return "".join([t.text_with_ws for t in self])


 @property
 def noun_chunks(self):
 """Iterate over the base noun phrases in the span. Yields base
@@ -1,7 +1,7 @@
 import struct
 import weakref
 from copy import deepcopy
-from typing import TYPE_CHECKING, Iterable, Optional, Tuple, Union
+from typing import Iterable, Optional, Union

 import srsly

@@ -34,7 +34,7 @@ cdef class SpanGroup:

 DOCS: https://spacy.io/api/spangroup
 """
-def __init__(self, doc, *, name="", attrs={}, spans=[]):
+def __init__(self, doc, *, name="", attrs={}, spans=[]):  # no-cython-lint
 """Create a SpanGroup.

 doc (Doc): The reference Doc object.
@@ -311,7 +311,7 @@ cdef class SpanGroup:

 other_attrs = deepcopy(other_group.attrs)
 span_group.attrs.update({
-key: value for key, value in other_attrs.items() \
+key: value for key, value in other_attrs.items()
 if key not in span_group.attrs
 })
 if len(other_group):
@@ -26,7 +26,7 @@ cdef class Token:
 cdef Token self = Token.__new__(Token, vocab, doc, offset)
 return self

-#cdef inline TokenC struct_from_attrs(Vocab vocab, attrs):
+# cdef inline TokenC struct_from_attrs(Vocab vocab, attrs):
 #     cdef TokenC token
 #     attrs = normalize_attrs(attrs)

@@ -98,12 +98,10 @@ cdef class Token:
 elif feat_name == SENT_START:
 token.sent_start = value


 @staticmethod
 cdef inline int missing_dep(const TokenC* token) nogil:
 return token.dep == MISSING_DEP


 @staticmethod
 cdef inline int missing_head(const TokenC* token) nogil:
 return Token.missing_dep(token)
@@ -1,13 +1,11 @@
 # cython: infer_types=True
 # Compiler crashes on memory view coercion without this. Should report bug.
 cimport numpy as np
-from cython.view cimport array as cvarray

 np.import_array()

 import warnings

-import numpy
 from thinc.api import get_array_module

 from ..attrs cimport (
@@ -28,6 +26,7 @@ from ..attrs cimport (
 LIKE_EMAIL,
 LIKE_NUM,
 LIKE_URL,
+ORTH,
 )
 from ..lexeme cimport Lexeme
 from ..symbols cimport conj
@@ -214,11 +213,17 @@ cdef class Token:
 """
 if "similarity" in self.doc.user_token_hooks:
 return self.doc.user_token_hooks["similarity"](self, other)
-if hasattr(other, "__len__") and len(other) == 1 and hasattr(other, "__getitem__"):
-if self.c.lex.orth == getattr(other[0], "orth", None):
+attr = getattr(self.doc.vocab.vectors, "attr", ORTH)
+cdef Token this_token = self
+cdef Token other_token
+cdef Lexeme other_lex
+if isinstance(other, Token):
+other_token = other
+if Token.get_struct_attr(this_token.c, attr) == Token.get_struct_attr(other_token.c, attr):
 return 1.0
-elif hasattr(other, "orth"):
-if self.c.lex.orth == other.orth:
+elif isinstance(other, Lexeme):
+other_lex = other
+if Token.get_struct_attr(this_token.c, attr) == Lexeme.get_struct_attr(other_lex.c, attr):
 return 1.0
 if self.vocab.vectors.n_keys == 0:
 warnings.warn(Warnings.W007.format(obj="Token"))
@@ -231,7 +236,7 @@ cdef class Token:
 result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
 # ensure we get a scalar back (numpy does this automatically but cupy doesn't)
 return result.item()

 def has_morph(self):
 """Check whether the token has annotated morph information.
 Return False when the morph annotation is unset/missing.
@@ -415,7 +420,7 @@ cdef class Token:
 return self.doc.user_token_hooks["has_vector"](self)
 if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
 return True
-return self.vocab.has_vector(self.c.lex.orth)
+return self.vocab.has_vector(Token.get_struct_attr(self.c, self.vocab.vectors.attr))

 @property
 def vector(self):
@@ -431,7 +436,7 @@ cdef class Token:
 if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
 return self.doc.tensor[self.i]
 else:
-return self.vocab.get_vector(self.c.lex.orth)
+return self.vocab.get_vector(Token.get_struct_attr(self.c, self.vocab.vectors.attr))

 @property
 def vector_norm(self):
@@ -538,9 +543,9 @@ cdef class Token:
 def __get__(self):
 if self.i + 1 == len(self.doc):
 return True
-elif self.doc[self.i+1].is_sent_start == None:
+elif self.doc[self.i+1].is_sent_start is None:
 return None
-elif self.doc[self.i+1].is_sent_start == True:
+elif self.doc[self.i+1].is_sent_start is True:
 return True
 else:
 return False
@@ -37,10 +37,14 @@ def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[Li
 b2a.append(set())
 # Process the alignment at the current position
 if A[token_idx_a] == B[token_idx_b] and \
-(char_idx_a == 0 or \
-char_to_token_a[char_idx_a - 1] < token_idx_a) and \
-(char_idx_b == 0 or \
-char_to_token_b[char_idx_b - 1] < token_idx_b):
+(
+char_idx_a == 0 or
+char_to_token_a[char_idx_a - 1] < token_idx_a
+) and \
+(
+char_idx_b == 0 or
+char_to_token_b[char_idx_b - 1] < token_idx_b
+):
 # Current tokens are identical and both character offsets are the
 # start of a token (either at the beginning of the document or the
 # previous character belongs to a different token)
@@ -1,4 +1,3 @@
-import warnings
 from collections.abc import Iterable as IterableInstance

 import numpy
@@ -31,9 +30,9 @@ cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot):
 attrs, array = _annot2array(vocab, tok_annot, doc_annot)
 output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"])
 if "entities" in doc_annot:
 _add_entities_to_doc(output, doc_annot["entities"])
 if "spans" in doc_annot:
 _add_spans_to_doc(output, doc_annot["spans"])
 if array.size:
 output = output.from_array(attrs, array)
 # links are currently added with ENT_KB_ID on the token level
@@ -161,7 +160,6 @@ cdef class Example:
 self._y_sig = y_sig
 return self._cached_alignment


 def _get_aligned_vectorized(self, align, gold_values):
 # Fast path for Doc attributes/fields that are predominantly a single value,
 # i.e., TAG, POS, MORPH.
@@ -204,7 +202,6 @@ cdef class Example:

 return output.tolist()


 def _get_aligned_non_vectorized(self, align, gold_values):
 # Slower path for fields that return multiple values (resulting
 # in ragged arrays that cannot be vectorized trivially).
@@ -221,7 +218,6 @@ cdef class Example:

 return output


 def get_aligned(self, field, as_string=False):
 """Return an aligned array for a token attribute."""
 align = self.alignment.x2y
@@ -330,7 +326,7 @@ cdef class Example:
 missing=None
 )
 # Now fill the tokens we can align to O.
-O = 2  # I=1, O=2, B=3
+O = 2  # I=1, O=2, B=3  # no-cython-lint: E741
 for i, ent_iob in enumerate(self.get_aligned("ENT_IOB")):
 if x_tags[i] is None:
 if ent_iob == O:
@@ -340,7 +336,7 @@ cdef class Example:
 return x_ents, x_tags

 def get_aligned_ner(self):
-x_ents, x_tags = self.get_aligned_ents_and_ner()
+_x_ents, x_tags = self.get_aligned_ents_and_ner()
 return x_tags

 def get_matching_ents(self, check_label=True):
@@ -398,7 +394,6 @@ cdef class Example:

 return span_dict


 def _links_to_dict(self):
 links = {}
 for ent in self.reference.ents:
@@ -589,6 +584,7 @@ def _fix_legacy_dict_data(example_dict):
 "doc_annotation": doc_dict
 }


 def _has_field(annot, field):
 if field not in annot:
 return False
@@ -625,6 +621,7 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces):
 ent_types.append("")
 return ent_iobs, ent_types


 def _parse_links(vocab, words, spaces, links):
 reference = Doc(vocab, words=words, spaces=spaces)
 starts = {token.idx: token.i for token in reference}
@@ -1,4 +1,3 @@
-import json
 import warnings

 import srsly
@@ -6,7 +5,7 @@ import srsly
 from .. import util
 from ..errors import Warnings
 from ..tokens import Doc
-from .iob_utils import offsets_to_biluo_tags, tags_to_entities
+from .iob_utils import offsets_to_biluo_tags


 def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
@@ -23,7 +22,13 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
 json_doc = {"id": doc_id, "paragraphs": []}
 for i, doc in enumerate(docs):
 raw = None if doc.has_unknown_spaces else doc.text
-json_para = {'raw': raw, "sentences": [], "cats": [], "entities": [], "links": []}
+json_para = {
+'raw': raw,
+"sentences": [],
+"cats": [],
+"entities": [],
+"links": []
+}
 for cat, val in doc.cats.items():
 json_cat = {"label": cat, "value": val}
 json_para["cats"].append(json_cat)
@@ -35,13 +40,17 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
 if ent.kb_id_:
 link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}}
 json_para["links"].append(link_dict)
-biluo_tags = offsets_to_biluo_tags(doc, json_para["entities"], missing=ner_missing_tag)
+biluo_tags = offsets_to_biluo_tags(
+doc, json_para["entities"], missing=ner_missing_tag
+)
 attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "ENT_IOB")
 include_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
 for j, sent in enumerate(doc.sents):
 json_sent = {"tokens": [], "brackets": []}
 for token in sent:
-json_token = {"id": token.i, "orth": token.text, "space": token.whitespace_}
+json_token = {
+"id": token.i, "orth": token.text, "space": token.whitespace_
+}
 if include_annotation["TAG"]:
 json_token["tag"] = token.tag_
 if include_annotation["POS"]:
@@ -125,9 +134,14 @@ def json_to_annotations(doc):
 else:
 sent_starts.append(-1)
 if "brackets" in sent:
-brackets.extend((b["first"] + sent_start_i,
-b["last"] + sent_start_i, b["label"])
-for b in sent["brackets"])
+brackets.extend(
+(
+b["first"] + sent_start_i,
+b["last"] + sent_start_i,
+b["label"]
+)
+for b in sent["brackets"]
+)

 example["token_annotation"] = dict(
 ids=ids,
@@ -160,6 +174,7 @@ def json_to_annotations(doc):
 )
 yield example


 def json_iterate(bytes utf8_str):
 # We should've made these files jsonl...But since we didn't, parse out
 # the docs one-by-one to reduce memory usage.
@@ -76,7 +76,8 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
 with nlp.select_pipes(enable=resume_components):
 logger.info("Resuming training for: %s", resume_components)
 nlp.resume_training(sgd=optimizer)
-# Make sure that listeners are defined before initializing further
+# Make sure that internal component names are synced and listeners are
+# defined before initializing further
 nlp._link_components()
 with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
 if T["max_epochs"] == -1:
@@ -215,9 +216,14 @@ def convert_vectors(
 prune: int,
 name: Optional[str] = None,
 mode: str = VectorsMode.default,
+attr: str = "ORTH",
 ) -> None:
 vectors_loc = ensure_path(vectors_loc)
 if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
+if attr != "ORTH":
+raise ValueError(
+"ORTH is the only attribute supported for vectors in .npz format."
+)
 nlp.vocab.vectors = Vectors(
 strings=nlp.vocab.strings, data=numpy.load(vectors_loc.open("rb"))
 )
@@ -245,11 +251,15 @@ def convert_vectors(
 nlp.vocab.vectors = Vectors(
 strings=nlp.vocab.strings,
 data=vectors_data,
+attr=attr,
 **floret_settings,
 )
 else:
 nlp.vocab.vectors = Vectors(
-strings=nlp.vocab.strings, data=vectors_data, keys=vector_keys
+strings=nlp.vocab.strings,
+data=vectors_data,
+keys=vector_keys,
+attr=attr,
 )
 nlp.vocab.deduplicate_vectors()
 if name is None:
@@ -534,7 +534,7 @@ def load_model_from_path(
 if not meta:
 meta = get_model_meta(model_path)
 config_path = model_path / "config.cfg"
-overrides = dict_to_dot(config)
+overrides = dict_to_dot(config, for_overrides=True)
 config = load_config(config_path, overrides=overrides)
 nlp = load_model_from_config(
 config,
@@ -1502,14 +1502,19 @@ def dot_to_dict(values: Dict[str, Any]) -> Dict[str, dict]:
 return result


-def dict_to_dot(obj: Dict[str, dict]) -> Dict[str, Any]:
+def dict_to_dot(obj: Dict[str, dict], *, for_overrides: bool = False) -> Dict[str, Any]:
 """Convert dot notation to a dict. For example: {"token": {"pos": True,
 "_": {"xyz": True }}} becomes {"token.pos": True, "token._.xyz": True}.

-values (Dict[str, dict]): The dict to convert.
+obj (Dict[str, dict]): The dict to convert.
+for_overrides (bool): Whether to enable special handling for registered
+functions in overrides.
 RETURNS (Dict[str, Any]): The key/value pairs.
 """
-return {".".join(key): value for key, value in walk_dict(obj)}
+return {
+".".join(key): value
+for key, value in walk_dict(obj, for_overrides=for_overrides)
+}


 def dot_to_object(config: Config, section: str):
@@ -1551,13 +1556,20 @@ def set_dot_to_object(config: Config, section: str, value: Any) -> None:


 def walk_dict(
-node: Dict[str, Any], parent: List[str] = []
+node: Dict[str, Any], parent: List[str] = [], *, for_overrides: bool = False
 ) -> Iterator[Tuple[List[str], Any]]:
-"""Walk a dict and yield the path and values of the leaves."""
+"""Walk a dict and yield the path and values of the leaves.
+
+for_overrides (bool): Whether to treat registered functions that start with
+@ as final values rather than dicts to traverse.
+"""
 for key, value in node.items():
 key_parent = [*parent, key]
-if isinstance(value, dict):
-yield from walk_dict(value, key_parent)
+if isinstance(value, dict) and (
+not for_overrides
+or not any(value_key.startswith("@") for value_key in value)
+):
+yield from walk_dict(value, key_parent, for_overrides=for_overrides)
 else:
 yield (key_parent, value)

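Not part of the commit: a minimal sketch of the effect of the new `for_overrides` flag, based only on the `walk_dict` logic shown above (the `spacy.util` import path and the example config keys are assumptions):

```python
# Sketch of dict_to_dot(..., for_overrides=True), assuming the changes above.
from spacy.util import dict_to_dot  # assumed import path

cfg = {
    "components": {
        "ner": {"source": "en_core_web_sm"},
        "tok2vec": {"model": {"@architectures": "spacy.Tok2Vec.v2"}},
    }
}
# Default: every leaf becomes its own dotted key, including keys inside
# registered-function blocks.
print(dict_to_dot(cfg))
# {'components.ner.source': 'en_core_web_sm',
#  'components.tok2vec.model.@architectures': 'spacy.Tok2Vec.v2'}

# With for_overrides=True, a dict containing an "@..." key is treated as a
# final value, so the whole registered-function block stays intact.
print(dict_to_dot(cfg, for_overrides=True))
# {'components.ner.source': 'en_core_web_sm',
#  'components.tok2vec.model': {'@architectures': 'spacy.Tok2Vec.v2'}}
```

This matches how `load_model_from_path` now calls `dict_to_dot(config, for_overrides=True)`, so overrides that swap in a registered function are passed through whole.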
@@ -1,10 +1,8 @@
-cimport numpy as np
 from cython.operator cimport dereference as deref
 from libc.stdint cimport uint32_t, uint64_t
 from libcpp.set cimport set as cppset
 from murmurhash.mrmr cimport hash128_x64

-import functools
 import warnings
 from enum import Enum
 from typing import cast
@@ -15,9 +13,11 @@ from thinc.api import Ops, get_array_module, get_current_ops
 from thinc.backends import get_array_ops
 from thinc.types import Floats2d

+from .attrs cimport ORTH, attr_id_t
 from .strings cimport StringStore

 from . import util
+from .attrs import IDS
 from .errors import Errors, Warnings
 from .strings import get_string_id

@@ -64,8 +64,9 @@ cdef class Vectors:
 cdef readonly uint32_t hash_seed
 cdef readonly unicode bow
 cdef readonly unicode eow
+cdef readonly attr_id_t attr

-def __init__(self, *, strings=None, shape=None, data=None, keys=None, name=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"):
+def __init__(self, *, strings=None, shape=None, data=None, keys=None, name=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">", attr="ORTH"):
 """Create a new vector store.

 strings (StringStore): The string store.
@@ -80,6 +81,8 @@ cdef class Vectors:
 hash_seed (int): The floret hash seed (default: 0).
 bow (str): The floret BOW string (default: "<").
 eow (str): The floret EOW string (default: ">").
+attr (Union[int, str]): The token attribute for the vector keys
+(default: "ORTH").

 DOCS: https://spacy.io/api/vectors#init
 """
@@ -103,10 +106,18 @@ cdef class Vectors:
 self.hash_seed = hash_seed
 self.bow = bow
 self.eow = eow
+if isinstance(attr, (int, long)):
+self.attr = attr
+else:
+attr = attr.upper()
+if attr == "TEXT":
+attr = "ORTH"
+self.attr = IDS.get(attr, ORTH)

 if self.mode == Mode.default:
 if data is None:
 if shape is None:
-shape = (0,0)
+shape = (0, 0)
 ops = get_current_ops()
 data = ops.xp.zeros(shape, dtype="f")
 self._unset = cppset[int]({i for i in range(data.shape[0])})
@@ -247,11 +258,10 @@ cdef class Vectors:
 def __eq__(self, other):
 # Check for equality, with faster checks first
 return (
 self.shape == other.shape
 and self.key2row == other.key2row
-and self.to_bytes(exclude=["strings"])
-== other.to_bytes(exclude=["strings"])
-)
+and self.to_bytes(exclude=["strings"]) == other.to_bytes(exclude=["strings"])
+)

 def resize(self, shape, inplace=False):
 """Resize the underlying vectors array. If inplace=True, the memory
@@ -507,11 +517,12 @@ cdef class Vectors:
 # vectors e.g. (10000, 300)
 # sims e.g. (1024, 10000)
 sims = xp.dot(batch, vectors.T)
-best_rows[i:i+batch_size] = xp.argpartition(sims, -n, axis=1)[:,-n:]
-scores[i:i+batch_size] = xp.partition(sims, -n, axis=1)[:,-n:]
+best_rows[i:i+batch_size] = xp.argpartition(sims, -n, axis=1)[:, -n:]
+scores[i:i+batch_size] = xp.partition(sims, -n, axis=1)[:, -n:]

 if sort and n >= 2:
-sorted_index = xp.arange(scores.shape[0])[:,None][i:i+batch_size],xp.argsort(scores[i:i+batch_size], axis=1)[:,::-1]
+sorted_index = xp.arange(scores.shape[0])[:, None][i:i+batch_size], \
+xp.argsort(scores[i:i+batch_size], axis=1)[:, ::-1]
 scores[i:i+batch_size] = scores[sorted_index]
 best_rows[i:i+batch_size] = best_rows[sorted_index]

@@ -525,8 +536,12 @@ cdef class Vectors:

 numpy_rows = get_current_ops().to_numpy(best_rows)
 keys = xp.asarray(
-[[row2key[row] for row in numpy_rows[i] if row in row2key]
-for i in range(len(queries)) ], dtype="uint64")
+[
+[row2key[row] for row in numpy_rows[i] if row in row2key]
+for i in range(len(queries))
+],
+dtype="uint64"
+)
 return (keys, best_rows, scores)

 def to_ops(self, ops: Ops):
@@ -546,6 +561,7 @@ cdef class Vectors:
 "hash_seed": self.hash_seed,
 "bow": self.bow,
 "eow": self.eow,
+"attr": self.attr,
 }

 def _set_cfg(self, cfg):
@@ -556,6 +572,7 @@ cdef class Vectors:
 self.hash_seed = cfg.get("hash_seed", 0)
 self.bow = cfg.get("bow", "<")
 self.eow = cfg.get("eow", ">")
+self.attr = cfg.get("attr", ORTH)

 def to_disk(self, path, *, exclude=tuple()):
 """Save the current state to a directory.
@@ -567,9 +584,9 @@ cdef class Vectors:
 """
 xp = get_array_module(self.data)
 if xp is numpy:
-save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False)
+save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False)  # no-cython-lint
 else:
-save_array = lambda arr, file_: xp.save(file_, arr)
+save_array = lambda arr, file_: xp.save(file_, arr)  # no-cython-lint

 def save_vectors(path):
 # the source of numpy.save indicates that the file object is closed after use.
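Not part of the commit: a rough sketch of how the new `attr` option on `Vectors` might be used, assuming spaCy v3.6+ with the changes above applied (the example words and dimensions are made up):

```python
# Keying a vectors table by LOWER instead of the default ORTH (sketch only).
import numpy
import spacy
from spacy.vectors import Vectors

nlp = spacy.blank("en")
data = numpy.asarray([[1.0, 0.0], [0.0, 1.0]], dtype="f")
# Keys are stored under their LOWER hashes because of attr="LOWER".
nlp.vocab.vectors = Vectors(
    strings=nlp.vocab.strings, data=data, keys=["apple", "bank"], attr="LOWER"
)
doc = nlp("Apple APPLE apple")
# All three tokens should resolve to the same row, since lookups now go
# through the LOWER attribute rather than the exact surface form.
print([token.vector_norm > 0 for token in doc])
```

The `Vectors(strings=..., data=..., keys=..., attr=...)` call mirrors the one added to `convert_vectors` earlier in this diff.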
@@ -32,7 +32,7 @@ cdef class Vocab:
 cdef public object writing_system
 cdef public object get_noun_chunks
 cdef readonly int length
 cdef public object _unused_object  # TODO remove in v4, see #9150
 cdef public object lex_attr_getters
 cdef public object cfg

@@ -1,6 +1,4 @@
 # cython: profile=True
-from libc.string cimport memcpy
-
 import functools

 import numpy
@@ -19,7 +17,6 @@ from .errors import Errors
 from .lang.lex_attrs import LEX_ATTRS, get_lang, is_stop
 from .lang.norm_exceptions import BASE_NORMS
 from .lookups import Lookups
-from .util import registry
 from .vectors import Mode as VectorsMode
 from .vectors import Vectors

@@ -51,9 +48,17 @@ cdef class Vocab:

 DOCS: https://spacy.io/api/vocab
 """
-def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None,
-oov_prob=-20., vectors_name=None, writing_system={},
-get_noun_chunks=None, **deprecated_kwargs):
+def __init__(
+self,
+lex_attr_getters=None,
+strings=tuple(),
+lookups=None,
+oov_prob=-20.,
+vectors_name=None,
+writing_system={},  # no-cython-lint
+get_noun_chunks=None,
+**deprecated_kwargs
+):
 """Create the vocabulary.

 lex_attr_getters (dict): A dictionary mapping attribute IDs to
@@ -150,7 +155,6 @@ cdef class Vocab:
 cdef LexemeC* lex
 cdef hash_t key = self.strings[string]
 lex = <LexemeC*>self._by_orth.get(key)
-cdef size_t addr
 if lex != NULL:
 assert lex.orth in self.strings
 if lex.orth != key:
@@ -183,7 +187,7 @@ cdef class Vocab:
 # of the doc ownership).
 # TODO: Change the C API so that the mem isn't passed in here.
 mem = self.mem
-#if len(string) < 3 or self.length < 10000:
+# if len(string) < 3 or self.length < 10000:
 #    mem = self.mem
 cdef bint is_oov = mem is not self.mem
 lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
@@ -365,8 +369,13 @@ cdef class Vocab:
 self[orth]
 # Make prob negative so it sorts by rank ascending
 # (key2row contains the rank)
-priority = [(-lex.prob, self.vectors.key2row[lex.orth], lex.orth)
-for lex in self if lex.orth in self.vectors.key2row]
+priority = []
+cdef Lexeme lex
+cdef attr_t value
+for lex in self:
+value = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
+if value in self.vectors.key2row:
+priority.append((-lex.prob, self.vectors.key2row[value], value))
 priority.sort()
 indices = xp.asarray([i for (prob, i, key) in priority], dtype="uint64")
 keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64")
@@ -399,8 +408,10 @@ cdef class Vocab:
 """
 if isinstance(orth, str):
 orth = self.strings.add(orth)
-if self.has_vector(orth):
-return self.vectors[orth]
+cdef Lexeme lex = self[orth]
+key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
+if self.has_vector(key):
+return self.vectors[key]
 xp = get_array_module(self.vectors.data)
 vectors = xp.zeros((self.vectors_length,), dtype="f")
 return vectors
@@ -416,15 +427,16 @@ cdef class Vocab:
 """
 if isinstance(orth, str):
 orth = self.strings.add(orth)
-if self.vectors.is_full and orth not in self.vectors:
+cdef Lexeme lex = self[orth]
+key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
+if self.vectors.is_full and key not in self.vectors:
 new_rows = max(100, int(self.vectors.shape[0]*1.3))
 if self.vectors.shape[1] == 0:
 width = vector.size
 else:
 width = self.vectors.shape[1]
 self.vectors.resize((new_rows, width))
-lex = self[orth]  # Add word to vocab if necessary
-row = self.vectors.add(orth, vector=vector)
+row = self.vectors.add(key, vector=vector)
 if row >= 0:
 lex.rank = row

@@ -439,7 +451,9 @@ cdef class Vocab:
 """
 if isinstance(orth, str):
 orth = self.strings.add(orth)
-return orth in self.vectors
+cdef Lexeme lex = self[orth]
+key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
+return key in self.vectors

 property lookups:
 def __get__(self):
@@ -453,7 +467,6 @@ cdef class Vocab:
 self.lookups.get_table("lexeme_norm"),
 )


 def to_disk(self, path, *, exclude=tuple()):
 """Save the current state to a directory.

@@ -466,7 +479,6 @@ cdef class Vocab:
 path = util.ensure_path(path)
 if not path.exists():
 path.mkdir()
-setters = ["strings", "vectors"]
 if "strings" not in exclude:
 self.strings.to_disk(path / "strings.json")
 if "vectors" not in exclude:
@@ -485,7 +497,6 @@ cdef class Vocab:
 DOCS: https://spacy.io/api/vocab#to_disk
 """
 path = util.ensure_path(path)
-getters = ["strings", "vectors"]
 if "strings" not in exclude:
 self.strings.from_disk(path / "strings.json")  # TODO: add exclude?
 if "vectors" not in exclude:
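Not part of the commit: a small sketch of the corresponding `Vocab` behavior, assuming the changes above — `set_vector`, `get_vector` and `has_vector` now resolve the key through `vectors.attr` (exact runtime behavior not verified here):

```python
# Vocab vector access keyed by the vectors table's attr (sketch only).
import numpy
import spacy
from spacy.vectors import Vectors

nlp = spacy.blank("en")
nlp.vocab.vectors = Vectors(strings=nlp.vocab.strings, shape=(10, 4), attr="LOWER")
nlp.vocab.set_vector("Cats", numpy.ones((4,), dtype="f"))
# The vector was stored under the LOWER key, so any casing of "cats" finds it.
print(nlp.vocab.has_vector("CATS"))
print(nlp.vocab.get_vector("cats"))
```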
@@ -303,7 +303,7 @@ mapped to a zero vector. See the documentation on
 | `nM` | The width of the static vectors. ~~Optional[int]~~ |
 | `dropout` | Optional dropout rate. If set, it's applied per dimension over the whole batch. Defaults to `None`. ~~Optional[float]~~ |
 | `init_W` | The [initialization function](https://thinc.ai/docs/api-initializers). Defaults to [`glorot_uniform_init`](https://thinc.ai/docs/api-initializers#glorot_uniform_init). ~~Callable[[Ops, Tuple[int, ...]]], FloatsXd]~~ |
-| `key_attr` | Defaults to `"ORTH"`. ~~str~~ |
+| `key_attr` | This setting is ignored in spaCy v3.6+. To set a custom key attribute for vectors, configure it through [`Vectors`](/api/vectors) or [`spacy init vectors`](/api/cli#init-vectors). Defaults to `"ORTH"`. ~~str~~ |
 | **CREATES** | The model using the architecture. ~~Model[List[Doc], Ragged]~~ |

 ### spacy.FeatureExtractor.v1 {id="FeatureExtractor"}
@@ -212,7 +212,8 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr
 | `output_dir` | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~ |
 | `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
 | `--prune`, `-p` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ |
-| `--mode`, `-m` | Vectors mode: `default` or [`floret`](https://github.com/explosion/floret). Defaults to `default`. ~~Optional[str] \(option)~~ |
+| `--mode`, `-m` | Vectors mode: `default` or [`floret`](https://github.com/explosion/floret). Defaults to `default`. ~~str \(option)~~ |
+| `--attr`, `-a` | Token attribute to use for vectors, e.g. `LOWER` or `NORM`) Defaults to `ORTH`. ~~str \(option)~~ |
 | `--name`, `-n` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~ |
 | `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ |
 | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
@@ -856,7 +856,7 @@ token-to-vector embedding component like [`Tok2Vec`](/api/tok2vec) or
 training a pipeline with components sourced from an existing pipeline: if
 multiple components (e.g. tagger, parser, NER) listen to the same
 token-to-vector component, but some of them are frozen and not updated, their
-performance may degrade significally as the token-to-vector component is updated
+performance may degrade significantly as the token-to-vector component is updated
 with new data. To prevent this, listeners can be replaced with a standalone
 token-to-vector layer that is owned by the component and doesn't change if the
 component isn't updated.

 website/docs/api/large-language-models.mdx (new file, 1488 lines)
 File diff suppressed because it is too large
@@ -67,7 +67,6 @@ architectures and their arguments and hyperparameters.
 > ```python
 > from spacy.pipeline.spancat import DEFAULT_SPANCAT_SINGLELABEL_MODEL
 > config = {
-> "threshold": 0.5,
 > "spans_key": "labeled_spans",
 > "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL,
 > "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
@@ -60,7 +60,7 @@ architectures and their arguments and hyperparameters.
 | `model` | A model instance that is given a list of documents and predicts a probability for each token. ~~Model[List[Doc], Floats2d]~~ |
 | `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
 | `threshold` | Minimum probability to consider a prediction positive. Defaults to `0.5`. ~~float~~ |
-| `max_length` | Maximum length of the produced spans, defaults to `None` meaning unlimited length. ~~Optional[int]~~ |
+| `max_length` | Maximum length of the produced spans, defaults to `25`. ~~Optional[int]~~ |
 | `min_length` | Minimum length of the produced spans, defaults to `None` meaning shortest span length is 1. ~~Optional[int]~~ |
 | `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ |

@@ -60,6 +60,7 @@ modified later.
 | `hash_seed` <Tag variant="new">3.2</Tag> | The floret hash seed (default: `0`). ~~int~~ |
 | `bow` <Tag variant="new">3.2</Tag> | The floret BOW string (default: `"<"`). ~~str~~ |
 | `eow` <Tag variant="new">3.2</Tag> | The floret EOW string (default: `">"`). ~~str~~ |
+| `attr` <Tag variant="new">3.6</Tag> | The token attribute for the vector keys (default: `"ORTH"`). ~~Union[int, str]~~ |

 ## Vectors.\_\_getitem\_\_ {id="getitem",tag="method"}

@@ -453,8 +454,9 @@ Load state from a binary string.

 ## Attributes {id="attributes"}

 | Name | Description |
-| --------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ----------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `data` | Stored vectors data. `numpy` is used for CPU vectors, `cupy` for GPU vectors. ~~Union[numpy.ndarray[ndim=1, dtype=float32], cupy.ndarray[ndim=1, dtype=float32]]~~ |
 | `key2row` | Dictionary mapping word hashes to rows in the `Vectors.data` table. ~~Dict[int, int]~~ |
 | `keys` | Array keeping the keys in order, such that `keys[vectors.key2row[key]] == key`. ~~Union[numpy.ndarray[ndim=1, dtype=float32], cupy.ndarray[ndim=1, dtype=float32]]~~ |
+| `attr` <Tag variant="new">3.6</Tag> | The token attribute for the vector keys. ~~int~~ |
@@ -261,7 +261,7 @@ source code and recompiling frequently.

 #### Visual Studio Code extension

 

 The [spaCy VSCode Extension](https://github.com/explosion/spacy-vscode) provides
 additional tooling and features for working with spaCy's config files. Version

@@ -310,7 +310,7 @@ You can configure the build process with the following environment variables:

 | Variable | Description |
 | -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `SPACY_EXTRAS` | Additional Python packages to install alongside spaCy with optional version specifications. Should be a string that can be passed to `pip install`. See [`Makefile`](%%GITHUB_SPACY/Makefile) for defaults. |
-| `PYVER` | The Python version to build against. This version needs to be available on your build and runtime machines. Defaults to `3.6`. |
+| `PYVER` | The Python version to build against. This version needs to be available on your build and runtime machines. Defaults to `3.8`. |
 | `WHEELHOUSE` | Directory to store the wheel files during compilation. Defaults to `./wheelhouse`. |

 ### Run tests {id="run-tests"}
512  website/docs/usage/large-language-models.mdx  (new file)

@@ -0,0 +1,512 @@
---
title: Large Language Models
teaser: Integrating LLMs into structured NLP pipelines
menu:
  - ['Motivation', 'motivation']
  - ['Install', 'install']
  - ['Usage', 'usage']
  - ['Logging', 'logging']
  - ['API', 'api']
  - ['Tasks', 'tasks']
  - ['Models', 'models']
---

[The spacy-llm package](https://github.com/explosion/spacy-llm) integrates Large
Language Models (LLMs) into spaCy pipelines, featuring a modular system for
**fast prototyping** and **prompting**, and turning unstructured responses into
**robust outputs** for various NLP tasks, **no training data** required.

- Serializable `llm` **component** to integrate prompts into your pipeline
- **Modular functions** to define the [**task**](#tasks) (prompting and parsing)
  and [**model**](#models) (model to use)
- Support for **hosted APIs** and self-hosted **open-source models**
- Integration with [`LangChain`](https://github.com/hwchase17/langchain)
- Access to
  **[OpenAI API](https://platform.openai.com/docs/api-reference/introduction)**,
  including GPT-4 and various GPT-3 models
- Built-in support for various **open-source** models hosted on
  [Hugging Face](https://huggingface.co/)
- Usage examples for standard NLP tasks such as **Named Entity Recognition** and
  **Text Classification**
- Easy implementation of **your own functions** via the
  [registry](/api/top-level#registry) for custom prompting, parsing and model
  integrations

## Motivation {id="motivation"}

Large Language Models (LLMs) feature powerful natural language understanding
capabilities. With only a few (and sometimes no) examples, an LLM can be
prompted to perform custom NLP tasks such as text categorization, named entity
recognition, coreference resolution, information extraction and more.

Supervised learning is much worse than LLM prompting for prototyping, but for
many tasks it's much better for production. A transformer model that runs
comfortably on a single GPU is extremely powerful, and it's likely to be a
better choice for any task for which you have a well-defined output. You train
the model with anything from a few hundred to a few thousand labelled examples,
and it will learn to do exactly that. Efficiency, reliability and control are
all better with supervised learning, and accuracy will generally be higher than
LLM prompting as well.

`spacy-llm` lets you have **the best of both worlds**. You can quickly
initialize a pipeline with components powered by LLM prompts, and freely mix in
components powered by other approaches. As your project progresses, you can look
at replacing some or all of the LLM-powered components as you require.

Of course, there can be components in your system for which the power of an LLM
is fully justified. If you want a system that can synthesize information from
multiple documents in subtle ways and generate a nuanced summary for you, bigger
is better. However, even if your production system needs an LLM for some of the
task, that doesn't mean you need an LLM for all of it. Maybe you want to use a
cheap text classification model to help you find the texts to summarize, or
maybe you want to add a rule-based system to sanity check the output of the
summary. These before-and-after tasks are much easier with a mature and
well-thought-out library, which is exactly what spaCy provides.

## Install {id="install"}

`spacy-llm` will be installed automatically in future spaCy versions. For now,
you can run the following in the same virtual environment where you already have
`spacy` [installed](/usage).

> ⚠️ This package is still experimental and it is possible that changes made to
> the interface will be breaking in minor version updates.

```bash
python -m pip install spacy-llm
```
## Usage {id="usage"}

The task and the model have to be supplied to the `llm` pipeline component using
the [config system](/api/data-formats#config). This package provides various
built-in functionality, as detailed in the [API](#-api) documentation.

### Example 1: Add a text classifier using a GPT-3 model from OpenAI {id="example-1"}

Create a new API key from openai.com or fetch an existing one, and ensure the
keys are set as environment variables. For more background information, see
the [OpenAI](/api/large-language-models#gpt-3-5) section.

Create a config file `config.cfg` containing at least the following (or see the
full example
[here](https://github.com/explosion/spacy-llm/tree/main/usage_examples/textcat_openai)):

```ini
[nlp]
lang = "en"
pipeline = ["llm"]

[components]

[components.llm]
factory = "llm"

[components.llm.task]
@llm_tasks = "spacy.TextCat.v2"
labels = ["COMPLIMENT", "INSULT"]

[components.llm.model]
@llm_models = "spacy.GPT-3-5.v1"
config = {"temperature": 0.3}
```

Now run:

```python
from spacy_llm.util import assemble

nlp = assemble("config.cfg")
doc = nlp("You look gorgeous!")
print(doc.cats)
```
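If you prefer to keep the key handling in Python during quick experiments, a
minimal sketch could look like the following. The variable names
`OPENAI_API_KEY` and the optional `OPENAI_API_ORG` are the ones commonly used
for OpenAI access; treat them as an assumption and check the OpenAI section
linked above if your setup differs.

```python
import os

from spacy_llm.util import assemble

# Set the credentials before assembling the pipeline. Never hard-code real keys
# outside of throwaway experiments; prefer exporting them in your shell instead.
os.environ["OPENAI_API_KEY"] = "sk-..."
os.environ["OPENAI_API_ORG"] = "org-..."  # optional, only if you use organizations

nlp = assemble("config.cfg")
```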
### Example 2: Add NER using an open-source model through Hugging Face {id="example-2"}

To run this example, ensure that you have a GPU enabled, and `transformers`,
`torch` and CUDA installed. For more background information, see the
[DollyHF](/api/large-language-models#dolly) section.

Create a config file `config.cfg` containing at least the following (or see the
full example
[here](https://github.com/explosion/spacy-llm/tree/main/usage_examples/ner_dolly)):

```ini
[nlp]
lang = "en"
pipeline = ["llm"]

[components]

[components.llm]
factory = "llm"

[components.llm.task]
@llm_tasks = "spacy.NER.v2"
labels = ["PERSON", "ORGANISATION", "LOCATION"]

[components.llm.model]
@llm_models = "spacy.Dolly.v1"
# For better performance, use dolly-v2-12b instead
name = "dolly-v2-3b"
```

Now run:

```python
from spacy_llm.util import assemble

nlp = assemble("config.cfg")
doc = nlp("Jack and Jill rode up the hill in Les Deux Alpes")
print([(ent.text, ent.label_) for ent in doc.ents])
```

Note that Hugging Face will download the `"databricks/dolly-v2-3b"` model the
first time you use it. You can
[define the cached directory](https://huggingface.co/docs/huggingface_hub/main/en/guides/manage-cache)
by setting the environment variable `HF_HOME`. Also, you can upgrade the model
to `"databricks/dolly-v2-12b"` for better performance.

### Example 3: Create the component directly in Python {id="example-3"}

The `llm` component behaves as any other component does, so adding it to an
existing pipeline follows the same pattern:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe(
    "llm",
    config={
        "task": {
            "@llm_tasks": "spacy.NER.v2",
            "labels": ["PERSON", "ORGANISATION", "LOCATION"]
        },
        "model": {
            "@llm_models": "spacy.gpt-3.5.v1",
        },
    },
)
nlp.initialize()
doc = nlp("Jack and Jill rode up the hill in Les Deux Alpes")
print([(ent.text, ent.label_) for ent in doc.ents])
```

Note that for efficient usage of resources, typically you would use
[`nlp.pipe(docs)`](/api/language#pipe) with a batch, instead of calling
`nlp(doc)` with a single document.
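As a small illustration of that advice, here is a sketch that reuses the
`config.cfg` from Example 2 and processes a handful of made-up texts in one
batch:

```python
from spacy_llm.util import assemble

nlp = assemble("config.cfg")

# nlp.pipe() batches documents, so the llm component can group prompts instead
# of issuing one call per document.
texts = [
    "Jack and Jill rode up the hill in Les Deux Alpes",
    "Humpty Dumpty sat on a wall in Copenhagen",
]
for doc in nlp.pipe(texts):
    print([(ent.text, ent.label_) for ent in doc.ents])
```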
### Example 4: Implement your own custom task {id="example-4"}

To write a [`task`](#tasks), you need to implement two functions:
`generate_prompts` that takes a list of [`Doc`](/api/doc) objects and transforms
them into a list of prompts, and `parse_responses` that transforms the LLM
outputs into annotations on the [`Doc`](/api/doc), e.g. entity spans, text
categories and more.

To register your custom task, decorate a factory function using the
`spacy_llm.registry.llm_tasks` decorator with a custom name that you can refer
to in your config.

> 📖 For more details, see the
> [**usage example on writing your own task**](https://github.com/explosion/spacy-llm/tree/main/usage_examples#writing-your-own-task)

```python
from typing import Iterable, List
from spacy.tokens import Doc
from spacy_llm.registry import registry
from spacy_llm.util import split_labels


@registry.llm_tasks("my_namespace.MyTask.v1")
def make_my_task(labels: str, my_other_config_val: float) -> "MyTask":
    labels_list = split_labels(labels)
    return MyTask(labels=labels_list, my_other_config_val=my_other_config_val)


class MyTask:
    def __init__(self, labels: List[str], my_other_config_val: float):
        ...

    def generate_prompts(self, docs: Iterable[Doc]) -> Iterable[str]:
        ...

    def parse_responses(
        self, docs: Iterable[Doc], responses: Iterable[str]
    ) -> Iterable[Doc]:
        ...
```

```ini
# config.cfg (excerpt)
[components.llm.task]
@llm_tasks = "my_namespace.MyTask.v1"
labels = LABEL1,LABEL2,LABEL3
my_other_config_val = 0.3
```
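To make the `...` placeholders above a little more concrete, here is one
possible way the two methods could be filled in for a toy classification task.
The prompt wording and the use of `doc.cats` are assumptions made for this
sketch, not part of the required interface:

```python
from typing import Iterable, List

from spacy.tokens import Doc


class MyTask:
    def __init__(self, labels: List[str], my_other_config_val: float):
        self._labels = labels
        self._my_other_config_val = my_other_config_val

    def generate_prompts(self, docs: Iterable[Doc]) -> Iterable[str]:
        # One prompt per document, asking for exactly one of the configured labels.
        for doc in docs:
            yield (
                f"Classify the text as one of: {', '.join(self._labels)}.\n"
                f"Text: {doc.text}\n"
                "Answer with the label only."
            )

    def parse_responses(
        self, docs: Iterable[Doc], responses: Iterable[str]
    ) -> Iterable[Doc]:
        for doc, response in zip(docs, responses):
            answer = response.strip()
            if answer in self._labels:
                # Store the prediction as exclusive text categories.
                doc.cats = {label: float(label == answer) for label in self._labels}
            yield doc
```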
## Logging {id="logging"}

spacy-llm has a built-in logger that can log the prompt sent to the LLM as well
as its raw response. This logger uses the debug level and by default has a
`logging.NullHandler()` configured.

In order to use this logger, you can set up a simple handler like this:

```python
import logging
import spacy_llm


spacy_llm.logger.addHandler(logging.StreamHandler())
spacy_llm.logger.setLevel(logging.DEBUG)
```

> NOTE: Any `logging` handler will work here, so you probably want to use some
> sort of rotating `FileHandler` as the generated prompts can be quite long,
> especially for tasks with few-shot examples.
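For example, a rotating handler could be set up like this (file name and size
limits are arbitrary choices for the sketch):

```python
import logging
from logging.handlers import RotatingFileHandler

import spacy_llm

# Keep at most 3 backup files of ~5 MB each so long prompts don't fill the disk.
handler = RotatingFileHandler("spacy_llm_io.log", maxBytes=5_000_000, backupCount=3)
spacy_llm.logger.addHandler(handler)
spacy_llm.logger.setLevel(logging.DEBUG)
```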
Then when using the pipeline you'll be able to view the prompt and response.

E.g. with the config and code from [Example 1](#example-1) above:

```python
from spacy_llm.util import assemble


nlp = assemble("config.cfg")
doc = nlp("You look gorgeous!")
print(doc.cats)
```

You will see `logging` output similar to:

```
Generated prompt for doc: You look gorgeous!

You are an expert Text Classification system. Your task is to accept Text as input
and provide a category for the text based on the predefined labels.

Classify the text below to any of the following labels: COMPLIMENT, INSULT
The task is non-exclusive, so you can provide more than one label as long as
they're comma-delimited. For example: Label1, Label2, Label3.
Do not put any other text in your answer, only one or more of the provided labels with nothing before or after.
If the text cannot be classified into any of the provided labels, answer `==NONE==`.

Here is the text that needs classification


Text:
'''
You look gorgeous!
'''

Model response for doc: You look gorgeous!
COMPLIMENT
```

`print(doc.cats)` to standard output should look like:

```
{'COMPLIMENT': 1.0, 'INSULT': 0.0}
```
## API {id="api"}

`spacy-llm` exposes an `llm` factory with
[configurable settings](/api/large-language-models#config).

An `llm` component is defined by two main settings:

- A [**task**](#tasks), defining the prompt to send to the LLM as well as the
  functionality to parse the resulting response back into structured fields on
  the [Doc](/api/doc) objects.
- A [**model**](#models) defining the model to use and how to connect to it.
  Note that `spacy-llm` supports both access to external APIs (such as OpenAI)
  and access to self-hosted open-source LLMs (such as using Dolly through
  Hugging Face).

Moreover, `spacy-llm` exposes a customizable [**caching**](#cache) functionality
to avoid running the same document through an LLM service (be it local or
through a REST API) more than once.

Finally, you can choose to save a stringified version of LLM prompts/responses
within the `Doc.user_data["llm_io"]` attribute by setting `save_io` to `True`.
`Doc.user_data["llm_io"]` is a dictionary containing one entry for every LLM
component within the `nlp` pipeline. Each entry is itself a dictionary, with two
keys: `prompt` and `response`.
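As a minimal sketch, assuming the component is called `llm` (as in the examples
above) and `save_io` has been enabled in its config block, the stored prompts
and responses can be read back like this:

```python
from spacy_llm.util import assemble

nlp = assemble("config.cfg")
doc = nlp("You look gorgeous!")

# One entry per LLM component in the pipeline, each with "prompt" and "response".
llm_io = doc.user_data["llm_io"]["llm"]
print(llm_io["prompt"])
print(llm_io["response"])
```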
A note on `validate_types`: by default, `spacy-llm` checks whether the
signatures of the `model` and `task` callables are consistent with each other
and emits a warning if they aren't. `validate_types` can be set to `False` if
you want to disable this behavior.
### Tasks {id="tasks"}

A _task_ defines an NLP problem or question that will be sent to the LLM via a
prompt. Further, the task defines how to parse the LLM's responses back into
structured information. All tasks are registered in the `llm_tasks` registry.

Practically speaking, a task should adhere to the `Protocol` `LLMTask` defined
in [`ty.py`](https://github.com/explosion/spacy-llm/blob/main/spacy_llm/ty.py).
It needs to define a `generate_prompts` function and a `parse_responses`
function.

| Task | Description |
| --- | --- |
| [`task.generate_prompts`](/api/large-language-models#task-generate-prompts) | Takes a collection of documents, and returns a collection of "prompts", which can be of type `Any`. |
| [`task.parse_responses`](/api/large-language-models#task-parse-responses) | Takes a collection of LLM responses and the original documents, parses the responses into structured information, and sets the annotations on the documents. |

Moreover, the task may define an optional [`scorer` method](/api/scorer#score).
It should accept an iterable of `Example`s as input and return a score
dictionary. If the `scorer` method is defined, `spacy-llm` will call it to
evaluate the component.
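For instance, a custom textcat-style task could delegate to spaCy's built-in
scoring helpers; the use of `Scorer.score_cats` below is just one reasonable
choice for such a task, not something `spacy-llm` prescribes:

```python
from typing import Any, Dict, Iterable, List

from spacy.scorer import Scorer
from spacy.training import Example


class MyTask:
    def __init__(self, labels: List[str]):
        self._labels = labels

    def scorer(self, examples: Iterable[Example]) -> Dict[str, Any]:
        # Score the doc.cats set by parse_responses against the gold annotations.
        return Scorer.score_cats(
            examples, attr="cats", labels=self._labels, multi_label=False
        )
```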
| Component | Description |
| --- | --- |
| [`spacy.Summarization.v1`](/api/large-language-models#summarization-v1) | The summarization task prompts the model for a concise summary of the provided text. |
| [`spacy.NER.v2`](/api/large-language-models#ner-v2) | The built-in NER task supports both zero-shot and few-shot prompting. This version also supports explicitly defining the provided labels with custom descriptions. |
| [`spacy.NER.v1`](/api/large-language-models#ner-v1) | The original version of the built-in NER task supports both zero-shot and few-shot prompting. |
| [`spacy.SpanCat.v2`](/api/large-language-models#spancat-v2) | The built-in SpanCat task is a simple adaptation of the NER task to support overlapping entities and store its annotations in `doc.spans`. |
| [`spacy.SpanCat.v1`](/api/large-language-models#spancat-v1) | The original version of the built-in SpanCat task is a simple adaptation of the v1 NER task to support overlapping entities and store its annotations in `doc.spans`. |
| [`spacy.TextCat.v3`](/api/large-language-models#textcat-v3) | Version 3 (the most recent) of the built-in TextCat task supports both zero-shot and few-shot prompting. It allows setting definitions of labels. |
| [`spacy.TextCat.v2`](/api/large-language-models#textcat-v2) | Version 2 of the built-in TextCat task supports both zero-shot and few-shot prompting and includes an improved prompt template. |
| [`spacy.TextCat.v1`](/api/large-language-models#textcat-v1) | Version 1 of the built-in TextCat task supports both zero-shot and few-shot prompting. |
| [`spacy.REL.v1`](/api/large-language-models#rel-v1) | The built-in REL task supports both zero-shot and few-shot prompting. It relies on an upstream NER component for entity extraction. |
| [`spacy.Lemma.v1`](/api/large-language-models#lemma-v1) | The `Lemma.v1` task lemmatizes the provided text and updates the `lemma_` attribute in the doc's tokens accordingly. |
| [`spacy.Sentiment.v1`](/api/large-language-models#sentiment-v1) | Performs sentiment analysis on provided texts. |
| [`spacy.NoOp.v1`](/api/large-language-models#noop-v1) | This task is only useful for testing - it tells the LLM to do nothing, and does not set any fields on the `docs`. |

#### Providing examples for few-shot prompts {id="few-shot-prompts"}

All built-in tasks support few-shot prompts, i.e. including examples in a
prompt. Examples can be supplied in two ways: (1) as a separate file containing
only examples or (2) by initializing `llm` with a `get_examples()` callback
(like any other pipeline component).

##### (1) Few-shot example file

A file containing examples for few-shot prompting can be configured like this:

```ini
[components.llm.task]
@llm_tasks = "spacy.NER.v2"
labels = PERSON,ORGANISATION,LOCATION
[components.llm.task.examples]
@misc = "spacy.FewShotReader.v1"
path = "ner_examples.yml"
```

The supplied file has to conform to the format expected by the required task
(see the task documentation further down).

##### (2) Initializing the `llm` component with a `get_examples()` callback

Alternatively, you can initialize your `nlp` pipeline by providing a
`get_examples` callback for [`nlp.initialize`](/api/language#initialize) and
setting `n_prompt_examples` to a positive number to automatically fetch a few
examples for few-shot learning. Set `n_prompt_examples` to `-1` to use all
examples as part of the few-shot learning prompt.

```ini
[initialize.components.llm]
n_prompt_examples = 3
```
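In code, one way to provide such a callback yourself is a minimal sketch like
the one below; the corpus path is a placeholder, and `n_prompt_examples` is
read from the config block shown above:

```python
from spacy.training import Corpus
from spacy_llm.util import assemble

nlp = assemble("config.cfg")
# Annotated examples in spaCy's binary .spacy format (placeholder path).
corpus = Corpus("corpus/train.spacy")
# The callback yields Example objects; the llm component copies a few of them
# into its few-shot prompt according to n_prompt_examples.
nlp.initialize(lambda: corpus(nlp))
```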
### Model {id="models"}

A _model_ defines which LLM model to query, and how to query it. It can be a
simple function taking a collection of prompts (consistent with the output type
of `task.generate_prompts()`) and returning a collection of responses
(consistent with the expected input of `parse_responses`). Generally speaking,
it's a function of type `Callable[[Iterable[Any]], Iterable[Any]]`, but specific
implementations can have other signatures, like
`Callable[[Iterable[str]], Iterable[str]]`.

All built-in models are registered in `llm_models`. If no model is specified,
the repo currently connects to the `OpenAI` API by default using REST, and
accesses the `"gpt-3.5-turbo"` model.
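For a custom model, the same registry pattern as for tasks applies. The sketch
below assumes that the `llm_models` registry can be used through the same
`registry` object as in Example 4, and it simply echoes a truncated prompt
instead of calling a real LLM:

```python
from typing import Iterable

from spacy_llm.registry import registry


@registry.llm_models("my_namespace.EchoModel.v1")
def make_echo_model():
    def _call(prompts: Iterable[str]) -> Iterable[str]:
        # A real implementation would send each prompt to an LLM and yield the
        # raw responses in the same order as the incoming prompts.
        for prompt in prompts:
            yield prompt[:100]

    return _call
```

In the config, such a model would then be referenced as
`@llm_models = "my_namespace.EchoModel.v1"`.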
Currently, three different approaches to using LLMs are supported:

1. `spacy-llm`'s native REST interface. This is the default for all hosted
   models (e.g. OpenAI, Cohere, Anthropic, ...).
2. A HuggingFace integration that allows you to run a limited set of HF models
   locally.
3. A LangChain integration that allows you to run any model supported by
   LangChain (hosted or local).

Approaches 1 and 2 are the defaults for hosted models and local models,
respectively. Alternatively, you can use LangChain to access hosted or local
models by specifying one of the models registered with the `langchain.` prefix.

<Infobox>
_Why LangChain if there are also a native REST and a HuggingFace interface? When should I use what?_

Third-party libraries like `langchain` focus on prompt management, integration
of many different LLM APIs, and other related features such as conversational
memory or agents. `spacy-llm` on the other hand emphasizes features we consider
useful in the context of NLP pipelines utilizing LLMs to process documents
(mostly) independent from each other. It makes sense that the feature sets of
such third-party libraries and `spacy-llm` aren't identical - and users might
want to take advantage of features not available in `spacy-llm`.

The advantage of implementing our own REST and HuggingFace integrations is that
we can ensure a larger degree of stability and robustness, as we can guarantee
backwards-compatibility and more smoothly integrated error handling.

If however there are features or APIs not natively covered by `spacy-llm`, it's
trivial to utilize LangChain to cover this - and easy to customize the prompting
mechanism, if so required.

</Infobox>

<Infobox variant="warning">
Note that when using hosted services, you have to ensure that the [proper API
keys](/api/large-language-models#api-keys) are set as environment variables as
described by the corresponding provider's documentation.

</Infobox>

| Component | Description |
| --- | --- |
| [`spacy.GPT-4.v1`](/api/large-language-models#gpt-4) | OpenAI’s `gpt-4` model family. |
| [`spacy.GPT-3-5.v1`](/api/large-language-models#gpt-3-5) | OpenAI’s `gpt-3-5` model family. |
| [`spacy.Text-Davinci.v1`](/api/large-language-models#text-davinci) | OpenAI’s `text-davinci` model family. |
| [`spacy.Code-Davinci.v1`](/api/large-language-models#code-davinci) | OpenAI’s `code-davinci` model family. |
| [`spacy.Text-Curie.v1`](/api/large-language-models#text-curie) | OpenAI’s `text-curie` model family. |
| [`spacy.Text-Babbage.v1`](/api/large-language-models#text-babbage) | OpenAI’s `text-babbage` model family. |
| [`spacy.Text-Ada.v1`](/api/large-language-models#text-ada) | OpenAI’s `text-ada` model family. |
| [`spacy.Davinci.v1`](/api/large-language-models#davinci) | OpenAI’s `davinci` model family. |
| [`spacy.Curie.v1`](/api/large-language-models#curie) | OpenAI’s `curie` model family. |
| [`spacy.Babbage.v1`](/api/large-language-models#babbage) | OpenAI’s `babbage` model family. |
| [`spacy.Ada.v1`](/api/large-language-models#ada) | OpenAI’s `ada` model family. |
| [`spacy.Command.v1`](/api/large-language-models#command) | Cohere’s `command` model family. |
| [`spacy.Claude-1.v1`](/api/large-language-models#claude-1) | Anthropic’s `claude-1` model family. |
| [`spacy.Claude-instant-1.v1`](/api/large-language-models#claude-instant-1) | Anthropic’s `claude-instant-1` model family. |
| [`spacy.Claude-instant-1-1.v1`](/api/large-language-models#claude-instant-1-1) | Anthropic’s `claude-instant-1.1` model family. |
| [`spacy.Claude-1-0.v1`](/api/large-language-models#claude-1-0) | Anthropic’s `claude-1.0` model family. |
| [`spacy.Claude-1-2.v1`](/api/large-language-models#claude-1-2) | Anthropic’s `claude-1.2` model family. |
| [`spacy.Claude-1-3.v1`](/api/large-language-models#claude-1-3) | Anthropic’s `claude-1.3` model family. |
| [`spacy.Dolly.v1`](/api/large-language-models#dolly) | Dolly models through [Databricks](https://huggingface.co/databricks) on HuggingFace. |
| [`spacy.Falcon.v1`](/api/large-language-models#falcon) | Falcon model through HuggingFace. |
| [`spacy.StableLM.v1`](/api/large-language-models#stablelm) | StableLM model through HuggingFace. |
| [`spacy.OpenLLaMA.v1`](/api/large-language-models#openllama) | OpenLLaMA model through HuggingFace. |
| [LangChain models](/api/large-language-models#langchain-models) | LangChain models for API retrieval. |

### Cache {id="cache"}

Interacting with LLMs, either through an external API or a local instance, is
costly. Since developing an NLP pipeline generally means a lot of exploration
and prototyping, `spacy-llm` implements a built-in
[cache](/api/large-language-models#cache) that keeps batches of documents
stored on disk, so the same documents don't have to be reprocessed at each run.

### Various functions {id="various-functions"}

| Component | Description |
| --- | --- |
| [`spacy.FewShotReader.v1`](/api/large-language-models#fewshotreader-v1) | This function is registered in spaCy's `misc` registry, and reads in examples from a `.yml`, `.yaml`, `.json` or `.jsonl` file. It uses [`srsly`](https://github.com/explosion/srsly) to read in these files and parses them depending on the file extension. |
| [`spacy.FileReader.v1`](/api/large-language-models#filereader-v1) | This function is registered in spaCy's `misc` registry, and reads a file provided to the `path` to return a `str` representation of its contents. This function is typically used to read [Jinja](https://jinja.palletsprojects.com/en/3.1.x/) files containing the prompt template. |
| [Normalizer functions](/api/large-language-models#normalizer-functions) | These functions provide simple normalizations for string comparisons, e.g. between a list of specified labels and a label given in the raw text of the LLM response. |
@@ -113,7 +113,7 @@ print(doc[2].morph) # 'Case=Nom|Person=2|PronType=Prs'
 print(doc[2].pos_) # 'PRON'
 ```

-## Lemmatization {id="lemmatization",model="lemmatizer",version="3"}
+## Lemmatization {id="lemmatization",version="3"}

 spaCy provides two pipeline components for lemmatization:

@@ -170,7 +170,7 @@ nlp = spacy.blank("sv")
 nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
 ```

-### Rule-based lemmatizer {id="lemmatizer-rule"}
+### Rule-based lemmatizer {id="lemmatizer-rule",model="morphologizer"}

 When training pipelines that include a component that assigns part-of-speech
 tags (a morphologizer or a tagger with a [POS mapping](#mappings-exceptions)), a

@@ -194,7 +194,7 @@ information, without consulting the context of the token. The rule-based
 lemmatizer also accepts list-based exception files. For English, these are
 acquired from [WordNet](https://wordnet.princeton.edu/).

-### Trainable lemmatizer
+### Trainable lemmatizer {id="lemmatizer-train",model="trainable_lemmatizer"}

 The [`EditTreeLemmatizer`](/api/edittreelemmatizer) can learn form-to-lemma
 transformations from a training corpus that includes lemma annotations. This
@@ -11,7 +11,6 @@ menu:
   - ['Custom Functions', 'custom-functions']
   - ['Initialization', 'initialization']
   - ['Data Utilities', 'data']
-  - ['Parallel Training', 'parallel-training']
   - ['Internal API', 'api']
 ---

@@ -1565,77 +1564,6 @@ token-based annotations like the dependency parse or entity labels, you'll need
 to take care to adjust the `Example` object so its annotations match and remain
 valid.

-## Parallel & distributed training with Ray {id="parallel-training"}
-
-> #### Installation
->
-> ```bash
-> $ pip install -U %%SPACY_PKG_NAME[ray]%%SPACY_PKG_FLAGS
-> # Check that the CLI is registered
-> $ python -m spacy ray --help
-> ```
-
-[Ray](https://ray.io/) is a fast and simple framework for building and running
-**distributed applications**. You can use Ray to train spaCy on one or more
-remote machines, potentially speeding up your training process. Parallel
-training won't always be faster though – it depends on your batch size, models,
-and hardware.
-
-<Infobox variant="warning">
-
-To use Ray with spaCy, you need the
-[`spacy-ray`](https://github.com/explosion/spacy-ray) package installed.
-Installing the package will automatically add the `ray` command to the spaCy
-CLI.
-
-</Infobox>
-
-The [`spacy ray train`](/api/cli#ray-train) command follows the same API as
-[`spacy train`](/api/cli#train), with a few extra options to configure the Ray
-setup. You can optionally set the `--address` option to point to your Ray
-cluster. If it's not set, Ray will run locally.
-
-```bash
-python -m spacy ray train config.cfg --n-workers 2
-```
-
-<Project id="integrations/ray">
-
-Get started with parallel training using our project template. It trains a
-simple model on a Universal Dependencies Treebank and lets you parallelize the
-training with Ray.
-
-</Project>
-
-### How parallel training works {id="parallel-training-details"}
-
-Each worker receives a shard of the **data** and builds a copy of the **model
-and optimizer** from the [`config.cfg`](#config). It also has a communication
-channel to **pass gradients and parameters** to the other workers. Additionally,
-each worker is given ownership of a subset of the parameter arrays. Every
-parameter array is owned by exactly one worker, and the workers are given a
-mapping so they know which worker owns which parameter.
-
-
-
-As training proceeds, every worker will be computing gradients for **all** of
-the model parameters. When they compute gradients for parameters they don't own,
-they'll **send them to the worker** that does own that parameter, along with a
-version identifier so that the owner can decide whether to discard the gradient.
-Workers use the gradients they receive and the ones they compute locally to
-update the parameters they own, and then broadcast the updated array and a new
-version ID to the other workers.
-
-This training procedure is **asynchronous** and **non-blocking**. Workers always
-push their gradient increments and parameter updates, they do not have to pull
-them and block on the result, so the transfers can happen in the background,
-overlapped with the actual training work. The workers also do not have to stop
-and wait for each other ("synchronize") at the start of each batch. This is very
-useful for spaCy, because spaCy is often trained on long documents, which means
-**batches can vary in size** significantly. Uneven workloads make synchronous
-gradient descent inefficient, because if one batch is slow, all of the other
-workers are stuck waiting for it to complete before they can continue.
-
 ## Internal training API {id="api"}

 <Infobox variant="danger">
143  website/docs/usage/v3-6.mdx  (new file)

@@ -0,0 +1,143 @@
---
title: What's New in v3.6
teaser: New features and how to upgrade
menu:
  - ['New Features', 'features']
  - ['Upgrading Notes', 'upgrading']
---

## New features {id="features",hidden="true"}

spaCy v3.6 adds the new [`SpanFinder`](/api/spanfinder) component to the core
spaCy library and new trained pipelines for Slovenian.

### SpanFinder {id="spanfinder"}

The [`SpanFinder`](/api/spanfinder) component identifies potentially
overlapping, unlabeled spans by identifying span start and end tokens. It is
intended for use in combination with a component like
[`SpanCategorizer`](/api/spancategorizer) that may further filter or label the
spans. See our
[Spancat blog post](https://explosion.ai/blog/spancat#span-finder) for a more
detailed introduction to the span finder.

To train a pipeline with `span_finder` + `spancat`, remember to add
`span_finder` (and its `tok2vec` or `transformer` if required) to
`[training.annotating_components]` so that the `spancat` component can be
trained directly from its predictions:

```ini
[nlp]
pipeline = ["tok2vec","span_finder","spancat"]

[training]
annotating_components = ["tok2vec","span_finder"]
```

In practice it can be helpful to initially train the `span_finder` separately
before [sourcing](/usage/processing-pipelines#sourced-components) it (along with
its `tok2vec`) into the `spancat` pipeline for further training. Otherwise the
memory usage can spike for `spancat` in the first few training steps if the
`span_finder` makes a large number of predictions.

### Additional features and improvements {id="additional-features-and-improvements"}

- Language updates:
  - Add initial support for Malay.
  - Update Latin defaults to support noun chunks, update lexical/tokenizer
    settings and add example sentences.
- Support `spancat_singlelabel` in `spacy debug data` CLI.
- Add `doc.spans` rendering to `spacy evaluate` CLI displaCy output.
- Support custom token/lexeme attribute for vectors.
- Add option to return scores separately keyed by component name with
  `spacy evaluate --per-component`, `Language.evaluate(per_component=True)` and
  `Scorer.score(per_component=True)`. This is useful when the pipeline contains
  more than one of the same component like `textcat` that may have overlapping
  scores keys (see the short sketch after this list).
- Typing updates for `PhraseMatcher` and `SpanGroup`.
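A rough sketch of the per-component scoring mentioned in the list above; the
pipeline name and the tiny example are placeholders chosen for illustration:

```python
import spacy
from spacy.training import Example

nlp = spacy.load("en_core_web_sm")
doc = nlp.make_doc("Apple is looking at buying a U.K. startup.")
examples = [Example.from_dict(doc, {"entities": [(0, 5, "ORG")]})]

# New in v3.6: with per_component=True, scores are keyed by component name,
# e.g. scores["ner"], scores["tagger"], instead of being merged into one dict.
scores = nlp.evaluate(examples, per_component=True)
print(list(scores.keys()))
```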
## Trained pipelines {id="pipelines"}

### New trained pipelines {id="new-pipelines"}

v3.6 introduces new pipelines for Slovenian, which use the trainable lemmatizer
and [floret vectors](https://github.com/explosion/floret).

| Package                                           | UPOS | Parser LAS | NER F |
| ------------------------------------------------- | ---: | ---------: | ----: |
| [`sl_core_news_sm`](/models/sl#sl_core_news_sm)   | 96.9 |       82.1 |  62.9 |
| [`sl_core_news_md`](/models/sl#sl_core_news_md)   | 97.6 |       84.3 |  73.5 |
| [`sl_core_news_lg`](/models/sl#sl_core_news_lg)   | 97.7 |       84.3 |  79.0 |
| [`sl_core_news_trf`](/models/sl#sl_core_news_trf) | 99.0 |       91.7 |  90.0 |

### Pipeline updates {id="pipeline-updates"}

The English pipelines have been updated to improve handling of contractions with
various apostrophes and to lemmatize "get" as a passive auxiliary.

The Danish pipeline `da_core_news_trf` has been updated to use
[`vesteinn/DanskBERT`](https://huggingface.co/vesteinn/DanskBERT) with
performance improvements across the board.

## Notes about upgrading from v3.5 {id="upgrading"}

### SpanGroup spans are now required to be from the same doc {id="spangroup-spans"}

When initializing a `SpanGroup`, there is a new check to verify that all added
spans refer to the current doc. Without this check, it was possible to run into
string store or other errors.

One place this may crop up is when creating `Example` objects for training with
custom spans:

```diff
  doc = Doc(nlp.vocab, words=tokens)  # predicted doc
  example = Example.from_dict(doc, {"ner": iob_tags})
  # use the reference doc when creating reference spans
- span = Span(doc, 0, 5, "ORG")
+ span = Span(example.reference, 0, 5, "ORG")
  example.reference.spans[spans_key] = [span]
```

### Pipeline package version compatibility {id="version-compat"}

> #### Using legacy implementations
>
> In spaCy v3, you'll still be able to load and reference legacy implementations
> via [`spacy-legacy`](https://github.com/explosion/spacy-legacy), even if the
> components or architectures change and newer versions are available in the
> core library.

When you're loading a pipeline package trained with an earlier version of spaCy
v3, you will see a warning telling you that the pipeline may be incompatible.
This doesn't necessarily have to be true, but we recommend running your
pipelines against your test suite or evaluation data to make sure there are no
unexpected results.

If you're using one of the [trained pipelines](/models) we provide, you should
run [`spacy download`](/api/cli#download) to update to the latest version. To
see an overview of all installed packages and their compatibility, you can run
[`spacy validate`](/api/cli#validate).

If you've trained your own custom pipeline and you've confirmed that it's still
working as expected, you can update the spaCy version requirements in the
[`meta.json`](/api/data-formats#meta):

```diff
- "spacy_version": ">=3.5.0,<3.6.0",
+ "spacy_version": ">=3.5.0,<3.7.0",
```

### Updating v3.5 configs

To update a config from spaCy v3.5 with the new v3.6 settings, run
[`init fill-config`](/api/cli#init-fill-config):

```cli
$ python -m spacy init fill-config config-v3.5.cfg config-v3.6.cfg
```

In many cases ([`spacy train`](/api/cli#train),
[`spacy.load`](/api/top-level#spacy.load)), the new defaults will be filled in
automatically, but you'll need to fill in the new settings to run
[`debug config`](/api/cli#debug) and [`debug data`](/api/cli#debug-data).
@@ -222,7 +222,9 @@
     },
     {
       "code": "la",
-      "name": "Latin"
+      "name": "Latin",
+      "example": "In principio creavit Deus caelum et terram.",
+      "has_examples": true
     },
     {
       "code": "lb",

@@ -339,7 +341,10 @@
     },
     {
       "code": "sl",
-      "name": "Slovenian"
+      "name": "Slovenian",
+      "example": "France Prešeren je umrl 8. februarja 1849 v Kranju",
+      "has_examples": true,
+      "models": ["sl_core_news_sm", "sl_core_news_md", "sl_core_news_lg", "sl_core_news_trf"]
     },
     {
       "code": "sq",
@@ -14,7 +14,8 @@
     { "text": "New in v3.2", "url": "/usage/v3-2" },
     { "text": "New in v3.3", "url": "/usage/v3-3" },
     { "text": "New in v3.4", "url": "/usage/v3-4" },
-    { "text": "New in v3.5", "url": "/usage/v3-5" }
+    { "text": "New in v3.5", "url": "/usage/v3-5" },
+    { "text": "New in v3.6", "url": "/usage/v3-6" }
   ]
 },
 {

@@ -25,16 +26,19 @@
     { "text": "Processing Pipelines", "url": "/usage/processing-pipelines" },
     {
       "text": "Embeddings & Transformers",
-      "url": "/usage/embeddings-transformers",
+      "url": "/usage/embeddings-transformers"
+    },
+    {
+      "text": "Large Language Models",
+      "url": "/usage/large-language-models",
       "tag": "new"
     },
-    { "text": "Training Models", "url": "/usage/training", "tag": "new" },
+    { "text": "Training Models", "url": "/usage/training" },
     {
       "text": "Layers & Model Architectures",
-      "url": "/usage/layers-architectures",
-      "tag": "new"
+      "url": "/usage/layers-architectures"
     },
-    { "text": "spaCy Projects", "url": "/usage/projects", "tag": "new" },
+    { "text": "spaCy Projects", "url": "/usage/projects" },
     { "text": "Saving & Loading", "url": "/usage/saving-loading" },
     { "text": "Visualizers", "url": "/usage/visualizers" }
   ]

@@ -101,6 +105,7 @@
     { "text": "EntityLinker", "url": "/api/entitylinker" },
     { "text": "EntityRecognizer", "url": "/api/entityrecognizer" },
     { "text": "EntityRuler", "url": "/api/entityruler" },
+    { "text": "Large Language Models", "url": "/api/large-language-models" },
     { "text": "Lemmatizer", "url": "/api/lemmatizer" },
     { "text": "Morphologizer", "url": "/api/morphologizer" },
     { "text": "SentenceRecognizer", "url": "/api/sentencerecognizer" },
@@ -27,7 +27,7 @@
     "indexName": "spacy"
   },
   "binderUrl": "explosion/spacy-io-binder",
-  "binderVersion": "3.5",
+  "binderVersion": "3.6",
   "sections": [
     { "id": "usage", "title": "Usage Documentation", "theme": "blue" },
     { "id": "models", "title": "Models Documentation", "theme": "blue" },
@@ -67,6 +67,33 @@
       "category": ["pipeline", "research"],
       "tags": ["latin"]
     },
+    {
+      "id": "odycy",
+      "title": "OdyCy",
+      "slogan": "General-purpose language pipelines for premodern Greek.",
+      "description": "Academically validated modular NLP pipelines for premodern Greek. odyCy achieves state of the art performance on multiple tasks on unseen test data from the Universal Dependencies Perseus treebank, and performs second best on the PROIEL treebank’s test set on even more tasks. In addition performance also seems relatively stable across the two evaluation datasets in comparison with other NLP pipelines. OdyCy is being used at the Center for Humanities Computing for preprocessing and analyzing Ancient Greek corpora for New Testament research, meaning that you can expect consistent maintenance and improvements.",
+      "github": "centre-for-humanities-computing/odyCy",
+      "code_example": [
+        "# To install the high-accuracy transformer-based pipeline",
+        "# pip install https://huggingface.co/chcaa/grc_odycy_joint_trf/resolve/main/grc_odycy_joint_trf-any-py3-none-any.whl",
+        "import spacy",
+        "",
+        "nlp = spacy.load('grc_odycy_joint_trf')",
+        "",
+        "doc = nlp('τὴν γοῦν Ἀττικὴν ἐκ τοῦ ἐπὶ πλεῖστον διὰ τὸ λεπτόγεων ἀστασίαστον οὖσαν ἄνθρωποι ᾤκουν οἱ αὐτοὶ αἰεί.')"
+      ],
+      "code_language": "python",
+      "url": "https://centre-for-humanities-computing.github.io/odyCy/",
+      "thumb": "https://raw.githubusercontent.com/centre-for-humanities-computing/odyCy/7b94fec60679d06272dca88a4dcfe0f329779aea/docs/_static/logo.svg",
+      "image": "https://github.com/centre-for-humanities-computing/odyCy/raw/main/docs/_static/logo_with_text_below.svg",
+      "author": "Jan Kostkan, Márton Kardos (Center for Humanities Computing, Aarhus University)",
+      "author_links": {
+        "github": "centre-for-humanities-computing",
+        "website": "https://chc.au.dk/"
+      },
+      "category": ["pipeline", "standalone", "research"],
+      "tags": ["ancient Greek"]
+    },
     {
       "id": "spacy-wasm",
       "title": "spacy-wasm",
|
||||||
"code_example": [
|
"code_example": [
|
||||||
"import spacy",
|
"import spacy",
|
||||||
"",
|
"",
|
||||||
"nlp = spacy.load(\"en_core_web_sm\", disable=[\"ner\"])",
|
"nlp = spacy.load(\"en_core_web_sm\", exclude=[\"ner\"])",
|
||||||
"nlp.add_pipe(\"span_marker\", config={\"model\": \"tomaarsen/span-marker-roberta-large-ontonotes5\"})",
|
"nlp.add_pipe(\"span_marker\", config={\"model\": \"tomaarsen/span-marker-roberta-large-ontonotes5\"})",
|
||||||
"",
|
"",
|
||||||
"text = \"\"\"Cleopatra VII, also known as Cleopatra the Great, was the last active ruler of the \\",
|
"text = \"\"\"Cleopatra VII, also known as Cleopatra the Great, was the last active ruler of the \\",
|
||||||
|
|
|
@@ -106,50 +106,21 @@ const Landing = () => {
            <LandingBannerGrid>
                <LandingBanner
-                   to="https://explosion.ai/custom-solutions"
+                   label="NEW"
+                   title="Large Language Models: Integrating LLMs into structured NLP pipelines"
+                   to="/usage/large-language-models"
                    button="Learn more"
-                   background="#E4F4F9"
-                   color="#1e1935"
                    small
                >
                    <p>
-                       <Link to="https://explosion.ai/custom-solutions" hidden>
-                           <ImageFill
-                               image={tailoredPipelinesImage}
-                               alt="spaCy Tailored Pipelines"
-                           />
-                       </Link>
+                       <Link to="https://github.com/explosion/spacy-llm">
+                           The spacy-llm package
+                       </Link>{' '}
+                       integrates Large Language Models (LLMs) into spaCy, featuring a modular
+                       system for <strong>fast prototyping</strong> and <strong>prompting</strong>,
+                       and turning unstructured responses into <strong>robust outputs</strong> for
+                       various NLP tasks, <strong>no training data</strong> required.
                    </p>
-                   <p>
-                       <strong>
-                           Get a custom spaCy pipeline, tailor-made for your NLP problem by
-                           spaCy's core developers.
-                       </strong>
-                   </p>
-                   <Ul>
-                       <Li emoji="🔥">
-                           <strong>Streamlined.</strong> Nobody knows spaCy better than we do. Send
-                           us your pipeline requirements and we'll be ready to start producing
-                           your solution in no time at all.
-                       </Li>
-                       <Li emoji="🐿 ">
-                           <strong>Production ready.</strong> spaCy pipelines are robust and easy
-                           to deploy. You'll get a complete spaCy project folder which is
-                           ready to <InlineCode>spacy project run</InlineCode>.
-                       </Li>
-                       <Li emoji="🔮">
-                           <strong>Predictable.</strong> You'll know exactly what you're
-                           going to get and what it's going to cost. We quote fees up-front,
-                           let you try before you buy, and don't charge for over-runs at our
-                           end — all the risk is on us.
-                       </Li>
-                       <Li emoji="🛠">
-                           <strong>Maintainable.</strong> spaCy is an industry standard, and
-                           we'll deliver your pipeline with full code, data, tests and
-                           documentation, so your team can retrain, update and extend the solution
-                           as your requirements change.
-                       </Li>
-                   </Ul>
                </LandingBanner>

                <LandingBanner
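The new banner copy above points at the spacy-llm package. As a rough sketch of the advertised workflow, based on the spacy-llm README around the time of this change; the llm_textcat factory name and the add_label calls are assumptions about that package's API and may differ between versions, and a model provider key is expected in the environment:

    # Sketch only: assumes `pip install spacy-llm` and a provider key (e.g. OPENAI_API_KEY) is set.
    import spacy

    nlp = spacy.blank("en")
    # "llm_textcat" is a component factory registered by spacy-llm; the labels below are arbitrary examples.
    llm = nlp.add_pipe("llm_textcat")
    llm.add_label("COMPLIMENT")
    llm.add_label("INSULT")

    doc = nlp("You look gorgeous!")
    print(doc.cats)  # label scores produced by prompting the LLM, with no training data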
@@ -240,21 +211,50 @@ const Landing = () => {
            <LandingBannerGrid>
                <LandingBanner
-                   label="New in v3.0"
-                   title="Transformer-based pipelines, new training system, project templates & more"
-                   to="/usage/v3"
-                   button="See what's new"
+                   to="https://explosion.ai/custom-solutions"
+                   button="Learn more"
+                   background="#E4F4F9"
+                   color="#1e1935"
                    small
                >
                    <p>
-                       spaCy v3.0 features all new <strong>transformer-based pipelines</strong>{' '}
-                       that bring spaCy's accuracy right up to the current{' '}
-                       <strong>state-of-the-art</strong>. You can use any pretrained transformer to
-                       train your own pipelines, and even share one transformer between multiple
-                       components with <strong>multi-task learning</strong>. Training is now fully
-                       configurable and extensible, and you can define your own custom models using{' '}
-                       <strong>PyTorch</strong>, <strong>TensorFlow</strong> and other frameworks.
+                       <Link to="https://explosion.ai/custom-solutions" noLinkLayout>
+                           <ImageFill
+                               image={tailoredPipelinesImage}
+                               alt="spaCy Tailored Pipelines"
+                           />
+                       </Link>
                    </p>
+                   <p>
+                       <strong>
+                           Get a custom spaCy pipeline, tailor-made for your NLP problem by
+                           spaCy's core developers.
+                       </strong>
+                   </p>
+                   <Ul>
+                       <Li emoji="🔥">
+                           <strong>Streamlined.</strong> Nobody knows spaCy better than we do. Send
+                           us your pipeline requirements and we'll be ready to start producing
+                           your solution in no time at all.
+                       </Li>
+                       <Li emoji="🐿 ">
+                           <strong>Production ready.</strong> spaCy pipelines are robust and easy
+                           to deploy. You'll get a complete spaCy project folder which is
+                           ready to <InlineCode>spacy project run</InlineCode>.
+                       </Li>
+                       <Li emoji="🔮">
+                           <strong>Predictable.</strong> You'll know exactly what you're
+                           going to get and what it's going to cost. We quote fees up-front,
+                           let you try before you buy, and don't charge for over-runs at our
+                           end — all the risk is on us.
+                       </Li>
+                       <Li emoji="🛠">
+                           <strong>Maintainable.</strong> spaCy is an industry standard, and
+                           we'll deliver your pipeline with full code, data, tests and
+                           documentation, so your team can retrain, update and extend the solution
+                           as your requirements change.
+                       </Li>
+                   </Ul>
                </LandingBanner>
                <LandingBanner
                    to="https://course.spacy.io"
@@ -264,7 +264,7 @@ const Landing = () => {
                    small
                >
                    <p>
-                       <Link to="https://course.spacy.io" hidden>
+                       <Link to="https://course.spacy.io" noLinkLayout>
                            <ImageFill
                                image={courseImage}
                                alt="Advanced NLP with spaCy: A free online course"
@@ -13,6 +13,8 @@ import 'prismjs/components/prism-json.min.js'
import 'prismjs/components/prism-markdown.min.js'
import 'prismjs/components/prism-python.min.js'
import 'prismjs/components/prism-yaml.min.js'
+import 'prismjs/components/prism-docker.min.js'
+import 'prismjs/components/prism-r.min.js'

import { isString } from './util'
import Link, { OptionalLink } from './link'
@@ -172,7 +174,7 @@ const convertLine = ({ line, prompt, lang }) => {
        return handlePromot({ lineFlat, prompt })
    }

-   return lang === 'none' || !lineFlat ? (
+   return lang === 'none' || !lineFlat || !(lang in Prism.languages) ? (
        lineFlat
    ) : (
        <span
@@ -58,8 +58,8 @@ const AlertSpace = ({ nightly, legacy }) => {
    }

    const navAlert = (
-       <Link to="/usage/v3-5" noLinkLayout>
-           <strong>💥 Out now:</strong> spaCy v3.5
+       <Link to="/usage/v3-6" noLinkLayout>
+           <strong>💥 Out now:</strong> spaCy v3.6
        </Link>
    )