Mirror of https://github.com/explosion/spaCy.git (synced 2025-04-21 17:41:59 +03:00)

Merge branch 'upstream_master' into test-cli-app-init-config

This commit is contained in:
commit 0278eecabf
.github/workflows/tests.yml (vendored) — 64 changed lines
@@ -37,10 +37,20 @@ jobs:
         run: |
           python -m pip install black -c requirements.txt
           python -m black spacy --check
+      - name: isort
+        run: |
+          python -m pip install isort -c requirements.txt
+          python -m isort spacy --check
       - name: flake8
         run: |
           python -m pip install flake8==5.0.4
           python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
+      - name: cython-lint
+        run: |
+          python -m pip install cython-lint -c requirements.txt
+          # E501: line too long, W291: trailing whitespace, E266: too many leading '#' for block comment
+          cython-lint spacy --ignore E501,W291,E266

   tests:
     name: Test
     needs: Validate
@@ -107,22 +117,22 @@ jobs:
       - name: Test import
        run: python -W error -c "import spacy"

-      # - name: "Test download CLI"
-      #   run: |
-      #     python -m spacy download ca_core_news_sm
-      #     python -m spacy download ca_core_news_md
-      #     python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
-      #   if: matrix.python_version == '3.9'
-      #
-      # - name: "Test download_url in info CLI"
-      #   run: |
-      #     python -W error -m spacy info ca_core_news_sm | grep -q download_url
-      #   if: matrix.python_version == '3.9'
-      #
-      # - name: "Test no warnings on load (#11713)"
-      #   run: |
-      #     python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
-      #   if: matrix.python_version == '3.9'
+      - name: "Test download CLI"
+        run: |
+          python -m spacy download ca_core_news_sm
+          python -m spacy download ca_core_news_md
+          python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+        if: matrix.python_version == '3.9'
+
+      - name: "Test download_url in info CLI"
+        run: |
+          python -W error -m spacy info ca_core_news_sm | grep -q download_url
+        if: matrix.python_version == '3.9'
+
+      - name: "Test no warnings on load (#11713)"
+        run: |
+          python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+        if: matrix.python_version == '3.9'

       - name: "Test convert CLI"
         run: |
@@ -146,17 +156,17 @@ jobs:
           python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
         if: matrix.python_version == '3.9'

-      # - name: "Test assemble CLI"
-      #   run: |
-      #     python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-      #     PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
-      #   if: matrix.python_version == '3.9'
-      #
-      # - name: "Test assemble CLI vectors warning"
-      #   run: |
-      #     python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
-      #     python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
-      #   if: matrix.python_version == '3.9'
+      - name: "Test assemble CLI"
+        run: |
+          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+          PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+        if: matrix.python_version == '3.9'
+
+      - name: "Test assemble CLI vectors warning"
+        run: |
+          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+          python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+        if: matrix.python_version == '3.9'

       - name: "Install test requirements"
         run: |
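The Validate job now gates on four tools: black, isort, flake8 and cython-lint. For contributors, here is a minimal local sketch of the same gate — assuming the tools are installed in the current environment; the flags and `--select`/`--ignore` lists are copied from the workflow above:

```python
import subprocess
import sys

# One entry per lint step in the Validate job.
CHECKS = [
    ["python", "-m", "black", "spacy", "--check"],
    ["python", "-m", "isort", "spacy", "--check"],
    [
        "python", "-m", "flake8", "spacy", "--count",
        "--select=E901,E999,F821,F822,F823,W605",
        "--show-source", "--statistics",
    ],
    ["cython-lint", "spacy", "--ignore", "E501,W291,E266"],
]

failed = False
for cmd in CHECKS:
    print("$", " ".join(cmd))
    failed |= subprocess.run(cmd).returncode != 0

sys.exit(1 if failed else 0)
```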
Makefile — 4 changed lines
@@ -1,11 +1,11 @@
 SHELL := /bin/bash

 ifndef SPACY_EXTRAS
-override SPACY_EXTRAS = spacy-lookups-data==1.0.2 jieba spacy-pkuseg==0.0.28 sudachipy sudachidict_core pymorphy2
+override SPACY_EXTRAS = spacy-lookups-data==1.0.3
 endif

 ifndef PYVER
-override PYVER = 3.6
+override PYVER = 3.8
 endif

 VENV := ./env$(PYVER)
README.md — 72 changed lines
@@ -6,23 +6,20 @@ spaCy is a library for **advanced Natural Language Processing** in Python and
 Cython. It's built on the very latest research, and was designed from day one to
 be used in real products.

-spaCy comes with
-[pretrained pipelines](https://spacy.io/models) and
-currently supports tokenization and training for **70+ languages**. It features
-state-of-the-art speed and **neural network models** for tagging,
-parsing, **named entity recognition**, **text classification** and more,
-multi-task learning with pretrained **transformers** like BERT, as well as a
+spaCy comes with [pretrained pipelines](https://spacy.io/models) and currently
+supports tokenization and training for **70+ languages**. It features
+state-of-the-art speed and **neural network models** for tagging, parsing,
+**named entity recognition**, **text classification** and more, multi-task
+learning with pretrained **transformers** like BERT, as well as a
 production-ready [**training system**](https://spacy.io/usage/training) and easy
 model packaging, deployment and workflow management. spaCy is commercial
-open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
+open-source software, released under the
+[MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).

 💥 **We'd love to hear more about your experience with spaCy!**
 [Fill out our survey here.](https://form.typeform.com/to/aMel9q9f)

-💫 **Version 3.5 out now!**
+💫 **Version 3.6 out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)

-[](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
+[](https://github.com/explosion/spaCy/actions/workflows/tests.yml)
 [](https://github.com/explosion/spaCy/releases)
 [](https://pypi.org/project/spacy/)
 [](https://anaconda.org/conda-forge/spacy)
@@ -35,22 +32,22 @@ open-source software, released under the [MIT license](https://github.com/explos

 ## 📖 Documentation

 | Documentation | |
-| ----------------------------- | ---------------------------------------------------------------------- |
+| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! |
 | 📚 **[Usage Guides]** | How to use spaCy and its features. |
 | 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |
 | 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. |
 | 🎛 **[API Reference]** | The detailed reference for spaCy's API. |
 | 📦 **[Models]** | Download trained pipelines for spaCy. |
 | 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. |
 | ⚙️ **[spaCy VS Code Extension]** | Additional tooling and features for working with spaCy's config files. |
 | 👩‍🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
 | 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
 | 🛠 **[Changelog]** | Changes and version history. |
 | 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
 | <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
 | <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-analysis)** |

 [spacy 101]: https://spacy.io/usage/spacy-101
 [new in v3.0]: https://spacy.io/usage/v3
@@ -58,7 +55,7 @@ open-source software, released under the [MIT license](https://github.com/explos
 [api reference]: https://spacy.io/api/
 [models]: https://spacy.io/models
 [universe]: https://spacy.io/universe
-[spaCy VS Code Extension]: https://github.com/explosion/spacy-vscode
+[spacy vs code extension]: https://github.com/explosion/spacy-vscode
 [videos]: https://www.youtube.com/c/ExplosionAI
 [online course]: https://course.spacy.io
 [project templates]: https://github.com/explosion/projects
@@ -92,7 +89,9 @@ more people can benefit from it.
 - State-of-the-art speed
 - Production-ready **training system**
 - Linguistically-motivated **tokenization**
-- Components for named **entity recognition**, part-of-speech-tagging, dependency parsing, sentence segmentation, **text classification**, lemmatization, morphological analysis, entity linking and more
+- Components for named **entity recognition**, part-of-speech-tagging,
+  dependency parsing, sentence segmentation, **text classification**,
+  lemmatization, morphological analysis, entity linking and more
 - Easily extensible with **custom components** and attributes
 - Support for custom models in **PyTorch**, **TensorFlow** and other frameworks
 - Built in **visualizers** for syntax and NER
@@ -118,8 +117,8 @@ For detailed installation instructions, see the
 ### pip

 Using pip, spaCy releases are available as source packages and binary wheels.
-Before you install spaCy and its dependencies, make sure that
-your `pip`, `setuptools` and `wheel` are up to date.
+Before you install spaCy and its dependencies, make sure that your `pip`,
+`setuptools` and `wheel` are up to date.

 ```bash
 pip install -U pip setuptools wheel
@@ -174,9 +173,9 @@ with the new version.

 ## 📦 Download model packages

-Trained pipelines for spaCy can be installed as **Python packages**. This
-means that they're a component of your application, just like any other module.
-Models can be installed using spaCy's [`download`](https://spacy.io/api/cli#download)
+Trained pipelines for spaCy can be installed as **Python packages**. This means
+that they're a component of your application, just like any other module. Models
+can be installed using spaCy's [`download`](https://spacy.io/api/cli#download)
 command, or manually by pointing pip to a path or URL.

 | Documentation | |
@@ -242,8 +241,7 @@ do that depends on your system.
 | **Mac** | Install a recent version of [XCode](https://developer.apple.com/xcode/), including the so-called "Command Line Tools". macOS and OS X ship with Python and git preinstalled. |
 | **Windows** | Install a version of the [Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/) or [Visual Studio Express](https://visualstudio.microsoft.com/vs/express/) that matches the version that was used to compile your Python interpreter. |

-For more details
-and instructions, see the documentation on
+For more details and instructions, see the documentation on
 [compiling spaCy from source](https://spacy.io/usage#source) and the
 [quickstart widget](https://spacy.io/usage#section-quickstart) to get the right
 commands for your platform and Python version.
pyproject.toml

@@ -9,3 +9,6 @@ requires = [
     "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"
+
+[tool.isort]
+profile = "black"
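This `[tool.isort]` entry is the switch behind most of the import churn in the Python files below: isort's `black` profile orders imports as stdlib, third-party, then local, alphabetizes within each group, and wraps long `from` imports in parentheses with one name per line. A minimal stdlib-only illustration (the module shown is made up for this example, not from the diff):

```python
# Before sorting, a module might read:
#
#     from typing import Optional, Dict
#     import sys
#     from pathlib import Path
#
# After `isort --profile black`, plain imports precede from-imports,
# and names inside each import are alphabetized:
import sys
from pathlib import Path
from typing import Dict, Optional
```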
requirements.txt

@@ -38,3 +38,5 @@ types-setuptools>=57.0.0
 types-requests
 types-setuptools>=57.0.0
 black==22.3.0
+cython-lint>=0.15.0; python_version >= "3.7"
+isort>=5.0,<6.0
setup.py — 31 changed lines
@@ -1,10 +1,9 @@
 #!/usr/bin/env python
 from setuptools import Extension, setup, find_packages
 import sys
-import platform
 import numpy
-from distutils.command.build_ext import build_ext
-from distutils.sysconfig import get_python_inc
+from setuptools.command.build_ext import build_ext
+from sysconfig import get_path
 from pathlib import Path
 import shutil
 from Cython.Build import cythonize
@@ -88,30 +87,6 @@ COPY_FILES = {
 }


-def is_new_osx():
-    """Check whether we're on OSX >= 10.7"""
-    if sys.platform != "darwin":
-        return False
-    mac_ver = platform.mac_ver()[0]
-    if mac_ver.startswith("10"):
-        minor_version = int(mac_ver.split(".")[1])
-        if minor_version >= 7:
-            return True
-        else:
-            return False
-    return False
-
-
-if is_new_osx():
-    # On Mac, use libc++ because Apple deprecated use of
-    # libstdc
-    COMPILE_OPTIONS["other"].append("-stdlib=libc++")
-    LINK_OPTIONS["other"].append("-lc++")
-    # g++ (used by unix compiler on mac) links to libstdc++ as a default lib.
-    # See: https://stackoverflow.com/questions/1653047/avoid-linking-to-libstdc
-    LINK_OPTIONS["other"].append("-nodefaultlibs")
-
-
 # By subclassing build_extensions we have the actual compiler that will be used which is really known only after finalize_options
 # http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used
 class build_ext_options:
@@ -204,7 +179,7 @@ def setup_package():

     include_dirs = [
         numpy.get_include(),
-        get_python_inc(plat_specific=True),
+        get_path("include"),
     ]
     ext_modules = []
     ext_modules.append(
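The distutils helpers used here are deprecated (PEP 632); `setuptools.command.build_ext` and `sysconfig` are the drop-in replacements the diff switches to. For the include path specifically, the old and new calls return the same thing — a quick check (output is machine-dependent):

```python
from sysconfig import get_path

# Replacement for distutils.sysconfig.get_python_inc(plat_specific=True):
# the directory containing Python.h for the running interpreter.
print(get_path("include"))  # e.g. /usr/include/python3.11
```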
spacy/__init__.py

@@ -1,6 +1,6 @@
-from typing import Union, Iterable, Dict, Any
-from pathlib import Path
 import sys
+from pathlib import Path
+from typing import Any, Dict, Iterable, Union

 # set library-specific custom warning handling before doing anything else
 from .errors import setup_default_warnings
@@ -8,20 +8,17 @@ from .errors import setup_default_warnings
 setup_default_warnings()  # noqa: E402

 # These are imported as part of the API
-from thinc.api import prefer_gpu, require_gpu, require_cpu  # noqa: F401
-from thinc.api import Config
+from thinc.api import Config, prefer_gpu, require_cpu, require_gpu  # noqa: F401

 from . import pipeline  # noqa: F401
-from .cli.info import info  # noqa: F401
-from .glossary import explain  # noqa: F401
-from .about import __version__  # noqa: F401
-from .util import registry, logger  # noqa: F401
-
-from .errors import Errors
-from .language import Language
-from .vocab import Vocab
 from . import util
+from .about import __version__  # noqa: F401
+from .cli.info import info  # noqa: F401
+from .errors import Errors
+from .glossary import explain  # noqa: F401
+from .language import Language
+from .util import logger, registry  # noqa: F401
+from .vocab import Vocab

 if sys.maxunicode == 65535:
     raise SystemError(Errors.E130)
spacy/about.py

@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.6.0.dev0"
+__version__ = "3.6.0"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
spacy/attrs.pxd

@@ -1,6 +1,7 @@
 # Reserve 64 values for flag features
+from . cimport symbols


 cdef enum attr_id_t:
     NULL_ATTR
     IS_ALPHA
@@ -95,4 +96,4 @@ cdef enum attr_id_t:
     ENT_ID = symbols.ENT_ID

     IDX
-    SENT_END
+    SENT_END
spacy/attrs.pyx

@@ -117,7 +117,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
         if "pos" in stringy_attrs:
             stringy_attrs["TAG"] = stringy_attrs.pop("pos")
         if "morph" in stringy_attrs:
-            morphs = stringy_attrs.pop("morph")
+            morphs = stringy_attrs.pop("morph")  # no-cython-lint
         if "number" in stringy_attrs:
             stringy_attrs.pop("number")
         if "tenspect" in stringy_attrs:
spacy/cli/__init__.py

@@ -1,35 +1,35 @@
 from wasabi import msg

 from ._util import app, setup_cli  # noqa: F401
+from .apply import apply  # noqa: F401
+from .assemble import assemble_cli  # noqa: F401

 # These are the actual functions, NOT the wrapped CLI commands. The CLI commands
 # are registered automatically and won't have to be imported here.
 from .benchmark_speed import benchmark_speed_cli  # noqa: F401
-from .download import download  # noqa: F401
-from .info import info  # noqa: F401
-from .package import package  # noqa: F401
-from .profile import profile  # noqa: F401
-from .train import train_cli  # noqa: F401
-from .assemble import assemble_cli  # noqa: F401
-from .pretrain import pretrain  # noqa: F401
-from .debug_data import debug_data  # noqa: F401
-from .debug_config import debug_config  # noqa: F401
-from .debug_model import debug_model  # noqa: F401
-from .debug_diff import debug_diff  # noqa: F401
-from .evaluate import evaluate  # noqa: F401
-from .apply import apply  # noqa: F401
 from .convert import convert  # noqa: F401
-from .init_pipeline import init_pipeline_cli  # noqa: F401
-from .init_config import init_config, fill_config  # noqa: F401
-from .validate import validate  # noqa: F401
-from .project.clone import project_clone  # noqa: F401
-from .project.assets import project_assets  # noqa: F401
-from .project.run import project_run  # noqa: F401
-from .project.dvc import project_update_dvc  # noqa: F401
-from .project.push import project_push  # noqa: F401
-from .project.pull import project_pull  # noqa: F401
-from .project.document import project_document  # noqa: F401
+from .debug_config import debug_config  # noqa: F401
+from .debug_data import debug_data  # noqa: F401
+from .debug_diff import debug_diff  # noqa: F401
+from .debug_model import debug_model  # noqa: F401
+from .download import download  # noqa: F401
+from .evaluate import evaluate  # noqa: F401
+from .find_threshold import find_threshold  # noqa: F401
+from .info import info  # noqa: F401
+from .init_config import fill_config, init_config  # noqa: F401
+from .init_pipeline import init_pipeline_cli  # noqa: F401
+from .package import package  # noqa: F401
+from .pretrain import pretrain  # noqa: F401
+from .profile import profile  # noqa: F401
+from .project.assets import project_assets  # noqa: F401
+from .project.clone import project_clone  # noqa: F401
+from .project.document import project_document  # noqa: F401
+from .project.dvc import project_update_dvc  # noqa: F401
+from .project.pull import project_pull  # noqa: F401
+from .project.push import project_push  # noqa: F401
+from .project.run import project_run  # noqa: F401
+from .train import train_cli  # noqa: F401
+from .validate import validate  # noqa: F401


 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
spacy/cli/_util.py

@@ -1,26 +1,44 @@
-from typing import Dict, Any, Union, List, Optional, Tuple, Iterable
-from typing import TYPE_CHECKING, overload
-import sys
-import shutil
-from pathlib import Path
-from wasabi import msg, Printer
-import srsly
 import hashlib
+import os
+import shutil
+import sys
+from configparser import InterpolationError
+from contextlib import contextmanager
+from pathlib import Path
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Tuple,
+    Union,
+    overload,
+)
+
+import srsly
 import typer
 from click import NoSuchOption
 from click.parser import split_arg_string
-from typer.main import get_command
-from contextlib import contextmanager
 from thinc.api import Config, ConfigValidationError, require_gpu
 from thinc.util import gpu_is_available
-from configparser import InterpolationError
-import os
+from typer.main import get_command
+from wasabi import Printer, msg

+from .. import about
 from ..compat import Literal
 from ..schemas import ProjectConfigSchema, validate
-from ..util import import_file, run_command, make_tempdir, registry, logger
-from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
-from .. import about
+from ..util import (
+    ENV_VARS,
+    SimpleFrozenDict,
+    import_file,
+    is_compatible_version,
+    logger,
+    make_tempdir,
+    registry,
+    run_command,
+)

 if TYPE_CHECKING:
     from pathy import FluidPath  # noqa: F401
spacy/cli/apply.py

@@ -1,18 +1,15 @@
-import tqdm
-import srsly
-
 from itertools import chain
 from pathlib import Path
-from typing import Optional, List, Iterable, cast, Union
+from typing import Iterable, List, Optional, Union, cast
+
+import srsly
+import tqdm
 from wasabi import msg

-from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory
-
 from ..tokens import Doc, DocBin
-from ..vocab import Vocab
 from ..util import ensure_path, load_model
-
+from ..vocab import Vocab
+from ._util import Arg, Opt, app, import_code, setup_gpu, walk_directory

 path_help = """Location of the documents to predict on.
 Can be a single file in .spacy format or a .jsonl file.
spacy/cli/assemble.py

@@ -1,13 +1,20 @@
-from typing import Optional
-from pathlib import Path
-from wasabi import msg
-import typer
 import logging
+from pathlib import Path
+from typing import Optional
+
+import typer
+from wasabi import msg

-from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code
 from .. import util
 from ..util import get_sourced_components, load_model_from_config
+from ._util import (
+    Arg,
+    Opt,
+    app,
+    import_code,
+    parse_config_overrides,
+    show_validation_error,
+)


 @app.command(
spacy/cli/benchmark_speed.py

@@ -1,11 +1,12 @@
-from typing import Iterable, List, Optional
 import random
-from itertools import islice
-import numpy
-from pathlib import Path
 import time
-from tqdm import tqdm
+from itertools import islice
+from pathlib import Path
+from typing import Iterable, List, Optional
+
+import numpy
+import typer
+from tqdm import tqdm
 from wasabi import msg

 from .. import util
spacy/cli/convert.py

@@ -1,18 +1,22 @@
-from typing import Callable, Iterable, Mapping, Optional, Any, Union
-from enum import Enum
-from pathlib import Path
-from wasabi import Printer
-import srsly
 import itertools
 import re
 import sys
+from enum import Enum
+from pathlib import Path
+from typing import Any, Callable, Iterable, Mapping, Optional, Union
+
+import srsly
+from wasabi import Printer

-from ._util import app, Arg, Opt, walk_directory
-from ..training import docs_to_json
 from ..tokens import Doc, DocBin
-from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
-from ..training.converters import conllu_to_docs
+from ..training import docs_to_json
+from ..training.converters import (
+    conll_ner_to_docs,
+    conllu_to_docs,
+    iob_to_docs,
+    json_to_docs,
+)
+from ._util import Arg, Opt, app, walk_directory

 # Converters are matched by file extension except for ner/iob, which are
 # matched by file extension and content. To add a converter, add a new
spacy/cli/debug_config.py

@@ -1,15 +1,22 @@
-from typing import Optional, Dict, Any, Union, List
 from pathlib import Path
-from wasabi import msg, table
+from typing import Any, Dict, List, Optional, Union
+
+import typer
 from thinc.api import Config
 from thinc.config import VARIABLE_RE
-import typer
+from wasabi import msg, table

-from ._util import Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code, debug_cli
 from .. import util
 from ..schemas import ConfigSchemaInit, ConfigSchemaTraining
 from ..util import registry
+from ._util import (
+    Arg,
+    Opt,
+    debug_cli,
+    import_code,
+    parse_config_overrides,
+    show_validation_error,
+)


 @debug_cli.command(
spacy/cli/debug_data.py

@@ -1,31 +1,49 @@
-from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
-from typing import cast, overload
-from pathlib import Path
-from collections import Counter
-import sys
-import srsly
-from wasabi import Printer, MESSAGES, msg
-import typer
 import math
-import numpy
+import sys
+from collections import Counter
+from pathlib import Path
+from typing import (
+    Any,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Sequence,
+    Set,
+    Tuple,
+    Union,
+    cast,
+    overload,
+)

-from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
-from ._util import import_code, debug_cli, _format_number
-from ..training import Example, remove_bilu_prefix
-from ..training.initialize import get_sourced_components
-from ..schemas import ConfigSchemaTraining
-from ..pipeline import TrainablePipe
+import numpy
+import srsly
+import typer
+from wasabi import MESSAGES, Printer, msg
+
+from .. import util
+from ..compat import Literal
+from ..language import Language
+from ..morphology import Morphology
+from ..pipeline import Morphologizer, SpanCategorizer, TrainablePipe
+from ..pipeline._edit_tree_internals.edit_trees import EditTrees
 from ..pipeline._parser_internals import nonproj
 from ..pipeline._parser_internals.nonproj import DELIMITER
-from ..pipeline import Morphologizer, SpanCategorizer
-from ..pipeline._edit_tree_internals.edit_trees import EditTrees
-from ..morphology import Morphology
-from ..language import Language
+from ..schemas import ConfigSchemaTraining
+from ..training import Example, remove_bilu_prefix
+from ..training.initialize import get_sourced_components
 from ..util import registry, resolve_dot_names
-from ..compat import Literal
 from ..vectors import Mode as VectorsMode
-from .. import util
+from ._util import (
+    Arg,
+    Opt,
+    _format_number,
+    app,
+    debug_cli,
+    import_code,
+    parse_config_overrides,
+    show_validation_error,
+)

 # Minimum number of expected occurrences of NER label in data to train new label
 NEW_LABEL_THRESHOLD = 50
@@ -212,7 +230,7 @@ def debug_data(
     else:
         msg.info("No word vectors present in the package")

-    if "spancat" in factory_names:
+    if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
         model_labels_spancat = _get_labels_from_spancat(nlp)
         has_low_data_warning = False
         has_no_neg_warning = False
@@ -830,7 +848,7 @@ def _compile_gold(
                     data["boundary_cross_ents"] += 1
                 elif label == "-":
                     data["ner"]["-"] += 1
-    if "spancat" in factory_names:
+    if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
         for spans_key in list(eg.reference.spans.keys()):
             # Obtain the span frequency
             if spans_key not in data["spancat"]:
@@ -1028,7 +1046,7 @@ def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]:
     pipe_names = [
         pipe_name
         for pipe_name in nlp.pipe_names
-        if nlp.get_pipe_meta(pipe_name).factory == "spancat"
+        if nlp.get_pipe_meta(pipe_name).factory in ("spancat", "spancat_singlelabel")
     ]
     labels: Dict[str, Set[str]] = {}
     for pipe_name in pipe_names:
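These checks key off a component's registered factory name rather than its pipe name, so a span categorizer added under any custom name is still picked up. A minimal sketch of the pattern (hypothetical blank pipeline; the pipe name is illustrative):

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("spancat", name="my_spans")  # multi-label span categorizer

# Factory-based lookup, as in _get_labels_from_spancat above: the component
# is found via its factory ("spancat"), not its pipe name ("my_spans").
spancat_pipes = [
    name
    for name in nlp.pipe_names
    if nlp.get_pipe_meta(name).factory in ("spancat", "spancat_singlelabel")
]
print(spancat_pipes)  # ['my_spans']
```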
spacy/cli/debug_diff.py

@@ -1,13 +1,13 @@
+from pathlib import Path
 from typing import Optional

 import typer
-from wasabi import Printer, diff_strings, MarkdownRenderer
-from pathlib import Path
 from thinc.api import Config
+from wasabi import MarkdownRenderer, Printer, diff_strings

-from ._util import debug_cli, Arg, Opt, show_validation_error, parse_config_overrides
 from ..util import load_config
-from .init_config import init_config, Optimizations
+from ._util import Arg, Opt, debug_cli, parse_config_overrides, show_validation_error
+from .init_config import Optimizations, init_config


 @debug_cli.command(
spacy/cli/debug_model.py

@@ -1,19 +1,32 @@
-from typing import Dict, Any, Optional
-from pathlib import Path
 import itertools
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+import typer
+from thinc.api import (
+    Model,
+    data_validation,
+    fix_random_seed,
+    set_dropout_rate,
+    set_gpu_allocator,
+)
+from wasabi import msg

 from spacy.training import Example
 from spacy.util import resolve_dot_names
-from wasabi import msg
-from thinc.api import fix_random_seed, set_dropout_rate
-from thinc.api import Model, data_validation, set_gpu_allocator
-import typer

-from ._util import Arg, Opt, debug_cli, show_validation_error
-from ._util import parse_config_overrides, string_to_list, setup_gpu
+from .. import util
 from ..schemas import ConfigSchemaTraining
 from ..util import registry
-from .. import util
+from ._util import (
+    Arg,
+    Opt,
+    debug_cli,
+    parse_config_overrides,
+    setup_gpu,
+    show_validation_error,
+    string_to_list,
+)


 @debug_cli.command(
spacy/cli/download.py

@@ -1,14 +1,14 @@
-from typing import Optional, Sequence
-import requests
 import sys
-from wasabi import msg
-import typer
+from typing import Optional, Sequence
+
+import requests
+import typer
+from wasabi import msg

-from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX
 from .. import about
-from ..util import is_package, get_minor_version, run_command
-from ..util import is_prerelease_version
 from ..errors import OLD_MODEL_SHORTCUTS
+from ..util import get_minor_version, is_package, is_prerelease_version, run_command
+from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app


 @app.command(
spacy/cli/evaluate.py

@@ -1,16 +1,16 @@
-from typing import Optional, List, Dict, Any, Union
-from wasabi import Printer
-from pathlib import Path
 import re
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
 import srsly
 from thinc.api import fix_random_seed
+from wasabi import Printer

-from ..training import Corpus
-from ..tokens import Doc
-from ._util import app, Arg, Opt, setup_gpu, import_code, benchmark_cli
+from .. import displacy, util
 from ..scorer import Scorer
-from .. import util
-from .. import displacy
+from ..tokens import Doc
+from ..training import Corpus
+from ._util import Arg, Opt, app, benchmark_cli, import_code, setup_gpu


 @benchmark_cli.command(
spacy/cli/find_threshold.py

@@ -1,17 +1,17 @@
 import functools
+import logging
 import operator
 from pathlib import Path
-import logging
-from typing import Optional, Tuple, Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple

 import numpy
 import wasabi.tables

-from ..pipeline import TextCategorizer, MultiLabel_TextCategorizer
-from ..errors import Errors
-from ..training import Corpus
-from ._util import app, Arg, Opt, import_code, setup_gpu
 from .. import util
+from ..errors import Errors
+from ..pipeline import MultiLabel_TextCategorizer, TextCategorizer
+from ..training import Corpus
+from ._util import Arg, Opt, app, import_code, setup_gpu

 _DEFAULTS = {
     "n_trials": 11,
spacy/cli/info.py

@@ -1,15 +1,15 @@
-from typing import Optional, Dict, Any, Union, List
-import platform
 import json
+import platform
 from pathlib import Path
-from wasabi import Printer, MarkdownRenderer
-import srsly
+from typing import Any, Dict, List, Optional, Union

-from ._util import app, Arg, Opt, string_to_list
-from .download import get_model_filename, get_latest_version
-from .. import util
-from .. import about
+import srsly
+from wasabi import MarkdownRenderer, Printer
+
+from .. import about, util
 from ..compat import importlib_metadata
+from ._util import Arg, Opt, app, string_to_list
+from .download import get_latest_version, get_model_filename


 @app.command("info")
spacy/cli/init_config.py

@@ -1,19 +1,26 @@
-from typing import Optional, List, Tuple
+import re
 from enum import Enum
 from pathlib import Path
-from wasabi import Printer, diff_strings
-from thinc.api import Config
+from typing import List, Optional, Tuple

 import srsly
-import re
 from jinja2 import Template
+from thinc.api import Config
+from wasabi import Printer, diff_strings

 from .. import util
 from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
 from ..schemas import RecommendationSchema
 from ..util import SimpleFrozenList
-from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
-from ._util import string_to_list, import_code
-
+from ._util import (
+    COMMAND,
+    Arg,
+    Opt,
+    import_code,
+    init_cli,
+    show_validation_error,
+    string_to_list,
+)

 ROOT = Path(__file__).parent / "templates"
 TEMPLATE_PATH = ROOT / "quickstart_training.jinja"
spacy/cli/init_pipeline.py

@@ -1,15 +1,23 @@
-from typing import Optional
 import logging
 from pathlib import Path
-from wasabi import msg
-import typer
+from typing import Optional

 import srsly
+import typer
+from wasabi import msg

 from .. import util
-from ..training.initialize import init_nlp, convert_vectors
 from ..language import Language
-from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu
+from ..training.initialize import convert_vectors, init_nlp
+from ._util import (
+    Arg,
+    Opt,
+    import_code,
+    init_cli,
+    parse_config_overrides,
+    setup_gpu,
+    show_validation_error,
+)


 @init_cli.command("vectors")
@@ -24,6 +32,7 @@ def init_vectors_cli(
     name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
     jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
+    attr: str = Opt("ORTH", "--attr", "-a", help="Optional token attribute to use for vectors, e.g. LOWER or NORM"),
     # fmt: on
 ):
     """Convert word vectors for use with spaCy. Will export an nlp object that
@@ -42,6 +51,7 @@ def init_vectors_cli(
         prune=prune,
         name=name,
         mode=mode,
+        attr=attr,
     )
     msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
     nlp.to_disk(output_dir)
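`init vectors` grows an `--attr` option, threaded through to the conversion call as `attr=attr`, so vectors can be keyed by a token attribute other than the default `ORTH`. A small hedged sketch of what that means at the `Vectors` level (relying on the `Vectors(attr=...)` parameter that the new warning W125 further down also points to):

```python
import numpy
from spacy.vectors import Vectors

# Keyed by LOWER instead of ORTH: "Apple" and "apple" map to the same row.
vectors = Vectors(shape=(1, 4), attr="LOWER")
vectors.add("apple", vector=numpy.ones((4,), dtype="f"))
```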
spacy/cli/package.py

@@ -1,18 +1,18 @@
-from typing import Optional, Union, Any, Dict, List, Tuple, cast
-import shutil
-from pathlib import Path
-from wasabi import Printer, MarkdownRenderer, get_raw_input
-from thinc.api import Config
-from collections import defaultdict
-from catalogue import RegistryError
-import srsly
-import sys
 import re
+import shutil
+import sys
+from collections import defaultdict
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union, cast

-from ._util import app, Arg, Opt, string_to_list, WHEEL_SUFFIX, SDIST_SUFFIX
-from ..schemas import validate, ModelMetaSchema
-from .. import util
-from .. import about
+import srsly
+from catalogue import RegistryError
+from thinc.api import Config
+from wasabi import MarkdownRenderer, Printer, get_raw_input
+
+from .. import about, util
+from ..schemas import ModelMetaSchema, validate
+from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app, string_to_list


 @app.command("package")
spacy/cli/pretrain.py

@@ -1,13 +1,21 @@
-from typing import Optional
-from pathlib import Path
-from wasabi import msg
-import typer
 import re
+from pathlib import Path
+from typing import Optional
+
+import typer
+from wasabi import msg

-from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu
 from ..training.pretrain import pretrain
 from ..util import load_config
+from ._util import (
+    Arg,
+    Opt,
+    app,
+    import_code,
+    parse_config_overrides,
+    setup_gpu,
+    show_validation_error,
+)


 @app.command(
spacy/cli/profile.py

@@ -1,17 +1,18 @@
-from typing import Optional, Sequence, Union, Iterator
-import tqdm
-from pathlib import Path
-import srsly
 import cProfile
+import itertools
 import pstats
 import sys
-import itertools
-from wasabi import msg, Printer
-import typer
+from pathlib import Path
+from typing import Iterator, Optional, Sequence, Union

+import srsly
+import tqdm
+import typer
+from wasabi import Printer, msg

-from ._util import app, debug_cli, Arg, Opt, NAME
 from ..language import Language
 from ..util import load_model
+from ._util import NAME, Arg, Opt, app, debug_cli


 @debug_cli.command("profile")
spacy/cli/project/assets.py

@@ -1,16 +1,27 @@
-from typing import Any, Dict, Optional
-from pathlib import Path
-from wasabi import msg
 import os
 import re
 import shutil
+from pathlib import Path
+from typing import Any, Dict, Optional

 import requests
 import typer
+from wasabi import msg

 from ...util import ensure_path, working_dir
-from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
-from .._util import get_checksum, download_file, git_checkout, get_git_version
-from .._util import SimpleFrozenDict, parse_config_overrides
+from .._util import (
+    PROJECT_FILE,
+    Arg,
+    Opt,
+    SimpleFrozenDict,
+    download_file,
+    get_checksum,
+    get_git_version,
+    git_checkout,
+    load_project_config,
+    parse_config_overrides,
+    project_cli,
+)

 # Whether assets are extra if `extra` is not set.
 EXTRA_DEFAULT = False
spacy/cli/project/clone.py

@@ -1,13 +1,22 @@
-from typing import Optional
-from pathlib import Path
-from wasabi import msg
-import subprocess
 import re
+import subprocess
+from pathlib import Path
+from typing import Optional
+
+from wasabi import msg

 from ... import about
 from ...util import ensure_path
-from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
-from .._util import git_checkout, get_git_version, git_repo_branch_exists
+from .._util import (
+    COMMAND,
+    PROJECT_FILE,
+    Arg,
+    Opt,
+    get_git_version,
+    git_checkout,
+    git_repo_branch_exists,
+    project_cli,
+)

 DEFAULT_REPO = about.__projects__
 DEFAULT_PROJECTS_BRANCH = about.__projects_branch__
spacy/cli/project/document.py

@@ -1,9 +1,9 @@
 from pathlib import Path
-from wasabi import msg, MarkdownRenderer

+from wasabi import MarkdownRenderer, msg

 from ...util import working_dir
-from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
+from .._util import PROJECT_FILE, Arg, Opt, load_project_config, project_cli

 DOCS_URL = "https://spacy.io"
 INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the
spacy/cli/project/dvc.py

@@ -1,15 +1,28 @@
 """This module contains helpers and subcommands for integrating spaCy projects
 with Data Version Control (DVC). https://dvc.org"""
-from typing import Dict, Any, List, Optional, Iterable
 import subprocess
 from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional

 from wasabi import msg

-from .._util import PROJECT_FILE, load_project_config, get_hash, project_cli
-from .._util import Arg, Opt, NAME, COMMAND
-from ...util import working_dir, split_command, join_command, run_command
-from ...util import SimpleFrozenList
-
+from ...util import (
+    SimpleFrozenList,
+    join_command,
+    run_command,
+    split_command,
+    working_dir,
+)
+from .._util import (
+    COMMAND,
+    NAME,
+    PROJECT_FILE,
+    Arg,
+    Opt,
+    get_hash,
+    load_project_config,
+    project_cli,
+)

 DVC_CONFIG = "dvc.yaml"
 DVC_DIR = ".dvc"
spacy/cli/project/pull.py

@@ -1,9 +1,9 @@
 from pathlib import Path

 from wasabi import msg
-from .remote_storage import RemoteStorage
-from .remote_storage import get_command_hash
-from .._util import project_cli, Arg, logger
-from .._util import load_project_config
+
+from .._util import Arg, load_project_config, logger, project_cli
+from .remote_storage import RemoteStorage, get_command_hash
 from .run import update_lockfile
spacy/cli/project/push.py

@@ -1,9 +1,9 @@
 from pathlib import Path

 from wasabi import msg
-from .remote_storage import RemoteStorage
-from .remote_storage import get_content_hash, get_command_hash
-from .._util import load_project_config
-from .._util import project_cli, Arg, logger
+
+from .._util import Arg, load_project_config, logger, project_cli
+from .remote_storage import RemoteStorage, get_command_hash, get_content_hash


 @project_cli.command("push")
spacy/cli/project/remote_storage.py

@@ -1,18 +1,25 @@
-from typing import Optional, List, Dict, TYPE_CHECKING
+import hashlib
 import os
 import site
-import hashlib
-import urllib.parse
 import tarfile
+import urllib.parse
 from pathlib import Path
+from typing import TYPE_CHECKING, Dict, List, Optional

 from wasabi import msg

-from .._util import get_hash, get_checksum, upload_file, download_file
-from .._util import ensure_pathy, make_tempdir
-from ...util import get_minor_version, ENV_VARS, check_bool_env_var
-from ...git_info import GIT_VERSION
 from ... import about
 from ...errors import Errors
+from ...git_info import GIT_VERSION
+from ...util import ENV_VARS, check_bool_env_var, get_minor_version
+from .._util import (
+    download_file,
+    ensure_pathy,
+    get_checksum,
+    get_hash,
+    make_tempdir,
+    upload_file,
+)

 if TYPE_CHECKING:
     from pathy import FluidPath  # noqa: F401
spacy/cli/project/run.py

@@ -1,20 +1,39 @@
-from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple
 import os.path
-from pathlib import Path
-
-from wasabi import msg
-from wasabi.util import locale_escape
 import sys
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple

+import srsly
+import typer
+from wasabi import msg
+from wasabi.util import locale_escape

 from ... import about
 from ...git_info import GIT_VERSION
-from ...util import working_dir, run_command, split_command, is_cwd, join_command
-from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS
-from ...util import check_bool_env_var, SimpleFrozenDict
-from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
-from .._util import get_checksum, project_cli, Arg, Opt, COMMAND, parse_config_overrides
+from ...util import (
+    ENV_VARS,
+    SimpleFrozenDict,
+    SimpleFrozenList,
+    check_bool_env_var,
+    is_cwd,
+    is_minor_version_match,
+    join_command,
+    run_command,
+    split_command,
+    working_dir,
+)
+from .._util import (
+    COMMAND,
+    PROJECT_FILE,
+    PROJECT_LOCK,
+    Arg,
+    Opt,
+    get_checksum,
+    get_hash,
+    load_project_config,
+    parse_config_overrides,
+    project_cli,
+)


 @project_cli.command(
spacy/cli/templates/quickstart_training.jinja

@@ -3,7 +3,7 @@ the docs and the init config command. It encodes various best practices and
 can help generate the best possible configuration, given a user's requirements. #}
 {%- set use_transformer = hardware != "cpu" and transformer_data -%}
 {%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
-{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
+{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "span_finder", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
 [paths]
 train = null
 dev = null
@@ -28,7 +28,7 @@ lang = "{{ lang }}"
 tok2vec/transformer. #}
 {%- set with_accuracy_or_transformer = (use_transformer or with_accuracy) -%}
 {%- set textcat_needs_features = has_textcat and with_accuracy_or_transformer -%}
-{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
+{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "span_finder" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
 {%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%}
 {%- else -%}
 {%- set full_pipeline = components -%}
@@ -127,6 +127,30 @@ grad_factor = 1.0
 @layers = "reduce_mean.v1"
 {% endif -%}

+{% if "span_finder" in components -%}
+[components.span_finder]
+factory = "span_finder"
+max_length = 25
+min_length = null
+scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
+spans_key = "sc"
+threshold = 0.5
+
+[components.span_finder.model]
+@architectures = "spacy.SpanFinder.v1"
+
+[components.span_finder.model.scorer]
+@layers = "spacy.LinearLogistic.v1"
+nO = 2
+
+[components.span_finder.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+
+[components.span_finder.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+{% endif -%}
+
 {% if "spancat" in components -%}
 [components.spancat]
 factory = "spancat"
@@ -392,6 +416,27 @@ nO = null
 width = ${components.tok2vec.model.encode.width}
 {% endif %}

+{% if "span_finder" in components %}
+[components.span_finder]
+factory = "span_finder"
+max_length = 25
+min_length = null
+scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
+spans_key = "sc"
+threshold = 0.5
+
+[components.span_finder.model]
+@architectures = "spacy.SpanFinder.v1"
+
+[components.span_finder.model.scorer]
+@layers = "spacy.LinearLogistic.v1"
+nO = 2
+
+[components.span_finder.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+{% endif %}
+
 {% if "spancat" in components %}
 [components.spancat]
 factory = "spancat"
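Both branches of the template (transformer and CPU tok2vec) now emit the same `span_finder` block, differing only in the tok2vec listener. As a rough runtime illustration of those settings — a sketch assuming spaCy 3.6+, where the `span_finder` factory is registered:

```python
import spacy

nlp = spacy.blank("en")
# Mirrors the template's settings: the component proposes candidate spans
# (bounded to 25 tokens) and writes them to doc.spans["sc"].
nlp.add_pipe(
    "span_finder",
    config={
        "max_length": 25,
        "min_length": None,
        "spans_key": "sc",
        "threshold": 0.5,
    },
)
print(nlp.pipe_names)  # ['span_finder']
```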
spacy/cli/train.py

@@ -1,15 +1,23 @@
-from typing import Optional, Dict, Any, Union
-from pathlib import Path
-from wasabi import msg
-import typer
 import logging
 import sys
+from pathlib import Path
+from typing import Any, Dict, Optional, Union
+
+import typer
+from wasabi import msg

-from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
-from ._util import import_code, setup_gpu
-from ..training.loop import train as train_nlp
-from ..training.initialize import init_nlp
 from .. import util
+from ..training.initialize import init_nlp
+from ..training.loop import train as train_nlp
+from ._util import (
+    Arg,
+    Opt,
+    app,
+    import_code,
+    parse_config_overrides,
+    setup_gpu,
+    show_validation_error,
+)


 @app.command(
spacy/cli/validate.py

@@ -1,14 +1,21 @@
-from typing import Tuple
-from pathlib import Path
 import sys
-import requests
-from wasabi import msg, Printer
 import warnings
+from pathlib import Path
+from typing import Tuple

+import requests
+from wasabi import Printer, msg

-from ._util import app
 from .. import about
-from ..util import get_package_version, get_installed_models, get_minor_version
-from ..util import get_package_path, get_model_meta, is_compatible_version
+from ..util import (
+    get_installed_models,
+    get_minor_version,
+    get_model_meta,
+    get_package_path,
+    get_package_version,
+    is_compatible_version,
+)
+from ._util import app


 @app.command("validate")
spacy/compat.py

@@ -1,5 +1,6 @@
 """Helpers for Python and platform compatibility."""
 import sys
+
 from thinc.util import copy_array

 try:
spacy/displacy/__init__.py

@@ -4,15 +4,13 @@ spaCy's built in visualization suite for dependencies and named entities.
 DOCS: https://spacy.io/api/top-level#displacy
 USAGE: https://spacy.io/usage/visualizers
 """
-from typing import Union, Iterable, Optional, Dict, Any, Callable
 import warnings
+from typing import Any, Callable, Dict, Iterable, Optional, Union

-from .render import DependencyRenderer, EntityRenderer, SpanRenderer
-from ..tokens import Doc, Span
 from ..errors import Errors, Warnings
-from ..util import is_in_jupyter
-from ..util import find_available_port
-
+from ..tokens import Doc, Span
+from ..util import find_available_port, is_in_jupyter
+from .render import DependencyRenderer, EntityRenderer, SpanRenderer

 _html = {}
 RENDER_WRAPPER = None
@@ -68,7 +66,7 @@ def render(
     if jupyter or (jupyter is None and is_in_jupyter()):
         # return HTML rendered by IPython display()
         # See #4840 for details on span wrapper to disable mathjax
-        from IPython.core.display import display, HTML
+        from IPython.core.display import HTML, display

         return display(HTML('<span class="tex2jax_ignore">{}</span>'.format(html)))
     return html
@ -1,15 +1,28 @@
|
|||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
import uuid
|
||||
import itertools
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
from ..errors import Errors
|
||||
from ..util import escape_html, minify_html, registry
|
||||
from .templates import TPL_DEP_ARCS, TPL_DEP_SVG, TPL_DEP_WORDS
|
||||
from .templates import TPL_DEP_WORDS_LEMMA, TPL_ENT, TPL_ENT_RTL, TPL_ENTS
|
||||
from .templates import TPL_FIGURE, TPL_KB_LINK, TPL_PAGE, TPL_SPAN
|
||||
from .templates import TPL_SPAN_RTL, TPL_SPAN_SLICE, TPL_SPAN_SLICE_RTL
|
||||
from .templates import TPL_SPAN_START, TPL_SPAN_START_RTL, TPL_SPANS
|
||||
from .templates import TPL_TITLE
|
||||
from .templates import (
|
||||
TPL_DEP_ARCS,
|
||||
TPL_DEP_SVG,
|
||||
TPL_DEP_WORDS,
|
||||
TPL_DEP_WORDS_LEMMA,
|
||||
TPL_ENT,
|
||||
TPL_ENT_RTL,
|
||||
TPL_ENTS,
|
||||
TPL_FIGURE,
|
||||
TPL_KB_LINK,
|
||||
TPL_PAGE,
|
||||
TPL_SPAN,
|
||||
TPL_SPAN_RTL,
|
||||
TPL_SPAN_SLICE,
|
||||
TPL_SPAN_SLICE_RTL,
|
||||
TPL_SPAN_START,
|
||||
TPL_SPAN_START_RTL,
|
||||
TPL_SPANS,
|
||||
TPL_TITLE,
|
||||
)
|
||||
|
||||
DEFAULT_LANG = "en"
|
||||
DEFAULT_DIR = "ltr"
|
||||
|
@ -204,7 +217,7 @@ class SpanRenderer:
|
|||
+ (self.offset_step * (len(entities) - 1))
|
||||
)
|
||||
markup += self.span_template.format(
|
||||
text=token["text"],
|
||||
text=escape_html(token["text"]),
|
||||
span_slices=slices,
|
||||
span_starts=starts,
|
||||
total_height=total_height,
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
import warnings
|
||||
|
||||
from .compat import Literal
|
||||
|
||||
|
||||
|
@ -215,6 +216,9 @@ class Warnings(metaclass=ErrorsWithCodes):
|
|||
W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
|
||||
"`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
|
||||
W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
|
||||
W125 = ("The StaticVectors key_attr is no longer used. To set a custom "
|
||||
"key attribute for vectors, configure it through Vectors(attr=) or "
|
||||
"'spacy init vectors --attr'")
|
||||
|
||||
|
||||
class Errors(metaclass=ErrorsWithCodes):
|
||||
|
@ -738,8 +742,8 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
"model from a shortcut, which is obsolete as of spaCy v3.0. To "
|
||||
"load the model, use its full name instead:\n\n"
|
||||
"nlp = spacy.load(\"{full}\")\n\nFor more details on the available "
|
||||
"models, see the models directory: https://spacy.io/models. If you "
|
||||
"want to create a blank model, use spacy.blank: "
|
||||
"models, see the models directory: https://spacy.io/models and if "
|
||||
"you want to create a blank model, use spacy.blank: "
|
||||
"nlp = spacy.blank(\"{name}\")")
|
||||
E942 = ("Executing `after_{name}` callback failed. Expected the function to "
|
||||
"return an initialized nlp object but got: {value}. Maybe "
|
||||
|
@ -970,6 +974,13 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
|
||||
"or use `auto_select_port=True` to pick an available port automatically.")
|
||||
E1051 = ("'allow_overlap' can only be False when max_positive is 1, but found 'max_positive': {max_positive}.")
|
||||
E1052 = ("Unable to copy spans: the character offsets for the span at "
|
||||
"index {i} in the span group do not align with the tokenization "
|
||||
"in the target doc.")
|
||||
E1053 = ("Both 'min_length' and 'max_length' should be larger than 0, but found"
|
||||
" 'min_length': {min_length}, 'max_length': {max_length}")
|
||||
E1054 = ("The text, including whitespace, must match between reference and "
|
||||
"predicted docs when training {component}.")
|
||||
|
||||
|
||||
# Deprecated model shortcuts, only used in errors and warnings
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
import warnings
|
||||
|
||||
from .errors import Warnings
|
||||
|
||||
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
from .candidate import Candidate, get_candidates, get_candidates_batch
|
||||
from .kb import KnowledgeBase
|
||||
from .kb_in_memory import InMemoryLookupKB
|
||||
from .candidate import Candidate, get_candidates, get_candidates_batch
|
||||
|
|
|
@ -1,8 +1,11 @@
|
|||
from .kb cimport KnowledgeBase
|
||||
from libcpp.vector cimport vector
|
||||
from ..typedefs cimport hash_t
|
||||
|
||||
# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
|
||||
from ..typedefs cimport hash_t
|
||||
from .kb cimport KnowledgeBase
|
||||
|
||||
|
||||
# Object used by the Entity Linker that summarizes one entity-alias candidate
|
||||
# combination.
|
||||
cdef class Candidate:
|
||||
cdef readonly KnowledgeBase kb
|
||||
cdef hash_t entity_hash
|
||||
|
|
|
@ -1,19 +1,31 @@
|
|||
# cython: infer_types=True, profile=True
|
||||
|
||||
from typing import Iterable
|
||||
|
||||
from .kb cimport KnowledgeBase
|
||||
|
||||
from ..tokens import Span
|
||||
|
||||
|
||||
cdef class Candidate:
|
||||
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
|
||||
to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
|
||||
algorithm which will disambiguate the various candidates to the correct one.
|
||||
"""A `Candidate` object refers to a textual mention (`alias`) that may or
|
||||
may not be resolved to a specific `entity` from a Knowledge Base. This
|
||||
will be used as input for the entity linking algorithm which will
|
||||
disambiguate the various candidates to the correct one.
|
||||
Each candidate (alias, entity) pair is assigned a certain prior probability.
|
||||
|
||||
DOCS: https://spacy.io/api/kb/#candidate-init
|
||||
"""
|
||||
|
||||
def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
|
||||
def __init__(
|
||||
self,
|
||||
KnowledgeBase kb,
|
||||
entity_hash,
|
||||
entity_freq,
|
||||
entity_vector,
|
||||
alias_hash,
|
||||
prior_prob
|
||||
):
|
||||
self.kb = kb
|
||||
self.entity_hash = entity_hash
|
||||
self.entity_freq = entity_freq
|
||||
|
@ -56,7 +68,8 @@ cdef class Candidate:
|
|||
|
||||
def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
|
||||
"""
|
||||
Return candidate entities for a given mention and fetching appropriate entries from the index.
|
||||
Return candidate entities for a given mention and fetching appropriate
|
||||
entries from the index.
|
||||
kb (KnowledgeBase): Knowledge base to query.
|
||||
mention (Span): Entity mention for which to identify candidates.
|
||||
RETURNS (Iterable[Candidate]): Identified candidates.
|
||||
|
@ -64,9 +77,12 @@ def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
|
|||
return kb.get_candidates(mention)
|
||||
|
||||
|
||||
def get_candidates_batch(kb: KnowledgeBase, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
|
||||
def get_candidates_batch(
|
||||
kb: KnowledgeBase, mentions: Iterable[Span]
|
||||
) -> Iterable[Iterable[Candidate]]:
|
||||
"""
|
||||
Return candidate entities for the given mentions and fetching appropriate entries from the index.
|
||||
Return candidate entities for the given mentions and fetching appropriate entries
|
||||
from the index.
|
||||
kb (KnowledgeBase): Knowledge base to query.
|
||||
mention (Iterable[Span]): Entity mentions for which to identify candidates.
|
||||
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
|
||||
|
|
|
@ -2,8 +2,10 @@
|
|||
|
||||
from cymem.cymem cimport Pool
|
||||
from libc.stdint cimport int64_t
|
||||
|
||||
from ..vocab cimport Vocab
|
||||
|
||||
|
||||
cdef class KnowledgeBase:
|
||||
cdef Pool mem
|
||||
cdef readonly Vocab vocab
|
||||
|
|
|
@ -2,17 +2,19 @@
|
|||
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Tuple, Union
|
||||
|
||||
from cymem.cymem cimport Pool
|
||||
|
||||
from .candidate import Candidate
|
||||
from ..errors import Errors
|
||||
from ..tokens import Span
|
||||
from ..util import SimpleFrozenList
|
||||
from ..errors import Errors
|
||||
from .candidate import Candidate
|
||||
|
||||
|
||||
cdef class KnowledgeBase:
|
||||
"""A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
|
||||
to support entity linking of named entities to real-world concepts.
|
||||
"""A `KnowledgeBase` instance stores unique identifiers for entities and
|
||||
their textual aliases, to support entity linking of named entities to
|
||||
real-world concepts.
|
||||
This is an abstract class and requires its operations to be implemented.
|
||||
|
||||
DOCS: https://spacy.io/api/kb
|
||||
|
@ -30,10 +32,13 @@ cdef class KnowledgeBase:
|
|||
self.entity_vector_length = entity_vector_length
|
||||
self.mem = Pool()
|
||||
|
||||
def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
|
||||
def get_candidates_batch(
|
||||
self, mentions: Iterable[Span]
|
||||
) -> Iterable[Iterable[Candidate]]:
|
||||
"""
|
||||
Return candidate entities for specified texts. Each candidate defines the entity, the original alias,
|
||||
and the prior probability of that alias resolving to that entity.
|
||||
Return candidate entities for specified texts. Each candidate defines
|
||||
the entity, the original alias, and the prior probability of that
|
||||
alias resolving to that entity.
|
||||
If no candidate is found for a given text, an empty list is returned.
|
||||
mentions (Iterable[Span]): Mentions for which to get candidates.
|
||||
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
|
||||
|
@ -42,14 +47,17 @@ cdef class KnowledgeBase:
|
|||
|
||||
def get_candidates(self, mention: Span) -> Iterable[Candidate]:
|
||||
"""
|
||||
Return candidate entities for specified text. Each candidate defines the entity, the original alias,
|
||||
Return candidate entities for specified text. Each candidate defines
|
||||
the entity, the original alias,
|
||||
and the prior probability of that alias resolving to that entity.
|
||||
If the no candidate is found for a given text, an empty list is returned.
|
||||
mention (Span): Mention for which to get candidates.
|
||||
RETURNS (Iterable[Candidate]): Identified candidates.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__)
|
||||
Errors.E1045.format(
|
||||
parent="KnowledgeBase", method="get_candidates", name=self.__name__
|
||||
)
|
||||
)
|
||||
|
||||
def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]:
|
||||
|
@ -67,7 +75,9 @@ cdef class KnowledgeBase:
|
|||
RETURNS (Iterable[float]): Vector for specified entity.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
Errors.E1045.format(parent="KnowledgeBase", method="get_vector", name=self.__name__)
|
||||
Errors.E1045.format(
|
||||
parent="KnowledgeBase", method="get_vector", name=self.__name__
|
||||
)
|
||||
)
|
||||
|
||||
def to_bytes(self, **kwargs) -> bytes:
|
||||
|
@ -75,7 +85,9 @@ cdef class KnowledgeBase:
|
|||
RETURNS (bytes): Current state as binary string.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
Errors.E1045.format(parent="KnowledgeBase", method="to_bytes", name=self.__name__)
|
||||
Errors.E1045.format(
|
||||
parent="KnowledgeBase", method="to_bytes", name=self.__name__
|
||||
)
|
||||
)
|
||||
|
||||
def from_bytes(self, bytes_data: bytes, *, exclude: Tuple[str] = tuple()):
|
||||
|
@ -84,25 +96,35 @@ cdef class KnowledgeBase:
|
|||
exclude (Tuple[str]): Properties to exclude when restoring KB.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
Errors.E1045.format(parent="KnowledgeBase", method="from_bytes", name=self.__name__)
|
||||
Errors.E1045.format(
|
||||
parent="KnowledgeBase", method="from_bytes", name=self.__name__
|
||||
)
|
||||
)
|
||||
|
||||
def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
|
||||
def to_disk(
|
||||
self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()
|
||||
) -> None:
|
||||
"""
|
||||
Write KnowledgeBase content to disk.
|
||||
path (Union[str, Path]): Target file path.
|
||||
exclude (Iterable[str]): List of components to exclude.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
Errors.E1045.format(parent="KnowledgeBase", method="to_disk", name=self.__name__)
|
||||
Errors.E1045.format(
|
||||
parent="KnowledgeBase", method="to_disk", name=self.__name__
|
||||
)
|
||||
)
|
||||
|
||||
def from_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
|
||||
def from_disk(
|
||||
self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()
|
||||
) -> None:
|
||||
"""
|
||||
Load KnowledgeBase content from disk.
|
||||
path (Union[str, Path]): Target file path.
|
||||
exclude (Iterable[str]): List of components to exclude.
|
||||
"""
|
||||
raise NotImplementedError(
|
||||
Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__)
|
||||
Errors.E1045.format(
|
||||
parent="KnowledgeBase", method="from_disk", name=self.__name__
|
||||
)
|
||||
)
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
"""Knowledge-base for entity or concept linking."""
|
||||
from preshed.maps cimport PreshMap
|
||||
from libcpp.vector cimport vector
|
||||
from libc.stdint cimport int32_t, int64_t
|
||||
from libc.stdio cimport FILE
|
||||
from libcpp.vector cimport vector
|
||||
from preshed.maps cimport PreshMap
|
||||
|
||||
from ..structs cimport AliasC, KBEntryC
|
||||
from ..typedefs cimport hash_t
|
||||
from ..structs cimport KBEntryC, AliasC
|
||||
from .kb cimport KnowledgeBase
|
||||
|
||||
ctypedef vector[KBEntryC] entry_vec
|
||||
|
@ -55,23 +55,28 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
# optional data, we can let users configure a DB as the backend for this.
|
||||
cdef object _features_table
|
||||
|
||||
|
||||
cdef inline int64_t c_add_vector(self, vector[float] entity_vector) nogil:
|
||||
"""Add an entity vector to the vectors table."""
|
||||
cdef int64_t new_index = self._vectors_table.size()
|
||||
self._vectors_table.push_back(entity_vector)
|
||||
return new_index
|
||||
|
||||
|
||||
cdef inline int64_t c_add_entity(self, hash_t entity_hash, float freq,
|
||||
int32_t vector_index, int feats_row) nogil:
|
||||
cdef inline int64_t c_add_entity(
|
||||
self,
|
||||
hash_t entity_hash,
|
||||
float freq,
|
||||
int32_t vector_index,
|
||||
int feats_row
|
||||
) nogil:
|
||||
"""Add an entry to the vector of entries.
|
||||
After calling this method, make sure to update also the _entry_index using the return value"""
|
||||
After calling this method, make sure to update also the _entry_index
|
||||
using the return value"""
|
||||
# This is what we'll map the entity hash key to. It's where the entry will sit
|
||||
# in the vector of entries, so we can get it later.
|
||||
cdef int64_t new_index = self._entries.size()
|
||||
|
||||
# Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642
|
||||
# Avoid struct initializer to enable nogil, cf.
|
||||
# https://github.com/cython/cython/issues/1642
|
||||
cdef KBEntryC entry
|
||||
entry.entity_hash = entity_hash
|
||||
entry.vector_index = vector_index
|
||||
|
@ -81,11 +86,17 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
self._entries.push_back(entry)
|
||||
return new_index
|
||||
|
||||
cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs) nogil:
|
||||
"""Connect a mention to a list of potential entities with their prior probabilities .
|
||||
After calling this method, make sure to update also the _alias_index using the return value"""
|
||||
# This is what we'll map the alias hash key to. It's where the alias will be defined
|
||||
# in the vector of aliases.
|
||||
cdef inline int64_t c_add_aliases(
|
||||
self,
|
||||
hash_t alias_hash,
|
||||
vector[int64_t] entry_indices,
|
||||
vector[float] probs
|
||||
) nogil:
|
||||
"""Connect a mention to a list of potential entities with their prior
|
||||
probabilities. After calling this method, make sure to update also the
|
||||
_alias_index using the return value"""
|
||||
# This is what we'll map the alias hash key to. It's where the alias will be
|
||||
# defined in the vector of aliases.
|
||||
cdef int64_t new_index = self._aliases_table.size()
|
||||
|
||||
# Avoid struct initializer to enable nogil
|
||||
|
@ -98,8 +109,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
|
||||
cdef inline void _create_empty_vectors(self, hash_t dummy_hash) nogil:
|
||||
"""
|
||||
Initializing the vectors and making sure the first element of each vector is a dummy,
|
||||
because the PreshMap maps pointing to indices in these vectors can not contain 0 as value
|
||||
Initializing the vectors and making sure the first element of each vector is a
|
||||
dummy, because the PreshMap maps pointing to indices in these vectors can not
|
||||
contain 0 as value.
|
||||
cf. https://github.com/explosion/preshed/issues/17
|
||||
"""
|
||||
cdef int32_t dummy_value = 0
|
||||
|
@ -130,12 +142,18 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
cdef class Writer:
|
||||
cdef FILE* _fp
|
||||
|
||||
cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1
|
||||
cdef int write_header(
|
||||
self, int64_t nr_entries, int64_t entity_vector_length
|
||||
) except -1
|
||||
cdef int write_vector_element(self, float element) except -1
|
||||
cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1
|
||||
cdef int write_entry(
|
||||
self, hash_t entry_hash, float entry_freq, int32_t vector_index
|
||||
) except -1
|
||||
|
||||
cdef int write_alias_length(self, int64_t alias_length) except -1
|
||||
cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1
|
||||
cdef int write_alias_header(
|
||||
self, hash_t alias_hash, int64_t candidate_length
|
||||
) except -1
|
||||
cdef int write_alias(self, int64_t entry_index, float prob) except -1
|
||||
|
||||
cdef int _write(self, void* value, size_t size) except -1
|
||||
|
@ -143,12 +161,18 @@ cdef class Writer:
|
|||
cdef class Reader:
|
||||
cdef FILE* _fp
|
||||
|
||||
cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1
|
||||
cdef int read_header(
|
||||
self, int64_t* nr_entries, int64_t* entity_vector_length
|
||||
) except -1
|
||||
cdef int read_vector_element(self, float* element) except -1
|
||||
cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1
|
||||
cdef int read_entry(
|
||||
self, hash_t* entity_hash, float* freq, int32_t* vector_index
|
||||
) except -1
|
||||
|
||||
cdef int read_alias_length(self, int64_t* alias_length) except -1
|
||||
cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1
|
||||
cdef int read_alias_header(
|
||||
self, hash_t* alias_hash, int64_t* candidate_length
|
||||
) except -1
|
||||
cdef int read_alias(self, int64_t* entry_index, float* prob) except -1
|
||||
|
||||
cdef int _read(self, void* value, size_t size) except -1
|
||||
|
|
|
@ -1,29 +1,35 @@
|
|||
# cython: infer_types=True, profile=True
|
||||
from typing import Iterable, Callable, Dict, Any, Union
|
||||
from typing import Any, Callable, Dict, Iterable
|
||||
|
||||
import srsly
|
||||
from preshed.maps cimport PreshMap
|
||||
from cpython.exc cimport PyErr_SetFromErrno
|
||||
from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
|
||||
from libc.stdint cimport int32_t, int64_t
|
||||
from libcpp.vector cimport vector
|
||||
|
||||
from pathlib import Path
|
||||
from cpython.exc cimport PyErr_SetFromErrno
|
||||
from libc.stdint cimport int32_t, int64_t
|
||||
from libc.stdio cimport fclose, feof, fopen, fread, fseek, fwrite
|
||||
from libcpp.vector cimport vector
|
||||
from preshed.maps cimport PreshMap
|
||||
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
|
||||
from ..tokens import Span
|
||||
|
||||
from ..typedefs cimport hash_t
|
||||
from ..errors import Errors, Warnings
|
||||
|
||||
from .. import util
|
||||
from ..errors import Errors, Warnings
|
||||
from ..util import SimpleFrozenList, ensure_path
|
||||
|
||||
from ..vocab cimport Vocab
|
||||
from .kb cimport KnowledgeBase
|
||||
|
||||
from .candidate import Candidate as Candidate
|
||||
|
||||
|
||||
cdef class InMemoryLookupKB(KnowledgeBase):
|
||||
"""An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases,
|
||||
to support entity linking of named entities to real-world concepts.
|
||||
"""An `InMemoryLookupKB` instance stores unique identifiers for entities
|
||||
and their textual aliases, to support entity linking of named entities to
|
||||
real-world concepts.
|
||||
|
||||
DOCS: https://spacy.io/api/inmemorylookupkb
|
||||
"""
|
||||
|
@ -66,7 +72,8 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
|
||||
def add_entity(self, str entity, float freq, vector[float] entity_vector):
|
||||
"""
|
||||
Add an entity to the KB, optionally specifying its log probability based on corpus frequency
|
||||
Add an entity to the KB, optionally specifying its log probability
|
||||
based on corpus frequency.
|
||||
Return the hash of the entity ID/name at the end.
|
||||
"""
|
||||
cdef hash_t entity_hash = self.vocab.strings.add(entity)
|
||||
|
@ -78,14 +85,20 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
|
||||
# Raise an error if the provided entity vector is not of the correct length
|
||||
if len(entity_vector) != self.entity_vector_length:
|
||||
raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
|
||||
raise ValueError(
|
||||
Errors.E141.format(
|
||||
found=len(entity_vector), required=self.entity_vector_length
|
||||
)
|
||||
)
|
||||
|
||||
vector_index = self.c_add_vector(entity_vector=entity_vector)
|
||||
|
||||
new_index = self.c_add_entity(entity_hash=entity_hash,
|
||||
freq=freq,
|
||||
vector_index=vector_index,
|
||||
feats_row=-1) # Features table currently not implemented
|
||||
new_index = self.c_add_entity(
|
||||
entity_hash=entity_hash,
|
||||
freq=freq,
|
||||
vector_index=vector_index,
|
||||
feats_row=-1
|
||||
) # Features table currently not implemented
|
||||
self._entry_index[entity_hash] = new_index
|
||||
|
||||
return entity_hash
|
||||
|
@ -110,7 +123,12 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
else:
|
||||
entity_vector = vector_list[i]
|
||||
if len(entity_vector) != self.entity_vector_length:
|
||||
raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
|
||||
raise ValueError(
|
||||
Errors.E141.format(
|
||||
found=len(entity_vector),
|
||||
required=self.entity_vector_length
|
||||
)
|
||||
)
|
||||
|
||||
entry.entity_hash = entity_hash
|
||||
entry.freq = freq_list[i]
|
||||
|
@ -144,11 +162,15 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
previous_alias_nr = self.get_size_aliases()
|
||||
# Throw an error if the length of entities and probabilities are not the same
|
||||
if not len(entities) == len(probabilities):
|
||||
raise ValueError(Errors.E132.format(alias=alias,
|
||||
entities_length=len(entities),
|
||||
probabilities_length=len(probabilities)))
|
||||
raise ValueError(
|
||||
Errors.E132.format(
|
||||
alias=alias,
|
||||
entities_length=len(entities),
|
||||
probabilities_length=len(probabilities))
|
||||
)
|
||||
|
||||
# Throw an error if the probabilities sum up to more than 1 (allow for some rounding errors)
|
||||
# Throw an error if the probabilities sum up to more than 1 (allow for
|
||||
# some rounding errors)
|
||||
prob_sum = sum(probabilities)
|
||||
if prob_sum > 1.00001:
|
||||
raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum))
|
||||
|
@ -165,40 +187,47 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
|
||||
for entity, prob in zip(entities, probabilities):
|
||||
entity_hash = self.vocab.strings[entity]
|
||||
if not entity_hash in self._entry_index:
|
||||
if entity_hash not in self._entry_index:
|
||||
raise ValueError(Errors.E134.format(entity=entity))
|
||||
|
||||
entry_index = <int64_t>self._entry_index.get(entity_hash)
|
||||
entry_indices.push_back(int(entry_index))
|
||||
probs.push_back(float(prob))
|
||||
|
||||
new_index = self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs)
|
||||
new_index = self.c_add_aliases(
|
||||
alias_hash=alias_hash, entry_indices=entry_indices, probs=probs
|
||||
)
|
||||
self._alias_index[alias_hash] = new_index
|
||||
|
||||
if previous_alias_nr + 1 != self.get_size_aliases():
|
||||
raise RuntimeError(Errors.E891.format(alias=alias))
|
||||
return alias_hash
|
||||
|
||||
def append_alias(self, str alias, str entity, float prior_prob, ignore_warnings=False):
|
||||
def append_alias(
|
||||
self, str alias, str entity, float prior_prob, ignore_warnings=False
|
||||
):
|
||||
"""
|
||||
For an alias already existing in the KB, extend its potential entities with one more.
|
||||
For an alias already existing in the KB, extend its potential entities
|
||||
with one more.
|
||||
Throw a warning if either the alias or the entity is unknown,
|
||||
or when the combination is already previously recorded.
|
||||
Throw an error if this entity+prior prob would exceed the sum of 1.
|
||||
For efficiency, it's best to use the method `add_alias` as much as possible instead of this one.
|
||||
For efficiency, it's best to use the method `add_alias` as much as
|
||||
possible instead of this one.
|
||||
"""
|
||||
# Check if the alias exists in the KB
|
||||
cdef hash_t alias_hash = self.vocab.strings[alias]
|
||||
if not alias_hash in self._alias_index:
|
||||
if alias_hash not in self._alias_index:
|
||||
raise ValueError(Errors.E176.format(alias=alias))
|
||||
|
||||
# Check if the entity exists in the KB
|
||||
cdef hash_t entity_hash = self.vocab.strings[entity]
|
||||
if not entity_hash in self._entry_index:
|
||||
if entity_hash not in self._entry_index:
|
||||
raise ValueError(Errors.E134.format(entity=entity))
|
||||
entry_index = <int64_t>self._entry_index.get(entity_hash)
|
||||
|
||||
# Throw an error if the prior probabilities (including the new one) sum up to more than 1
|
||||
# Throw an error if the prior probabilities (including the new one)
|
||||
# sum up to more than 1
|
||||
alias_index = <int64_t>self._alias_index.get(alias_hash)
|
||||
alias_entry = self._aliases_table[alias_index]
|
||||
current_sum = sum([p for p in alias_entry.probs])
|
||||
|
@ -231,12 +260,13 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
|
||||
def get_alias_candidates(self, str alias) -> Iterable[Candidate]:
|
||||
"""
|
||||
Return candidate entities for an alias. Each candidate defines the entity, the original alias,
|
||||
and the prior probability of that alias resolving to that entity.
|
||||
Return candidate entities for an alias. Each candidate defines the
|
||||
entity, the original alias, and the prior probability of that alias
|
||||
resolving to that entity.
|
||||
If the alias is not known in the KB, and empty list is returned.
|
||||
"""
|
||||
cdef hash_t alias_hash = self.vocab.strings[alias]
|
||||
if not alias_hash in self._alias_index:
|
||||
if alias_hash not in self._alias_index:
|
||||
return []
|
||||
alias_index = <int64_t>self._alias_index.get(alias_hash)
|
||||
alias_entry = self._aliases_table[alias_index]
|
||||
|
@ -244,10 +274,14 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
return [Candidate(kb=self,
|
||||
entity_hash=self._entries[entry_index].entity_hash,
|
||||
entity_freq=self._entries[entry_index].freq,
|
||||
entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
|
||||
entity_vector=self._vectors_table[
|
||||
self._entries[entry_index].vector_index
|
||||
],
|
||||
alias_hash=alias_hash,
|
||||
prior_prob=prior_prob)
|
||||
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
|
||||
for (entry_index, prior_prob) in zip(
|
||||
alias_entry.entry_indices, alias_entry.probs
|
||||
)
|
||||
if entry_index != 0]
|
||||
|
||||
def get_vector(self, str entity):
|
||||
|
@ -261,8 +295,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
return self._vectors_table[self._entries[entry_index].vector_index]
|
||||
|
||||
def get_prior_prob(self, str entity, str alias):
|
||||
""" Return the prior probability of a given alias being linked to a given entity,
|
||||
or return 0.0 when this combination is not known in the knowledge base"""
|
||||
""" Return the prior probability of a given alias being linked to a
|
||||
given entity, or return 0.0 when this combination is not known in the
|
||||
knowledge base."""
|
||||
cdef hash_t alias_hash = self.vocab.strings[alias]
|
||||
cdef hash_t entity_hash = self.vocab.strings[entity]
|
||||
|
||||
|
@ -273,7 +308,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
entry_index = self._entry_index[entity_hash]
|
||||
|
||||
alias_entry = self._aliases_table[alias_index]
|
||||
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs):
|
||||
for (entry_index, prior_prob) in zip(
|
||||
alias_entry.entry_indices, alias_entry.probs
|
||||
):
|
||||
if self._entries[entry_index].entity_hash == entity_hash:
|
||||
return prior_prob
|
||||
|
||||
|
@ -283,13 +320,19 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
"""Serialize the current state to a binary string.
|
||||
"""
|
||||
def serialize_header():
|
||||
header = (self.get_size_entities(), self.get_size_aliases(), self.entity_vector_length)
|
||||
header = (
|
||||
self.get_size_entities(),
|
||||
self.get_size_aliases(),
|
||||
self.entity_vector_length
|
||||
)
|
||||
return srsly.json_dumps(header)
|
||||
|
||||
def serialize_entries():
|
||||
i = 1
|
||||
tuples = []
|
||||
for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
|
||||
for entry_hash, entry_index in sorted(
|
||||
self._entry_index.items(), key=lambda x: x[1]
|
||||
):
|
||||
entry = self._entries[entry_index]
|
||||
assert entry.entity_hash == entry_hash
|
||||
assert entry_index == i
|
||||
|
@ -302,7 +345,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
headers = []
|
||||
indices_lists = []
|
||||
probs_lists = []
|
||||
for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
|
||||
for alias_hash, alias_index in sorted(
|
||||
self._alias_index.items(), key=lambda x: x[1]
|
||||
):
|
||||
alias = self._aliases_table[alias_index]
|
||||
assert alias_index == i
|
||||
candidate_length = len(alias.entry_indices)
|
||||
|
@ -360,7 +405,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
indices = srsly.json_loads(all_data[1])
|
||||
probs = srsly.json_loads(all_data[2])
|
||||
for header, indices, probs in zip(headers, indices, probs):
|
||||
alias_hash, candidate_length = header
|
||||
alias_hash, _candidate_length = header
|
||||
alias.entry_indices = indices
|
||||
alias.probs = probs
|
||||
self._aliases_table[i] = alias
|
||||
|
@ -409,10 +454,14 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
writer.write_vector_element(element)
|
||||
i = i+1
|
||||
|
||||
# dumping the entry records in the order in which they are in the _entries vector.
|
||||
# index 0 is a dummy object not stored in the _entry_index and can be ignored.
|
||||
# dumping the entry records in the order in which they are in the
|
||||
# _entries vector.
|
||||
# index 0 is a dummy object not stored in the _entry_index and can
|
||||
# be ignored.
|
||||
i = 1
|
||||
for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
|
||||
for entry_hash, entry_index in sorted(
|
||||
self._entry_index.items(), key=lambda x: x[1]
|
||||
):
|
||||
entry = self._entries[entry_index]
|
||||
assert entry.entity_hash == entry_hash
|
||||
assert entry_index == i
|
||||
|
@ -424,7 +473,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
# dumping the aliases in the order in which they are in the _alias_index vector.
|
||||
# index 0 is a dummy object not stored in the _aliases_table and can be ignored.
|
||||
i = 1
|
||||
for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
|
||||
for alias_hash, alias_index in sorted(
|
||||
self._alias_index.items(), key=lambda x: x[1]
|
||||
):
|
||||
alias = self._aliases_table[alias_index]
|
||||
assert alias_index == i
|
||||
|
||||
|
@ -530,7 +581,8 @@ cdef class Writer:
|
|||
def __init__(self, path):
|
||||
assert isinstance(path, Path)
|
||||
content = bytes(path)
|
||||
cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
|
||||
cdef bytes bytes_loc = content.encode('utf8') \
|
||||
if type(content) == str else content
|
||||
self._fp = fopen(<char*>bytes_loc, 'wb')
|
||||
if not self._fp:
|
||||
raise IOError(Errors.E146.format(path=path))
|
||||
|
@ -540,14 +592,18 @@ cdef class Writer:
|
|||
cdef size_t status = fclose(self._fp)
|
||||
assert status == 0
|
||||
|
||||
cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1:
|
||||
cdef int write_header(
|
||||
self, int64_t nr_entries, int64_t entity_vector_length
|
||||
) except -1:
|
||||
self._write(&nr_entries, sizeof(nr_entries))
|
||||
self._write(&entity_vector_length, sizeof(entity_vector_length))
|
||||
|
||||
cdef int write_vector_element(self, float element) except -1:
|
||||
self._write(&element, sizeof(element))
|
||||
|
||||
cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1:
|
||||
cdef int write_entry(
|
||||
self, hash_t entry_hash, float entry_freq, int32_t vector_index
|
||||
) except -1:
|
||||
self._write(&entry_hash, sizeof(entry_hash))
|
||||
self._write(&entry_freq, sizeof(entry_freq))
|
||||
self._write(&vector_index, sizeof(vector_index))
|
||||
|
@ -556,7 +612,9 @@ cdef class Writer:
|
|||
cdef int write_alias_length(self, int64_t alias_length) except -1:
|
||||
self._write(&alias_length, sizeof(alias_length))
|
||||
|
||||
cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1:
|
||||
cdef int write_alias_header(
|
||||
self, hash_t alias_hash, int64_t candidate_length
|
||||
) except -1:
|
||||
self._write(&alias_hash, sizeof(alias_hash))
|
||||
self._write(&candidate_length, sizeof(candidate_length))
|
||||
|
||||
|
@ -572,16 +630,19 @@ cdef class Writer:
|
|||
cdef class Reader:
|
||||
def __init__(self, path):
|
||||
content = bytes(path)
|
||||
cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
|
||||
cdef bytes bytes_loc = content.encode('utf8') \
|
||||
if type(content) == str else content
|
||||
self._fp = fopen(<char*>bytes_loc, 'rb')
|
||||
if not self._fp:
|
||||
PyErr_SetFromErrno(IOError)
|
||||
status = fseek(self._fp, 0, 0) # this can be 0 if there is no header
|
||||
fseek(self._fp, 0, 0) # this can be 0 if there is no header
|
||||
|
||||
def __dealloc__(self):
|
||||
fclose(self._fp)
|
||||
|
||||
cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1:
|
||||
cdef int read_header(
|
||||
self, int64_t* nr_entries, int64_t* entity_vector_length
|
||||
) except -1:
|
||||
status = self._read(nr_entries, sizeof(int64_t))
|
||||
if status < 1:
|
||||
if feof(self._fp):
|
||||
|
@ -601,7 +662,9 @@ cdef class Reader:
|
|||
return 0 # end of file
|
||||
raise IOError(Errors.E145.format(param="vector element"))
|
||||
|
||||
cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1:
|
||||
cdef int read_entry(
|
||||
self, hash_t* entity_hash, float* freq, int32_t* vector_index
|
||||
) except -1:
|
||||
status = self._read(entity_hash, sizeof(hash_t))
|
||||
if status < 1:
|
||||
if feof(self._fp):
|
||||
|
@ -632,7 +695,9 @@ cdef class Reader:
|
|||
return 0 # end of file
|
||||
raise IOError(Errors.E145.format(param="alias length"))
|
||||
|
||||
cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1:
|
||||
cdef int read_alias_header(
|
||||
self, hash_t* alias_hash, int64_t* candidate_length
|
||||
) except -1:
|
||||
status = self._read(alias_hash, sizeof(hash_t))
|
||||
if status < 1:
|
||||
if feof(self._fp):
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from ...language import BaseDefaults, Language
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language, BaseDefaults
|
||||
|
||||
|
||||
class AfrikaansDefaults(BaseDefaults):
|
||||
|
|
|
@ -1,12 +1,11 @@
|
|||
from .stop_words import STOP_WORDS
|
||||
from ...attrs import LANG
|
||||
from ...language import BaseDefaults, Language
|
||||
from ...util import update_exc
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...language import Language, BaseDefaults
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
class AmharicDefaults(BaseDefaults):
|
||||
|
|
|
@ -1,5 +1,11 @@
|
|||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
|
||||
from ..char_classes import UNITS, ALPHA_UPPER
|
||||
from ..char_classes import (
|
||||
ALPHA_UPPER,
|
||||
CURRENCY,
|
||||
LIST_ELLIPSES,
|
||||
LIST_PUNCT,
|
||||
LIST_QUOTES,
|
||||
UNITS,
|
||||
)
|
||||
|
||||
_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()
|
||||
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
from ...symbols import ORTH, NORM
|
||||
|
||||
from ...symbols import NORM, ORTH
|
||||
|
||||
_exc = {}
|
||||
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
from .stop_words import STOP_WORDS
|
||||
from ...language import BaseDefaults, Language
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from ...language import Language, BaseDefaults
|
||||
|
||||
|
||||
class ArabicDefaults(BaseDefaults):
|
||||
|
|
|
@ -1,5 +1,11 @@
|
|||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
|
||||
from ..char_classes import UNITS, ALPHA_UPPER
|
||||
from ..char_classes import (
|
||||
ALPHA_UPPER,
|
||||
CURRENCY,
|
||||
LIST_ELLIPSES,
|
||||
LIST_PUNCT,
|
||||
LIST_QUOTES,
|
||||
UNITS,
|
||||
)
|
||||
|
||||
_suffixes = (
|
||||
LIST_PUNCT
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, NORM
|
||||
from ...symbols import NORM, ORTH
|
||||
from ...util import update_exc
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
|
||||
_exc = {}
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
from .stop_words import STOP_WORDS
|
||||
from ...language import BaseDefaults, Language
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ...language import Language, BaseDefaults
|
||||
from .stop_words import STOP_WORDS
|
||||
|
||||
|
||||
class AzerbaijaniDefaults(BaseDefaults):
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
from ...attrs import LIKE_NUM
|
||||
|
||||
|
||||
# Eleven, twelve etc. are written separate: on bir, on iki
|
||||
|
||||
_num_words = [
|
||||
|
|
|
@ -1,12 +1,14 @@
|
|||
from ...attrs import LANG
|
||||
from ...language import BaseDefaults, Language
|
||||
from ...util import update_exc
|
||||
from ..punctuation import (
|
||||
COMBINING_DIACRITICS_TOKENIZER_INFIXES,
|
||||
COMBINING_DIACRITICS_TOKENIZER_SUFFIXES,
|
||||
)
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
|
||||
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
|
||||
from ...language import Language, BaseDefaults
|
||||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
|
||||
|
||||
class BulgarianDefaults(BaseDefaults):
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
from ...attrs import LIKE_NUM
|
||||
|
||||
|
||||
_num_words = [
|
||||
"нула",
|
||||
"едно",
|
||||
|
|
|
@ -4,8 +4,7 @@ References:
|
|||
(countries, occupations, fields of studies and more).
|
||||
"""
|
||||
|
||||
from ...symbols import ORTH, NORM
|
||||
|
||||
from ...symbols import NORM, ORTH
|
||||
|
||||
_exc = {}
|
||||
|
||||
|
|
|
@ -1,10 +1,12 @@
|
|||
from typing import Optional, Callable
|
||||
from typing import Callable, Optional
|
||||
|
||||
from thinc.api import Model
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language, BaseDefaults
|
||||
|
||||
from ...language import BaseDefaults, Language
|
||||
from ...pipeline import Lemmatizer
|
||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
|
||||
|
||||
class BengaliDefaults(BaseDefaults):
|
||||
|
|
|
@ -1,6 +1,14 @@
|
|||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
|
||||
from ..char_classes import ALPHA_LOWER, ALPHA, HYPHENS, CONCAT_QUOTES, UNITS
|
||||
|
||||
from ..char_classes import (
|
||||
ALPHA,
|
||||
ALPHA_LOWER,
|
||||
CONCAT_QUOTES,
|
||||
HYPHENS,
|
||||
LIST_ELLIPSES,
|
||||
LIST_ICONS,
|
||||
LIST_PUNCT,
|
||||
LIST_QUOTES,
|
||||
UNITS,
|
||||
)
|
||||
|
||||
_currency = r"\$¢£€¥฿৳"
|
||||
_quotes = CONCAT_QUOTES.replace("'", "")
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, NORM
|
||||
from ...symbols import NORM, ORTH
|
||||
from ...util import update_exc
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
|
||||
_exc = {}
|
||||
|
||||
|
|
|
@ -1,14 +1,14 @@
|
|||
from typing import Optional, Callable
|
||||
from typing import Callable, Optional
|
||||
|
||||
from thinc.api import Model
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ...language import Language, BaseDefaults
|
||||
from ...language import BaseDefaults, Language
|
||||
from .lemmatizer import CatalanLemmatizer
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
|
||||
|
||||
class CatalanDefaults(BaseDefaults):
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
from ...attrs import LIKE_NUM
|
||||
|
||||
|
||||
_num_words = [
|
||||
"zero",
|
||||
"un",
|
||||
|
|
|
@ -1,9 +1,18 @@
|
|||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
|
||||
from ..char_classes import LIST_CURRENCY
|
||||
from ..char_classes import CURRENCY
|
||||
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
|
||||
from ..char_classes import merge_chars, _units
|
||||
|
||||
from ..char_classes import (
|
||||
ALPHA,
|
||||
ALPHA_LOWER,
|
||||
ALPHA_UPPER,
|
||||
CONCAT_QUOTES,
|
||||
CURRENCY,
|
||||
LIST_CURRENCY,
|
||||
LIST_ELLIPSES,
|
||||
LIST_ICONS,
|
||||
LIST_PUNCT,
|
||||
LIST_QUOTES,
|
||||
PUNCT,
|
||||
_units,
|
||||
merge_chars,
|
||||
)
|
||||
|
||||
ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
|
||||
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
from typing import Union, Iterator, Tuple
|
||||
from ...tokens import Doc, Span
|
||||
from ...symbols import NOUN, PROPN
|
||||
from typing import Iterator, Tuple, Union
|
||||
|
||||
from ...errors import Errors
|
||||
from ...symbols import NOUN, PROPN
|
||||
from ...tokens import Doc, Span
|
||||
|
||||
|
||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, NORM
|
||||
from ...symbols import NORM, ORTH
|
||||
from ...util import update_exc
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
|
||||
_exc = {}
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
from .stop_words import STOP_WORDS
|
||||
from ...language import BaseDefaults, Language
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from ...language import Language, BaseDefaults
|
||||
from .stop_words import STOP_WORDS
|
||||
|
||||
|
||||
class CzechDefaults(BaseDefaults):
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from ...language import BaseDefaults, Language
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ...language import Language, BaseDefaults
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
|
||||
|
||||
class DanishDefaults(BaseDefaults):
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
from ...attrs import LIKE_NUM
|
||||
|
||||
|
||||
# Source http://fjern-uv.dk/tal.php
|
||||
_num_words = """nul
|
||||
en et to tre fire fem seks syv otte ni ti
|
||||
|
|
|
@ -1,8 +1,13 @@
|
|||
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
|
||||
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
||||
from ..char_classes import (
|
||||
ALPHA,
|
||||
ALPHA_LOWER,
|
||||
ALPHA_UPPER,
|
||||
CONCAT_QUOTES,
|
||||
LIST_ELLIPSES,
|
||||
LIST_ICONS,
|
||||
)
|
||||
from ..punctuation import TOKENIZER_SUFFIXES
|
||||
|
||||
|
||||
_quotes = CONCAT_QUOTES.replace("'", "")
|
||||
|
||||
_infixes = (
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
from typing import Union, Iterator, Tuple
|
||||
from ...tokens import Doc, Span
|
||||
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
|
||||
from typing import Iterator, Tuple, Union
|
||||
|
||||
from ...errors import Errors
|
||||
from ...symbols import AUX, NOUN, PRON, PROPN, VERB
|
||||
from ...tokens import Doc, Span
|
||||
|
||||
|
||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||
|
|
|
@ -2,10 +2,9 @@
|
|||
Tokenizer Exceptions.
|
||||
Source: https://forkortelse.dk/ and various others.
|
||||
"""
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, NORM
|
||||
from ...symbols import NORM, ORTH
|
||||
from ...util import update_exc
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
|
||||
_exc = {}
|
||||
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
from ...language import BaseDefaults, Language
|
||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ...language import Language, BaseDefaults
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
|
||||
|
||||
class GermanDefaults(BaseDefaults):
|
||||
|
|
|
@ -1,9 +1,18 @@
|
|||
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
|
||||
from ..char_classes import CURRENCY, UNITS, PUNCT
|
||||
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
||||
from ..char_classes import (
|
||||
ALPHA,
|
||||
ALPHA_LOWER,
|
||||
ALPHA_UPPER,
|
||||
CONCAT_QUOTES,
|
||||
CURRENCY,
|
||||
LIST_ELLIPSES,
|
||||
LIST_ICONS,
|
||||
LIST_PUNCT,
|
||||
LIST_QUOTES,
|
||||
PUNCT,
|
||||
UNITS,
|
||||
)
|
||||
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
|
||||
|
||||
|
||||
_prefixes = ["``"] + BASE_TOKENIZER_PREFIXES
|
||||
|
||||
_suffixes = (
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
from typing import Union, Iterator, Tuple
|
||||
from typing import Iterator, Tuple, Union
|
||||
|
||||
from ...symbols import NOUN, PROPN, PRON
|
||||
from ...errors import Errors
|
||||
from ...symbols import NOUN, PRON, PROPN
|
||||
from ...tokens import Doc, Span
|
||||
|
||||
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, NORM
|
||||
from ...symbols import NORM, ORTH
|
||||
from ...util import update_exc
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
|
||||
_exc = {
|
||||
"auf'm": [{ORTH: "auf"}, {ORTH: "'m", NORM: "dem"}],
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
from ...language import BaseDefaults, Language
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language, BaseDefaults
|
||||
|
||||
|
||||
class LowerSorbianDefaults(BaseDefaults):
|
||||
|
|
|
@ -1,13 +1,14 @@
|
|||
from typing import Optional, Callable
|
||||
from typing import Callable, Optional
|
||||
|
||||
from thinc.api import Model
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
from ...language import BaseDefaults, Language
|
||||
from .lemmatizer import GreekLemmatizer
|
||||
from ...language import Language, BaseDefaults
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
|
||||
|
||||
class GreekDefaults(BaseDefaults):
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
def get_pos_from_wiktionary():
|
||||
import re
|
||||
|
||||
from gensim.corpora.wikicorpus import extract_pages
|
||||
|
||||
regex = re.compile(r"==={{(\w+)\|el}}===")
|
||||
|
|
|
@ -1,6 +1,16 @@
|
|||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
|
||||
from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
|
||||
from ..char_classes import CONCAT_QUOTES, CURRENCY
|
||||
from ..char_classes import (
|
||||
ALPHA,
|
||||
ALPHA_LOWER,
|
||||
ALPHA_UPPER,
|
||||
CONCAT_QUOTES,
|
||||
CURRENCY,
|
||||
HYPHENS,
|
||||
LIST_CURRENCY,
|
||||
LIST_ELLIPSES,
|
||||
LIST_ICONS,
|
||||
LIST_PUNCT,
|
||||
LIST_QUOTES,
|
||||
)
|
||||
|
||||
_units = (
|
||||
"km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft "
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
from typing import Union, Iterator, Tuple
|
||||
from typing import Iterator, Tuple, Union
|
||||
|
||||
from ...symbols import NOUN, PROPN, PRON
|
||||
from ...errors import Errors
|
||||
from ...symbols import NOUN, PRON, PROPN
|
||||
from ...tokens import Doc, Span
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, NORM
|
||||
from ...symbols import NORM, ORTH
|
||||
from ...util import update_exc
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
|
||||
_exc = {}
|
||||
|
||||
|
|
|
@ -1,13 +1,14 @@
|
|||
from typing import Optional, Callable
|
||||
from typing import Callable, Optional
|
||||
|
||||
from thinc.api import Model
|
||||
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from .punctuation import TOKENIZER_INFIXES
|
||||
from ...language import BaseDefaults, Language
|
||||
from .lemmatizer import EnglishLemmatizer
|
||||
from ...language import Language, BaseDefaults
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .punctuation import TOKENIZER_INFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
|
||||
|
||||
class EnglishDefaults(BaseDefaults):
|
||||
|
|
|
@ -1,5 +1,12 @@
|
|||
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
|
||||
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
|
||||
from ..char_classes import (
|
||||
ALPHA,
|
||||
ALPHA_LOWER,
|
||||
ALPHA_UPPER,
|
||||
CONCAT_QUOTES,
|
||||
HYPHENS,
|
||||
LIST_ELLIPSES,
|
||||
LIST_ICONS,
|
||||
)
|
||||
|
||||
_infixes = (
|
||||
LIST_ELLIPSES
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
from typing import Union, Iterator, Tuple
|
||||
from typing import Iterator, Tuple, Union
|
||||
|
||||
from ...symbols import NOUN, PROPN, PRON
|
||||
from ...errors import Errors
|
||||
from ...symbols import NOUN, PRON, PROPN
|
||||
from ...tokens import Doc, Span
|
||||
|
||||
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
from typing import Dict, List
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, NORM
|
||||
from ...util import update_exc
|
||||
|
||||
from ...symbols import NORM, ORTH
|
||||
from ...util import update_exc
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
|
||||
_exc: Dict[str, List[Dict]] = {}
|
||||
_exclude = [
|
||||
|
|
|
@ -1,12 +1,14 @@
|
|||
from typing import Optional, Callable
|
||||
from typing import Callable, Optional
|
||||
|
||||
from thinc.api import Model
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
|
||||
from ...language import BaseDefaults, Language
|
||||
from .lemmatizer import SpanishLemmatizer
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||
from ...language import Language, BaseDefaults
|
||||
from .stop_words import STOP_WORDS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
|
||||
|
||||
class SpanishDefaults(BaseDefaults):
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from typing import List, Optional, Tuple
|
||||
import re
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from ...pipeline import Lemmatizer
|
||||
from ...tokens import Token
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
from ...attrs import LIKE_NUM
|
||||
|
||||
|
||||
_num_words = [
|
||||
"cero",
|
||||
"uno",
|
||||
|
|
|
@ -1,8 +1,17 @@
|
|||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
|
||||
from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT
|
||||
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
|
||||
from ..char_classes import merge_chars
|
||||
|
||||
from ..char_classes import (
|
||||
ALPHA,
|
||||
ALPHA_LOWER,
|
||||
ALPHA_UPPER,
|
||||
CONCAT_QUOTES,
|
||||
CURRENCY,
|
||||
LIST_ELLIPSES,
|
||||
LIST_ICONS,
|
||||
LIST_PUNCT,
|
||||
LIST_QUOTES,
|
||||
LIST_UNITS,
|
||||
PUNCT,
|
||||
merge_chars,
|
||||
)
|
||||
|
||||
_list_units = [u for u in LIST_UNITS if u != "%"]
|
||||
_units = merge_chars(" ".join(_list_units))
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
from typing import Union, Iterator, Tuple
|
||||
from typing import Iterator, Tuple, Union
|
||||
|
||||
from ...symbols import NOUN, PROPN, PRON
|
||||
from ...errors import Errors
|
||||
from ...symbols import NOUN, PRON, PROPN
|
||||
from ...tokens import Doc, Span
|
||||
|
||||
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ...symbols import ORTH, NORM
|
||||
from ...symbols import NORM, ORTH
|
||||
from ...util import update_exc
|
||||
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
|
||||
_exc = {
|
||||
"pal": [{ORTH: "pa"}, {ORTH: "l", NORM: "el"}],
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from ...language import BaseDefaults, Language
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language, BaseDefaults
|
||||
|
||||
|
||||
class EstonianDefaults(BaseDefaults):
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
from .stop_words import STOP_WORDS
|
||||
from ...language import BaseDefaults, Language
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
from ...language import Language, BaseDefaults
|
||||
from .stop_words import STOP_WORDS
|
||||
|
||||
|
||||
class BasqueDefaults(BaseDefaults):
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user