Merge branch 'develop' into feature/pymorphy-lemmatizer-diacritics

This commit is contained in:
Adriane Boyd 2023-08-01 16:16:53 +02:00 committed by GitHub
commit 135a28a89d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
596 changed files with 10498 additions and 5162 deletions

View File

@ -1,118 +0,0 @@
parameters:
python_version: ''
architecture: 'x64'
num_build_jobs: 2
steps:
- task: UsePythonVersion@0
inputs:
versionSpec: ${{ parameters.python_version }}
architecture: ${{ parameters.architecture }}
allowUnstable: true
- bash: |
echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"
displayName: 'Set variables'
- script: |
python -m pip install -U build pip setuptools
python -m pip install -U -r requirements.txt
displayName: "Install dependencies"
- script: |
python -m build --sdist
displayName: "Build sdist"
- script: |
python -m mypy spacy
displayName: 'Run mypy'
condition: ne(variables['python_version'], '3.6')
- task: DeleteFiles@1
inputs:
contents: "spacy"
displayName: "Delete source directory"
- task: DeleteFiles@1
inputs:
contents: "*.egg-info"
displayName: "Delete egg-info directory"
- script: |
python -m pip freeze > installed.txt
python -m pip uninstall -y -r installed.txt
displayName: "Uninstall all packages"
- bash: |
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
SPACY_NUM_BUILD_JOBS=${{ parameters.num_build_jobs }} python -m pip install dist/$SDIST
displayName: "Install from sdist"
- script: |
python -W error -c "import spacy"
displayName: "Test import"
- script: |
python -m spacy download ca_core_news_sm
python -m spacy download ca_core_news_md
python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
displayName: 'Test download CLI'
condition: eq(variables['python_version'], '3.9')
- script: |
python -W error -m spacy info ca_core_news_sm | grep -q download_url
displayName: 'Test download_url in info CLI'
condition: eq(variables['python_version'], '3.9')
- script: |
python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
displayName: 'Test no warnings on load (#11713)'
condition: eq(variables['python_version'], '3.9')
- script: |
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
displayName: 'Test convert CLI'
condition: eq(variables['python_version'], '3.9')
- script: |
python -m spacy init config -p ner -l ca ner.cfg
python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
displayName: 'Test debug config CLI'
condition: eq(variables['python_version'], '3.9')
- script: |
# will have errors due to sparse data, check for summary in output
python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
displayName: 'Test debug data CLI'
condition: eq(variables['python_version'], '3.9')
- script: |
python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
displayName: 'Test train CLI'
condition: eq(variables['python_version'], '3.9')
- script: |
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
displayName: 'Test assemble CLI'
condition: eq(variables['python_version'], '3.9')
- script: |
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
displayName: 'Test assemble CLI vectors warning'
condition: eq(variables['python_version'], '3.9')
- script: |
python -m pip install -U -r requirements.txt
displayName: "Install test requirements"
- script: |
python -m pytest --pyargs spacy -W error
displayName: "Run CPU tests"
- script: |
python -m pip install 'spacy[apple]'
python -m pytest --pyargs spacy
displayName: "Run CPU tests with thinc-apple-ops"
condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))

View File

@ -37,10 +37,20 @@ jobs:
run: | run: |
python -m pip install black -c requirements.txt python -m pip install black -c requirements.txt
python -m black spacy --check python -m black spacy --check
- name: isort
run: |
python -m pip install isort -c requirements.txt
python -m isort spacy --check
- name: flake8 - name: flake8
run: | run: |
python -m pip install flake8==5.0.4 python -m pip install flake8==5.0.4
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
- name: cython-lint
run: |
python -m pip install cython-lint -c requirements.txt
# E501: line too long, W291: trailing whitespace, E266: too many leading '#' for block comment
cython-lint spacy --ignore E501,W291,E266
tests: tests:
name: Test name: Test
needs: Validate needs: Validate
@ -107,22 +117,22 @@ jobs:
- name: Test import - name: Test import
run: python -W error -c "import spacy" run: python -W error -c "import spacy"
- name: "Test download CLI" # - name: "Test download CLI"
run: | # run: |
python -m spacy download ca_core_news_sm # python -m spacy download ca_core_news_sm
python -m spacy download ca_core_news_md # python -m spacy download ca_core_news_md
python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" # python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
if: matrix.python_version == '3.9' # if: matrix.python_version == '3.9'
#
- name: "Test download_url in info CLI" # - name: "Test download_url in info CLI"
run: | # run: |
python -W error -m spacy info ca_core_news_sm | grep -q download_url # python -W error -m spacy info ca_core_news_sm | grep -q download_url
if: matrix.python_version == '3.9' # if: matrix.python_version == '3.9'
#
- name: "Test no warnings on load (#11713)" # - name: "Test no warnings on load (#11713)"
run: | # run: |
python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')" # python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
if: matrix.python_version == '3.9' # if: matrix.python_version == '3.9'
- name: "Test convert CLI" - name: "Test convert CLI"
run: | run: |
@ -146,17 +156,17 @@ jobs:
python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1 python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
if: matrix.python_version == '3.9' if: matrix.python_version == '3.9'
- name: "Test assemble CLI" # - name: "Test assemble CLI"
run: | # run: |
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" # python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir # PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
if: matrix.python_version == '3.9' # if: matrix.python_version == '3.9'
#
- name: "Test assemble CLI vectors warning" # - name: "Test assemble CLI vectors warning"
run: | # run: |
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" # python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 # python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
if: matrix.python_version == '3.9' # if: matrix.python_version == '3.9'
- name: "Install test requirements" - name: "Install test requirements"
run: | run: |
@ -165,6 +175,7 @@ jobs:
- name: "Run CPU tests" - name: "Run CPU tests"
run: | run: |
python -m pytest --pyargs spacy -W error python -m pytest --pyargs spacy -W error
if: "!(startsWith(matrix.os, 'macos') && matrix.python_version == '3.11')"
- name: "Run CPU tests with thinc-apple-ops" - name: "Run CPU tests with thinc-apple-ops"
run: | run: |

View File

@ -1,11 +1,11 @@
SHELL := /bin/bash SHELL := /bin/bash
ifndef SPACY_EXTRAS ifndef SPACY_EXTRAS
override SPACY_EXTRAS = spacy-lookups-data==1.0.2 jieba spacy-pkuseg==0.0.28 sudachipy sudachidict_core pymorphy2 override SPACY_EXTRAS = spacy-lookups-data==1.0.3
endif endif
ifndef PYVER ifndef PYVER
override PYVER = 3.6 override PYVER = 3.8
endif endif
VENV := ./env$(PYVER) VENV := ./env$(PYVER)

View File

@ -36,7 +36,7 @@ open-source software, released under the [MIT license](https://github.com/explos
## 📖 Documentation ## 📖 Documentation
| Documentation | | | Documentation | |
| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------------------------- | ---------------------------------------------------------------------- |
| ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! | | ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! |
| 📚 **[Usage Guides]** | How to use spaCy and its features. | | 📚 **[Usage Guides]** | How to use spaCy and its features. |
| 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. | | 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |
@ -44,6 +44,7 @@ open-source software, released under the [MIT license](https://github.com/explos
| 🎛 **[API Reference]** | The detailed reference for spaCy's API. | | 🎛 **[API Reference]** | The detailed reference for spaCy's API. |
| 📦 **[Models]** | Download trained pipelines for spaCy. | | 📦 **[Models]** | Download trained pipelines for spaCy. |
| 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. | | 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. |
| ⚙️ **[spaCy VS Code Extension]** | Additional tooling and features for working with spaCy's config files. |
| 👩‍🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. | | 👩‍🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
| 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. | | 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
| 🛠 **[Changelog]** | Changes and version history. | | 🛠 **[Changelog]** | Changes and version history. |
@ -57,13 +58,13 @@ open-source software, released under the [MIT license](https://github.com/explos
[api reference]: https://spacy.io/api/ [api reference]: https://spacy.io/api/
[models]: https://spacy.io/models [models]: https://spacy.io/models
[universe]: https://spacy.io/universe [universe]: https://spacy.io/universe
[spaCy VS Code Extension]: https://github.com/explosion/spacy-vscode
[videos]: https://www.youtube.com/c/ExplosionAI [videos]: https://www.youtube.com/c/ExplosionAI
[online course]: https://course.spacy.io [online course]: https://course.spacy.io
[project templates]: https://github.com/explosion/projects [project templates]: https://github.com/explosion/projects
[changelog]: https://spacy.io/usage#changelog [changelog]: https://spacy.io/usage#changelog
[contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
## 💬 Where to ask questions ## 💬 Where to ask questions
The spaCy project is maintained by the [spaCy team](https://explosion.ai/about). The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).

View File

@ -1,120 +0,0 @@
trigger:
batch: true
branches:
include:
- "*"
exclude:
- "spacy.io"
- "nightly.spacy.io"
- "v2.spacy.io"
paths:
exclude:
- "website/*"
- "*.md"
- "*.mdx"
- ".github/workflows/*"
pr:
paths:
exclude:
- "*.md"
- "*.mdx"
- "website/docs/*"
- "website/src/*"
- "website/meta/*.tsx"
- "website/meta/*.mjs"
- "website/meta/languages.json"
- "website/meta/site.json"
- "website/meta/sidebars.json"
- "website/meta/type-annotations.json"
- "website/pages/*"
- ".github/workflows/*"
jobs:
# Check formatting and linting. Perform basic checks for most important errors
# (syntax etc.) Uses the config defined in setup.cfg and overwrites the
# selected codes.
- job: "Validate"
pool:
vmImage: "ubuntu-latest"
steps:
- task: UsePythonVersion@0
inputs:
versionSpec: "3.7"
- script: |
pip install black -c requirements.txt
python -m black spacy --check
displayName: "black"
- script: |
pip install flake8==5.0.4
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
displayName: "flake8"
- script: |
python .github/validate_universe_json.py website/meta/universe.json
displayName: 'Validate website/meta/universe.json'
- job: "Test"
dependsOn: "Validate"
strategy:
matrix:
# We're only running one platform per Python version to speed up builds
Python36Linux:
imageName: "ubuntu-20.04"
python.version: "3.6"
# Python36Windows:
# imageName: "windows-latest"
# python.version: "3.6"
# Python36Mac:
# imageName: "macos-latest"
# python.version: "3.6"
# Python37Linux:
# imageName: "ubuntu-20.04"
# python.version: "3.7"
Python37Windows:
imageName: "windows-latest"
python.version: "3.7"
# Python37Mac:
# imageName: "macos-latest"
# python.version: "3.7"
# Python38Linux:
# imageName: "ubuntu-latest"
# python.version: "3.8"
# Python38Windows:
# imageName: "windows-latest"
# python.version: "3.8"
Python38Mac:
imageName: "macos-latest"
python.version: "3.8"
Python39Linux:
imageName: "ubuntu-latest"
python.version: "3.9"
# Python39Windows:
# imageName: "windows-latest"
# python.version: "3.9"
# Python39Mac:
# imageName: "macos-latest"
# python.version: "3.9"
# Python310Linux:
# imageName: "ubuntu-latest"
# python.version: "3.10"
Python310Windows:
imageName: "windows-latest"
python.version: "3.10"
# Python310Mac:
# imageName: "macos-latest"
# python.version: "3.10"
Python311Linux:
imageName: 'ubuntu-latest'
python.version: '3.11'
Python311Windows:
imageName: 'windows-latest'
python.version: '3.11'
Python311Mac:
imageName: 'macos-latest'
python.version: '3.11'
maxParallel: 4
pool:
vmImage: $(imageName)
steps:
- template: .github/azure-steps.yml
parameters:
python_version: '$(python.version)'

View File

@ -3,7 +3,4 @@ numpy==1.15.0; python_version<='3.7' and platform_machine!='aarch64'
numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64' numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64'
numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64' numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64' numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
numpy==1.19.3; python_version=='3.9' numpy>=1.25.0; python_version>='3.9'
numpy==1.21.3; python_version=='3.10'
numpy==1.23.2; python_version=='3.11'
numpy; python_version>='3.12'

View File

@ -1,14 +1,17 @@
# Listeners # Listeners
1. [Overview](#1-overview) - [1. Overview](#1-overview)
2. [Initialization](#2-initialization) - [2. Initialization](#2-initialization)
- [A. Linking listeners to the embedding component](#2a-linking-listeners-to-the-embedding-component) - [2A. Linking listeners to the embedding component](#2a-linking-listeners-to-the-embedding-component)
- [B. Shape inference](#2b-shape-inference) - [2B. Shape inference](#2b-shape-inference)
3. [Internal communication](#3-internal-communication) - [3. Internal communication](#3-internal-communication)
- [A. During prediction](#3a-during-prediction) - [3A. During prediction](#3a-during-prediction)
- [B. During training](#3b-during-training) - [3B. During training](#3b-during-training)
- [C. Frozen components](#3c-frozen-components) - [Training with multiple listeners](#training-with-multiple-listeners)
4. [Replacing listener with standalone](#4-replacing-listener-with-standalone) - [3C. Frozen components](#3c-frozen-components)
- [The Tok2Vec or Transformer is frozen](#the-tok2vec-or-transformer-is-frozen)
- [The upstream component is frozen](#the-upstream-component-is-frozen)
- [4. Replacing listener with standalone](#4-replacing-listener-with-standalone)
## 1. Overview ## 1. Overview
@ -218,3 +221,15 @@ new_model = tok2vec_model.attrs["replace_listener"](new_model)
The new config and model are then properly stored on the `nlp` object. The new config and model are then properly stored on the `nlp` object.
Note that this functionality (running the replacement for a transformer listener) was broken prior to Note that this functionality (running the replacement for a transformer listener) was broken prior to
`spacy-transformers` 1.0.5. `spacy-transformers` 1.0.5.
In spaCy 3.7, `Language.replace_listeners` was updated to pass the following additional arguments to the `replace_listener` callback:
the listener to be replaced and the `tok2vec`/`transformer` pipe from which the new model was copied. To maintain backwards compatibility,
the method only passes these extra arguments for callbacks that support them:
```
def replace_listener_pre_37(copied_tok2vec_model):
...
def replace_listener_post_37(copied_tok2vec_model, replaced_listener, tok2vec_pipe):
...
```

View File

@ -6,6 +6,10 @@ requires = [
"preshed>=3.0.2,<3.1.0", "preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0", "murmurhash>=0.28.0,<1.1.0",
"thinc>=8.1.8,<8.2.0", "thinc>=8.1.8,<8.2.0",
"numpy>=1.15.0", "numpy>=1.15.0; python_version < '3.9'",
"numpy>=1.25.0; python_version >= '3.9'",
] ]
build-backend = "setuptools.build_meta" build-backend = "setuptools.build_meta"
[tool.isort]
profile = "black"

View File

@ -9,11 +9,13 @@ murmurhash>=0.28.0,<1.1.0
wasabi>=0.9.1,<1.2.0 wasabi>=0.9.1,<1.2.0
srsly>=2.4.3,<3.0.0 srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0 catalogue>=2.0.6,<2.1.0
typer>=0.3.0,<0.8.0 typer>=0.3.0,<0.10.0
pathy>=0.10.0 pathy>=0.10.0
smart-open>=5.2.1,<7.0.0 smart-open>=5.2.1,<7.0.0
weasel>=0.1.0,<0.2.0
# Third party dependencies # Third party dependencies
numpy>=1.15.0 numpy>=1.15.0; python_version < "3.9"
numpy>=1.19.0; python_version >= "3.9"
requests>=2.13.0,<3.0.0 requests>=2.13.0,<3.0.0
tqdm>=4.38.0,<5.0.0 tqdm>=4.38.0,<5.0.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0 pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
@ -38,3 +40,5 @@ types-setuptools>=57.0.0
types-requests types-requests
types-setuptools>=57.0.0 types-setuptools>=57.0.0
black==22.3.0 black==22.3.0
cython-lint>=0.15.0; python_version >= "3.7"
isort>=5.0,<6.0

View File

@ -32,8 +32,13 @@ project_urls =
zip_safe = false zip_safe = false
include_package_data = true include_package_data = true
python_requires = >=3.6 python_requires = >=3.6
# NOTE: This section is superseded by pyproject.toml and will be removed in
# spaCy v4
setup_requires = setup_requires =
cython>=0.25,<3.0 cython>=0.25,<3.0
# The newest supported pip for python 3.6 has bugs related to markers in
# this section, so this does not contain the same constraints as
# pyproject.toml
numpy>=1.15.0 numpy>=1.15.0
# We also need our Cython packages here to compile against # We also need our Cython packages here to compile against
cymem>=2.0.2,<2.1.0 cymem>=2.0.2,<2.1.0
@ -51,12 +56,14 @@ install_requires =
wasabi>=0.9.1,<1.2.0 wasabi>=0.9.1,<1.2.0
srsly>=2.4.3,<3.0.0 srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0 catalogue>=2.0.6,<2.1.0
weasel>=0.1.0,<0.2.0
# Third-party dependencies # Third-party dependencies
typer>=0.3.0,<0.8.0 typer>=0.3.0,<0.10.0
pathy>=0.10.0 pathy>=0.10.0
smart-open>=5.2.1,<7.0.0 smart-open>=5.2.1,<7.0.0
tqdm>=4.38.0,<5.0.0 tqdm>=4.38.0,<5.0.0
numpy>=1.15.0 numpy>=1.15.0; python_version < "3.9"
numpy>=1.19.0; python_version >= "3.9"
requests>=2.13.0,<3.0.0 requests>=2.13.0,<3.0.0
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0 pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
jinja2 jinja2
@ -75,8 +82,6 @@ lookups =
spacy_lookups_data>=1.0.3,<1.1.0 spacy_lookups_data>=1.0.3,<1.1.0
transformers = transformers =
spacy_transformers>=1.1.2,<1.3.0 spacy_transformers>=1.1.2,<1.3.0
ray =
spacy_ray>=0.1.0,<1.0.0
cuda = cuda =
cupy>=5.0.0b4,<13.0.0 cupy>=5.0.0b4,<13.0.0
cuda80 = cuda80 =

View File

@ -1,6 +1,6 @@
from typing import Union, Iterable, Dict, Any
from pathlib import Path
import sys import sys
from pathlib import Path
from typing import Any, Dict, Iterable, Union
# set library-specific custom warning handling before doing anything else # set library-specific custom warning handling before doing anything else
from .errors import setup_default_warnings from .errors import setup_default_warnings
@ -8,20 +8,17 @@ from .errors import setup_default_warnings
setup_default_warnings() # noqa: E402 setup_default_warnings() # noqa: E402
# These are imported as part of the API # These are imported as part of the API
from thinc.api import prefer_gpu, require_gpu, require_cpu # noqa: F401 from thinc.api import Config, prefer_gpu, require_cpu, require_gpu # noqa: F401
from thinc.api import Config
from . import pipeline # noqa: F401 from . import pipeline # noqa: F401
from .cli.info import info # noqa: F401
from .glossary import explain # noqa: F401
from .about import __version__ # noqa: F401
from .util import registry, logger # noqa: F401
from .errors import Errors
from .language import Language
from .vocab import Vocab
from . import util from . import util
from .about import __version__ # noqa: F401
from .cli.info import info # noqa: F401
from .errors import Errors
from .glossary import explain # noqa: F401
from .language import Language
from .util import logger, registry # noqa: F401
from .vocab import Vocab
if sys.maxunicode == 65535: if sys.maxunicode == 65535:
raise SystemError(Errors.E130) raise SystemError(Errors.E130)

View File

@ -1,7 +1,5 @@
# fmt: off # fmt: off
__title__ = "spacy" __title__ = "spacy"
__version__ = "3.5.0" __version__ = "3.7.0.dev0"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download" __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"
__projects_branch__ = "v3"

View File

@ -1,6 +1,7 @@
# Reserve 64 values for flag features # Reserve 64 values for flag features
from . cimport symbols from . cimport symbols
cdef enum attr_id_t: cdef enum attr_id_t:
NULL_ATTR NULL_ATTR
IS_ALPHA IS_ALPHA

View File

@ -117,7 +117,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
if "pos" in stringy_attrs: if "pos" in stringy_attrs:
stringy_attrs["TAG"] = stringy_attrs.pop("pos") stringy_attrs["TAG"] = stringy_attrs.pop("pos")
if "morph" in stringy_attrs: if "morph" in stringy_attrs:
morphs = stringy_attrs.pop("morph") morphs = stringy_attrs.pop("morph") # no-cython-lint
if "number" in stringy_attrs: if "number" in stringy_attrs:
stringy_attrs.pop("number") stringy_attrs.pop("number")
if "tenspect" in stringy_attrs: if "tenspect" in stringy_attrs:

View File

@ -1,35 +1,28 @@
from wasabi import msg from wasabi import msg
from ._util import app, setup_cli # noqa: F401 from ._util import app, setup_cli # noqa: F401
from .apply import apply # noqa: F401
from .assemble import assemble_cli # noqa: F401
# These are the actual functions, NOT the wrapped CLI commands. The CLI commands # These are the actual functions, NOT the wrapped CLI commands. The CLI commands
# are registered automatically and won't have to be imported here. # are registered automatically and won't have to be imported here.
from .benchmark_speed import benchmark_speed_cli # noqa: F401 from .benchmark_speed import benchmark_speed_cli # noqa: F401
from .convert import convert # noqa: F401
from .debug_config import debug_config # noqa: F401
from .debug_data import debug_data # noqa: F401
from .debug_diff import debug_diff # noqa: F401
from .debug_model import debug_model # noqa: F401
from .download import download # noqa: F401 from .download import download # noqa: F401
from .evaluate import evaluate # noqa: F401
from .find_threshold import find_threshold # noqa: F401
from .info import info # noqa: F401 from .info import info # noqa: F401
from .init_config import fill_config, init_config # noqa: F401
from .init_pipeline import init_pipeline_cli # noqa: F401
from .package import package # noqa: F401 from .package import package # noqa: F401
from .pretrain import pretrain # noqa: F401
from .profile import profile # noqa: F401 from .profile import profile # noqa: F401
from .train import train_cli # noqa: F401 from .train import train_cli # noqa: F401
from .assemble import assemble_cli # noqa: F401
from .pretrain import pretrain # noqa: F401
from .debug_data import debug_data # noqa: F401
from .debug_config import debug_config # noqa: F401
from .debug_model import debug_model # noqa: F401
from .debug_diff import debug_diff # noqa: F401
from .evaluate import evaluate # noqa: F401
from .apply import apply # noqa: F401
from .convert import convert # noqa: F401
from .init_pipeline import init_pipeline_cli # noqa: F401
from .init_config import init_config, fill_config # noqa: F401
from .validate import validate # noqa: F401 from .validate import validate # noqa: F401
from .project.clone import project_clone # noqa: F401
from .project.assets import project_assets # noqa: F401
from .project.run import project_run # noqa: F401
from .project.dvc import project_update_dvc # noqa: F401
from .project.push import project_push # noqa: F401
from .project.pull import project_pull # noqa: F401
from .project.document import project_document # noqa: F401
from .find_threshold import find_threshold # noqa: F401
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True) @app.command("link", no_args_is_help=True, deprecated=True, hidden=True)

View File

@ -1,26 +1,45 @@
from typing import Dict, Any, Union, List, Optional, Tuple, Iterable
from typing import TYPE_CHECKING, overload
import sys
import shutil
from pathlib import Path
from wasabi import msg, Printer
import srsly
import hashlib import hashlib
import os
import shutil
import sys
from configparser import InterpolationError
from contextlib import contextmanager
from pathlib import Path
from typing import (
TYPE_CHECKING,
Any,
Dict,
Iterable,
List,
Optional,
Tuple,
Union,
overload,
)
import srsly
import typer import typer
from click import NoSuchOption from click import NoSuchOption
from click.parser import split_arg_string from click.parser import split_arg_string
from typer.main import get_command
from contextlib import contextmanager
from thinc.api import Config, ConfigValidationError, require_gpu from thinc.api import Config, ConfigValidationError, require_gpu
from thinc.util import gpu_is_available from thinc.util import gpu_is_available
from configparser import InterpolationError from typer.main import get_command
import os from wasabi import Printer, msg
from weasel import app as project_cli
from ..compat import Literal
from ..schemas import ProjectConfigSchema, validate
from ..util import import_file, run_command, make_tempdir, registry, logger
from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
from .. import about from .. import about
from ..compat import Literal
from ..schemas import validate
from ..util import (
ENV_VARS,
SimpleFrozenDict,
import_file,
is_compatible_version,
logger,
make_tempdir,
registry,
run_command,
)
if TYPE_CHECKING: if TYPE_CHECKING:
from pathy import FluidPath # noqa: F401 from pathy import FluidPath # noqa: F401
@ -30,7 +49,6 @@ SDIST_SUFFIX = ".tar.gz"
WHEEL_SUFFIX = "-py3-none-any.whl" WHEEL_SUFFIX = "-py3-none-any.whl"
PROJECT_FILE = "project.yml" PROJECT_FILE = "project.yml"
PROJECT_LOCK = "project.lock"
COMMAND = "python -m spacy" COMMAND = "python -m spacy"
NAME = "spacy" NAME = "spacy"
HELP = """spaCy Command-line Interface HELP = """spaCy Command-line Interface
@ -56,11 +74,10 @@ Opt = typer.Option
app = typer.Typer(name=NAME, help=HELP) app = typer.Typer(name=NAME, help=HELP)
benchmark_cli = typer.Typer(name="benchmark", help=BENCHMARK_HELP, no_args_is_help=True) benchmark_cli = typer.Typer(name="benchmark", help=BENCHMARK_HELP, no_args_is_help=True)
project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True) debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True)
init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True) init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True)
app.add_typer(project_cli) app.add_typer(project_cli, name="project", help=PROJECT_HELP, no_args_is_help=True)
app.add_typer(debug_cli) app.add_typer(debug_cli)
app.add_typer(benchmark_cli) app.add_typer(benchmark_cli)
app.add_typer(init_cli) app.add_typer(init_cli)
@ -135,148 +152,6 @@ def _parse_override(value: Any) -> Any:
return str(value) return str(value)
def load_project_config(
    path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict()
) -> Dict[str, Any]:
    """Read the project.yml in a project directory, validate it and return it.

    The file is checked against ProjectConfigSchema, then passed through
    validate_project_version and validate_project_commands. Any entry listed
    under the config's "directories" key that doesn't exist yet (relative to
    the project directory) is created.

    path (Path): The path to the project directory.
    interpolate (bool): Whether to substitute project variables.
    overrides (Dict[str, Any]): Optional config overrides.
    RETURNS (Dict[str, Any]): The loaded project.yml.
    """
    project_file = path / PROJECT_FILE
    if not project_file.exists():
        msg.fail(f"Can't find {PROJECT_FILE}", project_file, exits=1)
    invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct."
    try:
        config = srsly.read_yaml(project_file)
    except ValueError as e:
        msg.fail(invalid_err, e, exits=1)
    schema_errors = validate(ProjectConfigSchema, config)
    if schema_errors:
        # Print the headline first, then the individual schema violations
        msg.fail(invalid_err)
        print("\n".join(schema_errors))
        sys.exit(1)
    validate_project_version(config)
    validate_project_commands(config)
    if interpolate:
        title = f"{PROJECT_FILE} validation error"
        with show_validation_error(title=title, hint_fill=False):
            config = substitute_project_variables(config, overrides)
    # Create every directory declared in the config that is still missing
    for name in config.get("directories", []):
        directory = path / name
        if directory.exists():
            continue
        directory.mkdir(parents=True)
    return config
def substitute_project_variables(
    config: Dict[str, Any],
    overrides: Dict[str, Any] = SimpleFrozenDict(),
    key: str = "vars",
    env_key: str = "env",
) -> Dict[str, Any]:
    """Resolve variable references in the project config via the config system.

    Note that the passed-in config is modified in place: the `key` and
    `env_key` sections are created if absent, and the values of the `env_key`
    section are replaced by the values read from the environment.

    config (Dict[str, Any]): The project config.
    overrides (Dict[str, Any]): Optional config overrides.
    key (str): Key containing variables in project config.
    env_key (str): Key containing environment variable mapping in project config.
    RETURNS (Dict[str, Any]): The interpolated project config.
    """
    variables = config.setdefault(key, {})
    env_mapping = config.setdefault(env_key, {})
    # Swap each environment variable name for its current value; unset
    # variables are looked up as an empty string.
    for name, env_name in env_mapping.items():
        env_mapping[name] = _parse_override(os.environ.get(env_name, ""))
    # The variables have to be repeated in the top scope so the whole project
    # can live in a top-level "project" section (a list of commands in the top
    # scope wouldn't be allowed by Thinc's config system).
    top_level = {"project": config, key: variables, env_key: env_mapping}
    serialized = Config(top_level).to_str()
    resolved = Config().from_str(serialized, overrides=overrides).interpolate()
    return dict(resolved["project"])
def validate_project_version(config: Dict[str, Any]) -> None:
    """Fail if the project declares a spaCy version range that the running
    spaCy version doesn't satisfy. Projects without a "spacy_version" entry
    are accepted as they are.

    config (Dict[str, Any]): The loaded config.
    """
    spacy_version = config.get("spacy_version", None)
    # Nothing to report: no requirement given, or the requirement is met
    if not spacy_version or is_compatible_version(about.__version__, spacy_version):
        return
    msg.fail(
        f"The {PROJECT_FILE} specifies a spaCy version range ({spacy_version}) "
        f"that's not compatible with the version of spaCy you're running "
        f"({about.__version__}). You can edit version requirement in the "
        f"{PROJECT_FILE} to load it, but the project may not run as expected.",
        exits=1,
    )
def validate_project_commands(config: Dict[str, Any]) -> None:
    """Check that project commands and workflows are valid, don't contain
    duplicates, don't clash and only refer to commands that exist. Every
    violation is reported through msg.fail with exits=1.

    config (Dict[str, Any]): The loaded config.
    """
    command_names = [cmd["name"] for cmd in config.get("commands", [])]
    workflows = config.get("workflows", {})
    # Single pass: a name that was seen before is a duplicate. This replaces a
    # list.count() call per command, which was quadratic.
    known_commands = set()
    duplicates = set()
    for name in command_names:
        if name in known_commands:
            duplicates.add(name)
        known_commands.add(name)
    if duplicates:
        # Sorted so the message is the same on every run
        err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(sorted(duplicates))}"
        msg.fail(err, exits=1)
    for workflow_name, workflow_steps in workflows.items():
        if workflow_name in known_commands:
            err = f"Can't use workflow name '{workflow_name}': name already exists as a command"
            msg.fail(err, exits=1)
        for step in workflow_steps:
            if step not in known_commands:
                msg.fail(
                    f"Unknown command specified in workflow '{workflow_name}': {step}",
                    f"Workflows can only refer to commands defined in the 'commands' "
                    f"section of the {PROJECT_FILE}.",
                    exits=1,
                )
def get_hash(data, exclude: Iterable[str] = tuple()) -> str:
    """Get the hash for a JSON-serializable object.

    The object is dumped to JSON with sorted keys, so two dicts with the same
    content hash identically regardless of insertion order.

    data: The data to hash.
    exclude (Iterable[str]): Top-level keys to exclude if data is a dict.
    RETURNS (str): The hash, as an MD5 hex digest.
    """
    if isinstance(data, dict):
        # Materialize once: `exclude` may be a one-shot iterator, which would
        # be exhausted by the first membership test and let later keys through.
        excluded = frozenset(exclude)
        data = {k: v for k, v in data.items() if k not in excluded}
    data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8")
    return hashlib.md5(data_str).hexdigest()
def get_checksum(path: Union[Path, str]) -> str:
    """Get the checksum for a file or directory given its file path. If a
    directory path is provided, this uses the contents of all files below that
    directory (recursively, in sorted path order). Only file contents feed the
    checksum, not file names.

    path (Union[Path, str]): The file or directory path.
    RETURNS (str): The checksum, as an MD5 hex digest.
    """
    path = Path(path)
    if not (path.is_file() or path.is_dir()):
        msg.fail(f"Can't get checksum for {path}: not a file or directory", exits=1)
    if path.is_file():
        files = [path]
    else:
        # TODO: this is currently pretty slow
        files = sorted(fp for fp in path.rglob("*") if fp.is_file())
    checksum = hashlib.md5()
    for file_path in files:
        # Feed the hash in chunks so large files are never held in memory as a
        # whole; the digest equals hashing the concatenated bytes.
        with file_path.open("rb") as file_:
            for chunk in iter(lambda: file_.read(1024 * 1024), b""):
                checksum.update(chunk)
    return checksum.hexdigest()
@contextmanager @contextmanager
def show_validation_error( def show_validation_error(
file_path: Optional[Union[str, Path]] = None, file_path: Optional[Union[str, Path]] = None,
@ -334,166 +209,10 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1) msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
def upload_file(src: Path, dest: Union[str, "FluidPath"]) -> None:
    """Upload a file using smart_open.

    src (Path): The source path.
    dest (Union[str, FluidPath]): The destination path or URL to upload to.
        Missing parent directories are created only when dest is a Path.
    """
    import smart_open

    # Create parent directories for local paths
    if isinstance(dest, Path):
        if not dest.parent.exists():
            dest.parent.mkdir(parents=True)
    dest = str(dest)
    with smart_open.open(dest, mode="wb") as output_file:
        with src.open(mode="rb") as input_file:
            # Stream the content instead of reading the whole file into
            # memory, mirroring what download_file does.
            shutil.copyfileobj(input_file, output_file)
def download_file(
    src: Union[str, "FluidPath"], dest: Path, *, force: bool = False
) -> None:
    """Fetch a file with smart_open and store it at a local path.

    src (Union[str, FluidPath]): The path or URL of the file to fetch.
    dest (Path): The destination path.
    force (bool): Whether to force download even if file exists.
        If False, the download will be skipped.
    """
    import smart_open

    if force or not dest.exists():
        # compression="disable": the content is copied through exactly as
        # smart_open delivers it
        with smart_open.open(
            str(src), mode="rb", compression="disable"
        ) as input_file, dest.open(mode="wb") as output_file:
            shutil.copyfileobj(input_file, output_file)
    return None
def ensure_pathy(path):
    """Temporary helper that wraps a path with Pathy.fluid. The pathy import
    is kept inside the function so Pathy isn't imported globally (which can
    cause slow and annoying Google Cloud warning)."""
    import pathy

    return pathy.Pathy.fluid(path)
def git_checkout(
    repo: str, subpath: str, dest: Path, *, branch: str = "master", sparse: bool = False
):
    """Copy one subdirectory of a Git repository to a new local directory.

    With sparse=True and Git >= 2.22 the work is delegated to
    git_sparse_checkout. Otherwise the whole branch is cloned into a temporary
    directory and the requested subpath is copied out of it.

    repo (str): The repository to clone from.
    subpath (str): Path within the repository to check out.
    dest (Path): Where to put the result. Must not exist yet, but its parent
        directory must.
    branch (str): The branch to clone.
    sparse (bool): Whether to try a sparse checkout.
    """
    git_version = get_git_version()
    if dest.exists():
        msg.fail("Destination of checkout must not exist", exits=1)
    if not dest.parent.exists():
        msg.fail("Parent of destination of checkout must exist", exits=1)
    if sparse:
        if git_version >= (2, 22):
            return git_sparse_checkout(repo, subpath, dest, branch)
        # Only show warnings if the user explicitly wants sparse checkout but
        # the Git version doesn't support it
        if git_version == (0, 0):
            reason = "You're running an unknown version of Git, so sparse checkout has been disabled."
        else:
            reason = (
                f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
                f"that doesn't fully support sparse checkout yet."
            )
        msg.warn(
            f"{reason} "
            f"This means that more files than necessary may be downloaded "
            f"temporarily. To only download the files needed, make sure "
            f"you're using Git v2.22 or above."
        )
    with make_tempdir() as tmp_dir:
        run_command(f"git -C {tmp_dir} clone {repo} . -b {branch}", capture=True)
        try:
            # Going through Path(subpath) keeps nested subdirectories working
            source_path = tmp_dir / Path(subpath)
            if not is_subpath_of(tmp_dir, source_path):
                msg.fail(
                    f"'{subpath}' is a path outside of the cloned repository.",
                    repo,
                    exits=1,
                )
            shutil.copytree(str(source_path), str(dest))
        except FileNotFoundError:
            msg.fail(
                f"Can't clone {subpath}. Make sure the directory exists in the repo (branch '{branch}')",
                repo,
                exits=1,
            )
def git_sparse_checkout(repo, subpath, dest, branch):
    """Fetch only one subpath of a repository and move it to dest, using a
    partial clone followed by a bulk fetch of the objects that subpath needs.

    repo: The repository to clone from.
    subpath: Path within the repository to check out.
    dest: Where the checked-out subpath is moved to.
    branch: The branch to clone and check out.
    """
    # Why this is so roundabout: the usual "sparse checkout" recipes still
    # clone the whole repository first, and cloning just part of a repository
    # by path isn't something servers enable. So instead we clone with all
    # file contents filtered out. Checking out the path right away would work,
    # but slowly, because every missing object would be transferred one by
    # one. Hence the last piece: list the missing objects for our path and
    # fetch them in bulk via Git internals.
    with make_tempdir() as tmp_dir:
        # Step 1: clone, but don't download any file contents
        clone_cmd = (
            f"git clone {repo} {tmp_dir} --no-checkout --depth 1 "
            f"-b {branch} --filter=blob:none"
        )
        run_command(clone_cmd)
        # Step 2: find the objects that are missing for the subpath we want.
        # With --missing=print, rev-list marks those with a leading "?".
        list_cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
        listing = run_command(list_cmd, capture=True)
        git_repo = _http_to_git(repo)
        missing_ids = []
        for token in listing.stdout.split():
            if token.startswith("?"):
                missing_ids.append(token[1:])
        missings = " ".join(missing_ids)
        if not missings:
            msg.fail(
                f"Could not find any relevant files for '{subpath}'. "
                f"Did you specify a correct and complete path within repo '{repo}' "
                f"and branch {branch}?",
                exits=1,
            )
        # Step 3: fetch all missing objects in one go
        run_command(f"git -C {tmp_dir} fetch-pack {git_repo} {missings}", capture=True)
        # Step 4: now the subpath can be checked out
        run_command(f"git -C {tmp_dir} checkout {branch} {subpath}", capture=True)
        # Get a subdirectory of the cloned path, if appropriate
        source_path = tmp_dir / Path(subpath)
        if not is_subpath_of(tmp_dir, source_path):
            msg.fail(
                f"'{subpath}' is a path outside of the cloned repository.",
                repo,
                exits=1,
            )
        shutil.move(str(source_path), str(dest))
def git_repo_branch_exists(repo: str, branch: str) -> bool:
    """Check with 'git ls-remote' whether a branch exists on a repository.

    repo (str): URL to get repo.
    branch (str): Branch on repo to check.
    RETURNS (bool): True if repo:branch exists.
    """
    # Called only for its check that 'git' can be run; the version is unused
    get_git_version()
    # `--exit-code` is deliberately not passed to `git ls-remote`:
    # `run_command` handles the `returncode` for us, so the check relies on
    # stdout being '' when the requested branch doesn't exist.
    result = run_command(f"git ls-remote {repo} {branch}", capture=True)
    return result.stdout != ""
def get_git_version( def get_git_version(
error: str = "Could not run 'git'. Make sure it's installed and the executable is available.", error: str = "Could not run 'git'. Make sure it's installed and the executable is available.",
) -> Tuple[int, int]: ) -> Tuple[int, int]:
"""Get the version of git and raise an error if calling 'git --version' fails. """Get the version of git and raise an error if calling 'git --version' fails.
error (str): The error message to show. error (str): The error message to show.
RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns
(0, 0) if the version couldn't be determined. (0, 0) if the version couldn't be determined.
@ -509,30 +228,6 @@ def get_git_version(
return int(version[0]), int(version[1]) return int(version[0]), int(version[1])
def _http_to_git(repo: str) -> str:
if repo.startswith("http://"):
repo = repo.replace(r"http://", r"https://")
if repo.startswith(r"https://"):
repo = repo.replace("https://", "git@").replace("/", ":", 1)
if repo.endswith("/"):
repo = repo[:-1]
repo = f"{repo}.git"
return repo
def is_subpath_of(parent, child):
    """
    Check whether `child` is a path contained within `parent`. Both paths are
    resolved with os.path.realpath first, so symlinks and ".." segments are
    taken into account. A path counts as contained within itself.

    parent: The path of the presumed container.
    child: The path to check.
    RETURNS (bool): True if `child` resolves to `parent` or to a path below it.
    """
    # Based on https://stackoverflow.com/a/37095733 .
    # In Python 3.9, the `Path.is_relative_to()` method will supplant this, so
    # we can stop using crusty old os.path functions.
    parent_realpath = os.path.realpath(parent)
    child_realpath = os.path.realpath(child)
    try:
        common = os.path.commonpath([parent_realpath, child_realpath])
    except ValueError:
        # commonpath refuses paths without any common ground (e.g. different
        # drives on Windows); such a child is certainly not inside parent.
        return False
    return common == parent_realpath
@overload @overload
def string_to_list(value: str, intify: Literal[False] = ...) -> List[str]: def string_to_list(value: str, intify: Literal[False] = ...) -> List[str]:
... ...

View File

@ -1,18 +1,15 @@
import tqdm
import srsly
from itertools import chain from itertools import chain
from pathlib import Path from pathlib import Path
from typing import Optional, List, Iterable, cast, Union from typing import Iterable, List, Optional, Union, cast
import srsly
import tqdm
from wasabi import msg from wasabi import msg
from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory
from ..tokens import Doc, DocBin from ..tokens import Doc, DocBin
from ..vocab import Vocab
from ..util import ensure_path, load_model from ..util import ensure_path, load_model
from ..vocab import Vocab
from ._util import Arg, Opt, app, import_code, setup_gpu, walk_directory
path_help = """Location of the documents to predict on. path_help = """Location of the documents to predict on.
Can be a single file in .spacy format or a .jsonl file. Can be a single file in .spacy format or a .jsonl file.

View File

@ -1,13 +1,20 @@
from typing import Optional
from pathlib import Path
from wasabi import msg
import typer
import logging import logging
from pathlib import Path
from typing import Optional
import typer
from wasabi import msg
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code
from .. import util from .. import util
from ..util import get_sourced_components, load_model_from_config from ..util import get_sourced_components, load_model_from_config
from ._util import (
Arg,
Opt,
app,
import_code,
parse_config_overrides,
show_validation_error,
)
@app.command( @app.command(

View File

@ -1,11 +1,12 @@
from typing import Iterable, List, Optional
import random import random
from itertools import islice
import numpy
from pathlib import Path
import time import time
from tqdm import tqdm from itertools import islice
from pathlib import Path
from typing import Iterable, List, Optional
import numpy
import typer import typer
from tqdm import tqdm
from wasabi import msg from wasabi import msg
from .. import util from .. import util

View File

@ -1,18 +1,22 @@
from typing import Callable, Iterable, Mapping, Optional, Any, Union import itertools
from enum import Enum
from pathlib import Path
from wasabi import Printer
import srsly
import re import re
import sys import sys
import itertools from enum import Enum
from pathlib import Path
from typing import Any, Callable, Iterable, Mapping, Optional, Union
import srsly
from wasabi import Printer
from ._util import app, Arg, Opt, walk_directory
from ..training import docs_to_json
from ..tokens import Doc, DocBin from ..tokens import Doc, DocBin
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs from ..training import docs_to_json
from ..training.converters import conllu_to_docs from ..training.converters import (
conll_ner_to_docs,
conllu_to_docs,
iob_to_docs,
json_to_docs,
)
from ._util import Arg, Opt, app, walk_directory
# Converters are matched by file extension except for ner/iob, which are # Converters are matched by file extension except for ner/iob, which are
# matched by file extension and content. To add a converter, add a new # matched by file extension and content. To add a converter, add a new

View File

@ -1,15 +1,22 @@
from typing import Optional, Dict, Any, Union, List
from pathlib import Path from pathlib import Path
from wasabi import msg, table from typing import Any, Dict, List, Optional, Union
import typer
from thinc.api import Config from thinc.api import Config
from thinc.config import VARIABLE_RE from thinc.config import VARIABLE_RE
import typer from wasabi import msg, table
from ._util import Arg, Opt, show_validation_error, parse_config_overrides from .. import util
from ._util import import_code, debug_cli
from ..schemas import ConfigSchemaInit, ConfigSchemaTraining from ..schemas import ConfigSchemaInit, ConfigSchemaTraining
from ..util import registry from ..util import registry
from .. import util from ._util import (
Arg,
Opt,
debug_cli,
import_code,
parse_config_overrides,
show_validation_error,
)
@debug_cli.command( @debug_cli.command(

View File

@ -1,31 +1,49 @@
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
from typing import cast, overload
from pathlib import Path
from collections import Counter
import sys
import srsly
from wasabi import Printer, MESSAGES, msg
import typer
import math import math
import numpy import sys
from collections import Counter
from pathlib import Path
from typing import (
Any,
Dict,
Iterable,
List,
Optional,
Sequence,
Set,
Tuple,
Union,
cast,
overload,
)
from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides import numpy
from ._util import import_code, debug_cli, _format_number import srsly
from ..training import Example, remove_bilu_prefix import typer
from ..training.initialize import get_sourced_components from wasabi import MESSAGES, Printer, msg
from ..schemas import ConfigSchemaTraining
from ..pipeline import TrainablePipe from .. import util
from ..compat import Literal
from ..language import Language
from ..morphology import Morphology
from ..pipeline import Morphologizer, SpanCategorizer, TrainablePipe
from ..pipeline._edit_tree_internals.edit_trees import EditTrees
from ..pipeline._parser_internals import nonproj from ..pipeline._parser_internals import nonproj
from ..pipeline._parser_internals.nonproj import DELIMITER from ..pipeline._parser_internals.nonproj import DELIMITER
from ..pipeline import Morphologizer, SpanCategorizer from ..schemas import ConfigSchemaTraining
from ..pipeline._edit_tree_internals.edit_trees import EditTrees from ..training import Example, remove_bilu_prefix
from ..morphology import Morphology from ..training.initialize import get_sourced_components
from ..language import Language
from ..util import registry, resolve_dot_names from ..util import registry, resolve_dot_names
from ..compat import Literal
from ..vectors import Mode as VectorsMode from ..vectors import Mode as VectorsMode
from .. import util from ._util import (
Arg,
Opt,
_format_number,
app,
debug_cli,
import_code,
parse_config_overrides,
show_validation_error,
)
# Minimum number of expected occurrences of NER label in data to train new label # Minimum number of expected occurrences of NER label in data to train new label
NEW_LABEL_THRESHOLD = 50 NEW_LABEL_THRESHOLD = 50
@ -212,7 +230,7 @@ def debug_data(
else: else:
msg.info("No word vectors present in the package") msg.info("No word vectors present in the package")
if "spancat" in factory_names: if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
model_labels_spancat = _get_labels_from_spancat(nlp) model_labels_spancat = _get_labels_from_spancat(nlp)
has_low_data_warning = False has_low_data_warning = False
has_no_neg_warning = False has_no_neg_warning = False
@ -830,7 +848,7 @@ def _compile_gold(
data["boundary_cross_ents"] += 1 data["boundary_cross_ents"] += 1
elif label == "-": elif label == "-":
data["ner"]["-"] += 1 data["ner"]["-"] += 1
if "spancat" in factory_names: if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
for spans_key in list(eg.reference.spans.keys()): for spans_key in list(eg.reference.spans.keys()):
# Obtain the span frequency # Obtain the span frequency
if spans_key not in data["spancat"]: if spans_key not in data["spancat"]:
@ -1028,7 +1046,7 @@ def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]:
pipe_names = [ pipe_names = [
pipe_name pipe_name
for pipe_name in nlp.pipe_names for pipe_name in nlp.pipe_names
if nlp.get_pipe_meta(pipe_name).factory == "spancat" if nlp.get_pipe_meta(pipe_name).factory in ("spancat", "spancat_singlelabel")
] ]
labels: Dict[str, Set[str]] = {} labels: Dict[str, Set[str]] = {}
for pipe_name in pipe_names: for pipe_name in pipe_names:

View File

@ -1,13 +1,13 @@
from pathlib import Path
from typing import Optional from typing import Optional
import typer import typer
from wasabi import Printer, diff_strings, MarkdownRenderer
from pathlib import Path
from thinc.api import Config from thinc.api import Config
from wasabi import MarkdownRenderer, Printer, diff_strings
from ._util import debug_cli, Arg, Opt, show_validation_error, parse_config_overrides
from ..util import load_config from ..util import load_config
from .init_config import init_config, Optimizations from ._util import Arg, Opt, debug_cli, parse_config_overrides, show_validation_error
from .init_config import Optimizations, init_config
@debug_cli.command( @debug_cli.command(

View File

@ -1,19 +1,32 @@
from typing import Dict, Any, Optional
from pathlib import Path
import itertools import itertools
from pathlib import Path
from typing import Any, Dict, Optional
import typer
from thinc.api import (
Model,
data_validation,
fix_random_seed,
set_dropout_rate,
set_gpu_allocator,
)
from wasabi import msg
from spacy.training import Example from spacy.training import Example
from spacy.util import resolve_dot_names from spacy.util import resolve_dot_names
from wasabi import msg
from thinc.api import fix_random_seed, set_dropout_rate
from thinc.api import Model, data_validation, set_gpu_allocator
import typer
from ._util import Arg, Opt, debug_cli, show_validation_error from .. import util
from ._util import parse_config_overrides, string_to_list, setup_gpu
from ..schemas import ConfigSchemaTraining from ..schemas import ConfigSchemaTraining
from ..util import registry from ..util import registry
from .. import util from ._util import (
Arg,
Opt,
debug_cli,
parse_config_overrides,
setup_gpu,
show_validation_error,
string_to_list,
)
@debug_cli.command( @debug_cli.command(

View File

@ -1,14 +1,14 @@
from typing import Optional, Sequence
import requests
import sys import sys
from wasabi import msg from typing import Optional, Sequence
import typer
import requests
import typer
from wasabi import msg
from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX
from .. import about from .. import about
from ..util import is_package, get_minor_version, run_command
from ..util import is_prerelease_version
from ..errors import OLD_MODEL_SHORTCUTS from ..errors import OLD_MODEL_SHORTCUTS
from ..util import get_minor_version, is_package, is_prerelease_version, run_command
from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app
@app.command( @app.command(
@ -81,11 +81,8 @@ def download(
def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str: def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
dl_tpl = "{m}-{v}/{m}-{v}{s}" dl_tpl = "{m}-{v}/{m}-{v}{s}"
egg_tpl = "#egg={m}=={v}"
suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
filename = dl_tpl.format(m=model_name, v=version, s=suffix) filename = dl_tpl.format(m=model_name, v=version, s=suffix)
if sdist:
filename += egg_tpl.format(m=model_name, v=version)
return filename return filename

View File

@ -1,16 +1,16 @@
from typing import Optional, List, Dict, Any, Union
from wasabi import Printer
from pathlib import Path
import re import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
import srsly import srsly
from thinc.api import fix_random_seed from thinc.api import fix_random_seed
from wasabi import Printer
from ..training import Corpus from .. import displacy, util
from ..tokens import Doc
from ._util import app, Arg, Opt, setup_gpu, import_code, benchmark_cli
from ..scorer import Scorer from ..scorer import Scorer
from .. import util from ..tokens import Doc
from .. import displacy from ..training import Corpus
from ._util import Arg, Opt, app, benchmark_cli, import_code, setup_gpu
@benchmark_cli.command( @benchmark_cli.command(
@ -27,6 +27,7 @@ def evaluate_cli(
gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"), gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False), displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"), displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
per_component: bool = Opt(False, "--per-component", "-P", help="Return scores per component, only applicable when an output JSON file is specified."),
# fmt: on # fmt: on
): ):
""" """
@ -50,6 +51,7 @@ def evaluate_cli(
gold_preproc=gold_preproc, gold_preproc=gold_preproc,
displacy_path=displacy_path, displacy_path=displacy_path,
displacy_limit=displacy_limit, displacy_limit=displacy_limit,
per_component=per_component,
silent=False, silent=False,
) )
@ -64,6 +66,7 @@ def evaluate(
displacy_limit: int = 25, displacy_limit: int = 25,
silent: bool = True, silent: bool = True,
spans_key: str = "sc", spans_key: str = "sc",
per_component: bool = False,
) -> Dict[str, Any]: ) -> Dict[str, Any]:
msg = Printer(no_print=silent, pretty=not silent) msg = Printer(no_print=silent, pretty=not silent)
fix_random_seed() fix_random_seed()
@ -78,7 +81,16 @@ def evaluate(
corpus = Corpus(data_path, gold_preproc=gold_preproc) corpus = Corpus(data_path, gold_preproc=gold_preproc)
nlp = util.load_model(model) nlp = util.load_model(model)
dev_dataset = list(corpus(nlp)) dev_dataset = list(corpus(nlp))
scores = nlp.evaluate(dev_dataset) scores = nlp.evaluate(dev_dataset, per_component=per_component)
if per_component:
data = scores
if output is None:
msg.warn(
"The per-component option is enabled but there is no output JSON file provided to save the scores to."
)
else:
msg.info("Per-component scores will be saved to output JSON file.")
else:
metrics = { metrics = {
"TOK": "token_acc", "TOK": "token_acc",
"TAG": "tag_acc", "TAG": "tag_acc",
@ -122,6 +134,8 @@ def evaluate(
docs = list(nlp.pipe(ex.reference.text for ex in dev_dataset[:displacy_limit])) docs = list(nlp.pipe(ex.reference.text for ex in dev_dataset[:displacy_limit]))
render_deps = "parser" in factory_names render_deps = "parser" in factory_names
render_ents = "ner" in factory_names render_ents = "ner" in factory_names
render_spans = "spancat" in factory_names
render_parses( render_parses(
docs, docs,
displacy_path, displacy_path,
@ -129,6 +143,7 @@ def evaluate(
limit=displacy_limit, limit=displacy_limit,
deps=render_deps, deps=render_deps,
ents=render_ents, ents=render_ents,
spans=render_spans,
) )
msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path) msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
@ -182,6 +197,7 @@ def render_parses(
limit: int = 250, limit: int = 250,
deps: bool = True, deps: bool = True,
ents: bool = True, ents: bool = True,
spans: bool = True,
): ):
docs[0].user_data["title"] = model_name docs[0].user_data["title"] = model_name
if ents: if ents:
@ -195,6 +211,11 @@ def render_parses(
with (output_path / "parses.html").open("w", encoding="utf8") as file_: with (output_path / "parses.html").open("w", encoding="utf8") as file_:
file_.write(html) file_.write(html)
if spans:
html = displacy.render(docs[:limit], style="span", page=True)
with (output_path / "spans.html").open("w", encoding="utf8") as file_:
file_.write(html)
def print_prf_per_type( def print_prf_per_type(
msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str

View File

@ -1,17 +1,17 @@
import functools import functools
import logging
import operator import operator
from pathlib import Path from pathlib import Path
import logging from typing import Any, Dict, List, Optional, Tuple
from typing import Optional, Tuple, Any, Dict, List
import numpy import numpy
import wasabi.tables import wasabi.tables
from ..pipeline import TextCategorizer, MultiLabel_TextCategorizer
from ..errors import Errors
from ..training import Corpus
from ._util import app, Arg, Opt, import_code, setup_gpu
from .. import util from .. import util
from ..errors import Errors
from ..pipeline import MultiLabel_TextCategorizer, TextCategorizer
from ..training import Corpus
from ._util import Arg, Opt, app, import_code, setup_gpu
_DEFAULTS = { _DEFAULTS = {
"n_trials": 11, "n_trials": 11,

View File

@ -1,15 +1,15 @@
from typing import Optional, Dict, Any, Union, List
import platform
import json import json
import platform
from pathlib import Path from pathlib import Path
from wasabi import Printer, MarkdownRenderer from typing import Any, Dict, List, Optional, Union
import srsly
from ._util import app, Arg, Opt, string_to_list import srsly
from .download import get_model_filename, get_latest_version from wasabi import MarkdownRenderer, Printer
from .. import util
from .. import about from .. import about, util
from ..compat import importlib_metadata from ..compat import importlib_metadata
from ._util import Arg, Opt, app, string_to_list
from .download import get_latest_version, get_model_filename
@app.command("info") @app.command("info")

View File

@ -1,19 +1,26 @@
from typing import Optional, List, Tuple import re
from enum import Enum from enum import Enum
from pathlib import Path from pathlib import Path
from wasabi import Printer, diff_strings from typing import List, Optional, Tuple
from thinc.api import Config
import srsly import srsly
import re
from jinja2 import Template from jinja2 import Template
from thinc.api import Config
from wasabi import Printer, diff_strings
from .. import util from .. import util
from ..language import DEFAULT_CONFIG_PRETRAIN_PATH from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
from ..schemas import RecommendationSchema from ..schemas import RecommendationSchema
from ..util import SimpleFrozenList from ..util import SimpleFrozenList
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND from ._util import (
from ._util import string_to_list, import_code COMMAND,
Arg,
Opt,
import_code,
init_cli,
show_validation_error,
string_to_list,
)
ROOT = Path(__file__).parent / "templates" ROOT = Path(__file__).parent / "templates"
TEMPLATE_PATH = ROOT / "quickstart_training.jinja" TEMPLATE_PATH = ROOT / "quickstart_training.jinja"

View File

@ -1,15 +1,23 @@
from typing import Optional
import logging import logging
from pathlib import Path from pathlib import Path
from wasabi import msg from typing import Optional
import typer
import srsly import srsly
import typer
from wasabi import msg
from .. import util from .. import util
from ..training.initialize import init_nlp, convert_vectors
from ..language import Language from ..language import Language
from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error from ..training.initialize import convert_vectors, init_nlp
from ._util import import_code, setup_gpu from ._util import (
Arg,
Opt,
import_code,
init_cli,
parse_config_overrides,
setup_gpu,
show_validation_error,
)
@init_cli.command("vectors") @init_cli.command("vectors")
@ -24,6 +32,7 @@ def init_vectors_cli(
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"), name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True), jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
attr: str = Opt("ORTH", "--attr", "-a", help="Optional token attribute to use for vectors, e.g. LOWER or NORM"),
# fmt: on # fmt: on
): ):
"""Convert word vectors for use with spaCy. Will export an nlp object that """Convert word vectors for use with spaCy. Will export an nlp object that
@ -42,6 +51,7 @@ def init_vectors_cli(
prune=prune, prune=prune,
name=name, name=name,
mode=mode, mode=mode,
attr=attr,
) )
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors") msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
nlp.to_disk(output_dir) nlp.to_disk(output_dir)

View File

@ -1,18 +1,18 @@
from typing import Optional, Union, Any, Dict, List, Tuple, cast
import shutil
from pathlib import Path
from wasabi import Printer, MarkdownRenderer, get_raw_input
from thinc.api import Config
from collections import defaultdict
from catalogue import RegistryError
import srsly
import sys
import re import re
import shutil
import sys
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union, cast
from ._util import app, Arg, Opt, string_to_list, WHEEL_SUFFIX, SDIST_SUFFIX import srsly
from ..schemas import validate, ModelMetaSchema from catalogue import RegistryError
from .. import util from thinc.api import Config
from .. import about from wasabi import MarkdownRenderer, Printer, get_raw_input
from .. import about, util
from ..schemas import ModelMetaSchema, validate
from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app, string_to_list
@app.command("package") @app.command("package")

View File

@ -1,13 +1,21 @@
from typing import Optional
from pathlib import Path
from wasabi import msg
import typer
import re import re
from pathlib import Path
from typing import Optional
import typer
from wasabi import msg
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code, setup_gpu
from ..training.pretrain import pretrain from ..training.pretrain import pretrain
from ..util import load_config from ..util import load_config
from ._util import (
Arg,
Opt,
app,
import_code,
parse_config_overrides,
setup_gpu,
show_validation_error,
)
@app.command( @app.command(

View File

@ -1,17 +1,18 @@
from typing import Optional, Sequence, Union, Iterator
import tqdm
from pathlib import Path
import srsly
import cProfile import cProfile
import itertools
import pstats import pstats
import sys import sys
import itertools from pathlib import Path
from wasabi import msg, Printer from typing import Iterator, Optional, Sequence, Union
import typer
import srsly
import tqdm
import typer
from wasabi import Printer, msg
from ._util import app, debug_cli, Arg, Opt, NAME
from ..language import Language from ..language import Language
from ..util import load_model from ..util import load_model
from ._util import NAME, Arg, Opt, app, debug_cli
@debug_cli.command("profile") @debug_cli.command("profile")

View File

@ -1,206 +0,0 @@
from typing import Any, Dict, Optional
from pathlib import Path
from wasabi import msg
import os
import re
import shutil
import requests
import typer
from ...util import ensure_path, working_dir
from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
from .._util import get_checksum, download_file, git_checkout, get_git_version
from .._util import SimpleFrozenDict, parse_config_overrides
# Whether assets are extra if `extra` is not set.
# This is the fallback passed to asset.get("extra", ...) when filtering the
# asset list: with False, assets that omit the key are treated as regular
# assets and are fetched even when --extra is not given.
EXTRA_DEFAULT = False
@project_cli.command(
    "assets",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def project_assets_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
    # NOTE(review): "Git v22.2+" in the help text below looks like a typo for
    # "v2.22+" -- confirm before changing the user-facing string.
    sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v22.2+."),
    extra: bool = Opt(False, "--extra", "-e", help="Download all assets, including those marked as 'extra'.")
    # fmt: on
):
    """Fetch project assets like datasets and pretrained weights. Assets are
    defined in the "assets" section of the project.yml. If a checksum is
    provided in the project.yml, the file is only downloaded if no local file
    with the same checksum exists.

    Thin CLI wrapper: parses the extra command-line arguments into config
    overrides and delegates to project_assets().

    DOCS: https://spacy.io/api/cli#project-assets
    """
    # Unknown/extra CLI arguments are allowed by the command's context settings
    # and are forwarded as overrides for the project config.
    overrides = parse_config_overrides(ctx.args)
    project_assets(
        project_dir,
        overrides=overrides,
        sparse_checkout=sparse_checkout,
        extra=extra,
    )
def project_assets(
    project_dir: Path,
    *,
    overrides: Dict[str, Any] = SimpleFrozenDict(),
    sparse_checkout: bool = False,
    extra: bool = False,
) -> None:
    """Fetch the assets listed in a project's config. Assets with a "git"
    entry are checked out from the given repo, assets with a "url" are fetched
    via fetch_asset(), and assets with neither are only validated locally via
    check_private_asset().

    project_dir (Path): Path to project directory.
    overrides (Dict[str, Any]): Overrides passed on to load_project_config().
    sparse_checkout (bool): Use sparse checkout for assets provided via Git, to only check out and clone the files
        needed.
    extra (bool): Whether to download all assets, including those marked as 'extra'.
    """
    project_path = ensure_path(project_dir)
    config = load_project_config(project_path, overrides=overrides)
    # Keep every asset when --extra is set, otherwise only those not flagged
    # as extra (a missing flag falls back to EXTRA_DEFAULT).
    assets = [
        asset
        for asset in config.get("assets", [])
        if extra or not asset.get("extra", EXTRA_DEFAULT)
    ]
    if not assets:
        # exits=0 is passed to the printer here, so an empty asset list is
        # reported as a warning with a zero exit code.
        msg.warn(
            f"No assets specified in {PROJECT_FILE} (if assets are marked as extra, download them with --extra)",
            exits=0,
        )
    msg.info(f"Fetching {len(assets)} asset(s)")

    for asset in assets:
        dest = (project_dir / asset["dest"]).resolve()
        checksum = asset.get("checksum")
        if "git" in asset:
            git_err = (
                f"Cloning spaCy project templates requires Git and the 'git' command. "
                f"Make sure it's installed and that the executable is available."
            )
            # Called only for its check; git_err is the message to show on failure.
            get_git_version(error=git_err)
            if dest.exists():
                # If there's already a file, check for checksum
                if checksum and checksum == get_checksum(dest):
                    msg.good(
                        f"Skipping download with matching checksum: {asset['dest']}"
                    )
                    continue
                else:
                    # Stale or unverifiable destination: remove it so the
                    # checkout below starts from a clean path.
                    if dest.is_dir():
                        shutil.rmtree(dest)
                    else:
                        dest.unlink()
            if "repo" not in asset["git"] or asset["git"]["repo"] is None:
                msg.fail(
                    "A git asset must include 'repo', the repository address.", exits=1
                )
            if "path" not in asset["git"] or asset["git"]["path"] is None:
                msg.fail(
                    "A git asset must include 'path' - use \"\" to get the entire repository.",
                    exits=1,
                )
            git_checkout(
                asset["git"]["repo"],
                asset["git"]["path"],
                dest,
                branch=asset["git"].get("branch"),
                sparse=sparse_checkout,
            )
            msg.good(f"Downloaded asset {dest}")
        else:
            url = asset.get("url")
            if not url:
                # project.yml defines asset without URL that the user has to place
                check_private_asset(dest, checksum)
                continue
            fetch_asset(project_path, url, dest, checksum)
def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
    """Report on an asset that has no URL and therefore has to be supplied by
    the user. Only prints feedback; nothing is downloaded or modified.

    dest (Path): Destination path of the asset.
    checksum (Optional[str]): Optional checksum of the expected file.
    """
    # Guard: nothing on disk, so all we can do is tell the user to add it.
    if not Path(dest).exists():
        msg.warn(
            f"No URL provided for asset. You need to add this file yourself: {dest}"
        )
        return
    # File is present but there is nothing to verify it against.
    if not checksum:
        msg.good(f"Asset already exists: {dest}")
        return
    if checksum == get_checksum(dest):
        msg.good(f"Asset exists with matching checksum: {dest}")
    else:
        msg.fail(f"Asset available but with incorrect checksum: {dest}")
def fetch_asset(
    project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
) -> None:
    """Download (or copy) a single asset into the project. An existing local
    file is kept when its checksum matches; without a checksum, an existing
    zero-byte file is deleted before fetching.

    project_path (Path): Path to project directory.
    url (str): URL or path to asset.
    dest (Path): Destination of the asset, joined onto the project directory.
    checksum (Optional[str]): Optional expected checksum of local file.
    """
    target = (project_path / dest).resolve()
    if target.exists():
        if not checksum:
            # No checksum to compare against: the only broken state we can
            # detect is an empty file.
            if os.path.getsize(target) == 0:
                msg.warn(f"Asset exists but with size of 0 bytes, deleting: {dest}")
                os.remove(target)
        elif checksum == get_checksum(target):
            msg.good(f"Skipping download with matching checksum: {dest}")
            return
    # Create missing parent directories so the asset directory does not have
    # to be declared separately in the project.yml.
    parent_dir = target.parent
    if not parent_dir.exists():
        parent_dir.mkdir(parents=True)
    with working_dir(project_path):
        url = convert_asset_url(url)
        try:
            download_file(url, target)
            msg.good(f"Downloaded asset {dest}")
        except requests.exceptions.RequestException as err:
            # The "URL" may really be a local path: fall back to copying it.
            local_source = Path(url)
            if local_source.exists() and local_source.is_file():
                shutil.copy(url, str(target))
                msg.good(f"Copied local asset {dest}")
            else:
                msg.fail(f"Download failed: {dest}", err)
    if checksum and checksum != get_checksum(target):
        msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
def convert_asset_url(url: str) -> str:
    """Check and convert the asset URL if needed. A regular github.com page
    URL (not a release download and not already a /raw/ link) is rewritten to
    the corresponding raw.githubusercontent.com URL, with a warning.

    url (str): The asset URL.
    RETURNS (str): The converted URL, or the input URL if no conversion applies.
    """
    # Match the exact host only: the dot is escaped and the host has to be
    # followed by "/" or the end of the string, so look-alike hosts such as
    # "github.company.com" are not mistaken for github.com.
    is_github_page = re.match(r"https?://github\.com(/|$)", url) is not None
    # If the asset URL is a regular GitHub URL it's likely a mistake
    if is_github_page and "releases/download" not in url and "/raw/" not in url:
        # Rewrite the host only (first occurrence), leaving any later
        # "github.com" inside the path or query string untouched.
        converted = url.replace("github.com", "raw.githubusercontent.com", 1)
        converted = re.sub(r"/(tree|blob)/", "/", converted)
        msg.warn(
            "Downloading from a regular GitHub URL. This will only download "
            "the source of the page, not the actual file. Converting the URL "
            "to a raw URL.",
            converted,
        )
        return converted
    return url

View File

@ -1,115 +0,0 @@
from typing import Optional
from pathlib import Path
from wasabi import msg
import subprocess
import re
from ... import about
from ...util import ensure_path
from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
from .._util import git_checkout, get_git_version, git_repo_branch_exists
# Repository and branch used when the caller does not choose their own; both
# values are read from the package's `about` module.
DEFAULT_REPO = about.__projects__
DEFAULT_PROJECTS_BRANCH = about.__projects_branch__
# Branch names tried, in this order, when no branch is given for a repo other
# than DEFAULT_REPO.
DEFAULT_BRANCHES = ["main", "master"]
@project_cli.command("clone")
def project_clone_cli(
    # fmt: off
    name: str = Arg(..., help="The name of the template to clone"),
    dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
    repo: str = Opt(DEFAULT_REPO, "--repo", "-r", help="The repository to clone from"),
    branch: Optional[str] = Opt(None, "--branch", "-b", help=f"The branch to clone from. If not provided, will attempt {', '.join(DEFAULT_BRANCHES)}"),
    # NOTE(review): "Git v22.2+" in the help text below looks like a typo for
    # "v2.22+" -- confirm before changing the user-facing string.
    sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v22.2+.")
    # fmt: on
):
    """Clone a project template from a repository. Calls into "git" and will
    only download the files from the given subdirectory. The GitHub repo
    defaults to the official spaCy template repo, but can be customized
    (including using a private repo).

    DOCS: https://spacy.io/api/cli#project-clone
    """
    # Default destination: a directory in the cwd named after the last
    # component of the template name.
    if dest is None:
        dest = Path.cwd() / Path(name).parts[-1]
    if branch is None and repo == DEFAULT_REPO:
        branch = DEFAULT_PROJECTS_BRANCH

    if branch is not None:
        # An explicit (or default-repo) branch has to exist on the remote.
        if not git_repo_branch_exists(repo, branch):
            msg.fail(f"repo: {repo} (branch: {branch}) does not exist.", exits=1)
    else:
        # Custom repo without a branch: take the first conventional branch
        # name that exists, probing them lazily in order.
        branch = next(
            (
                candidate
                for candidate in DEFAULT_BRANCHES
                if git_repo_branch_exists(repo, candidate)
            ),
            None,
        )
        if branch is None:
            quoted_branches = ", ".join(f"'{b}'" for b in DEFAULT_BRANCHES)
            msg.fail(
                "No branch provided and attempted default "
                f"branches {quoted_branches} do not exist.",
                exits=1,
            )
    assert isinstance(branch, str)
    project_clone(name, dest, repo=repo, branch=branch, sparse_checkout=sparse_checkout)
def project_clone(
    name: str,
    dest: Path,
    *,
    repo: str = about.__projects__,
    branch: str = about.__projects_branch__,
    sparse_checkout: bool = False,
) -> None:
    """Clone one template directory out of a Git repository into a new local
    project directory and print follow-up hints.

    name (str): Name of subdirectory to clone.
    dest (Path): Destination path of cloned project.
    repo (str): URL of Git repo containing project templates.
    branch (str): The branch to clone from
    sparse_checkout (bool): Passed to git_checkout() as its `sparse` argument.
    """
    dest = ensure_path(dest)
    check_clone(name, dest, repo)
    project_dir = dest.resolve()
    # Shorter display name for messages: strip the GitHub URL prefix if present.
    repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo)
    try:
        git_checkout(repo, name, dest, branch=branch, sparse=sparse_checkout)
    except subprocess.CalledProcessError:
        msg.fail(
            f"Could not clone '{name}' from repo '{repo_name}' (branch '{branch}')",
            exits=1,
        )
    msg.good(f"Cloned '{name}' from '{repo_name}' (branch '{branch}')", project_dir)
    has_project_file = (project_dir / PROJECT_FILE).exists()
    if has_project_file:
        msg.good(f"Your project is now ready!")
    else:
        msg.warn(f"No {PROJECT_FILE} found in directory")
    print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
def check_clone(name: str, dest: Path, repo: str) -> None:
    """Validate the preconditions for cloning: Git must be available, the
    destination must not exist yet, and its parent directory must exist.
    Each failed check is reported via msg.fail with exits=1.

    name (str): Name of the directory to clone from the repo.
    dest (Path): Local destination of cloned directory.
    repo (str): URL of the repo to clone from.
    """
    missing_git_hint = "".join(
        [
            "Cloning spaCy project templates requires Git and the 'git' command. ",
            f"To clone a project without Git, copy the files from the '{name}' ",
            f"directory in the {repo} to {dest} manually.",
        ]
    )
    get_git_version(error=missing_git_hint)
    if not dest:
        msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
    # The clone has to create the directory itself, so it must not exist yet.
    if dest.exists():
        msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
    # Parents are not created for the user; they have to be there already.
    parent_dir = dest.parent
    if not parent_dir.exists():
        msg.fail(
            f"Can't clone project, parent directory doesn't exist: {parent_dir}. "
            f"Create the necessary folder(s) first before continuing.",
            exits=1,
        )

View File

@ -1,115 +0,0 @@
from pathlib import Path
from wasabi import msg, MarkdownRenderer
from ...util import working_dir
from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
# Base URL used to build the documentation links in the intro texts below.
DOCS_URL = "https://spacy.io"

# Markdown paragraphs added below the matching headings of the generated
# project documentation (project file, commands, workflows, assets).
INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the
project, as well as the available commands and workflows. For details, see the
[spaCy projects documentation]({DOCS_URL}/usage/projects)."""
INTRO_COMMANDS = f"""The following commands are defined by the project. They
can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run).
Commands are only re-run if their inputs have changed."""
INTRO_WORKFLOWS = f"""The following workflows are defined by the project. They
can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run)
and will run the specified commands in order. Commands are only re-run if their
inputs have changed."""
INTRO_ASSETS = f"""The following assets are defined by the project. They can
be fetched by running [`spacy project assets`]({DOCS_URL}/api/cli#project-assets)
in the project directory."""
# These markers are added to the Markdown and can be used to update the file in
# place if it already exists. Only the auto-generated part will be replaced.
MARKER_START = "<!-- SPACY PROJECT: AUTO-GENERATED DOCS START (do not remove) -->"
MARKER_END = "<!-- SPACY PROJECT: AUTO-GENERATED DOCS END (do not remove) -->"
# If this marker is used in an existing README, it's ignored and not replaced
MARKER_IGNORE = "<!-- SPACY PROJECT: IGNORE -->"
@project_cli.command("document")
def project_document_cli(
    # fmt: off
    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
    output_file: Path = Opt("-", "--output", "-o", help="Path to output Markdown file for output. Defaults to - for standard output"),
    no_emoji: bool = Opt(False, "--no-emoji", "-NE", help="Don't use emoji")
    # fmt: on
):
    """
    Auto-generate a README.md for a project. If the content is saved to a file,
    hidden markers are added so you can add custom content before or after the
    auto-generated section and only the auto-generated docs will be replaced
    when you re-run the command.

    Thin CLI wrapper around project_document().

    DOCS: https://spacy.io/api/cli#project-document
    """
    project_document(project_dir, output_file, no_emoji=no_emoji)
def project_document(
    project_dir: Path, output_file: Path, *, no_emoji: bool = False
) -> None:
    """Render Markdown documentation for a project from its config and either
    print it or write it to a file. When writing to an existing file that
    contains both markers, only the section between them is replaced; a file
    containing the ignore marker is left untouched.

    project_dir (Path): The project directory whose config is documented.
    output_file (Path): Output path; "-" means print to standard output.
    no_emoji (bool): Passed to the MarkdownRenderer to disable emoji.
    """
    is_stdout = str(output_file) == "-"
    config = load_project_config(project_dir)
    md = MarkdownRenderer(no_emoji=no_emoji)
    # Everything between MARKER_START and MARKER_END is the auto-generated part.
    md.add(MARKER_START)
    title = config.get("title")
    description = config.get("description")
    md.add(md.title(1, f"spaCy Project{f': {title}' if title else ''}", "🪐"))
    if description:
        md.add(description)
    md.add(md.title(2, PROJECT_FILE, "📋"))
    md.add(INTRO_PROJECT)
    # Commands
    cmds = config.get("commands", [])
    data = [(md.code(cmd["name"]), cmd.get("help", "")) for cmd in cmds]
    if data:
        md.add(md.title(3, "Commands", ""))
        md.add(INTRO_COMMANDS)
        md.add(md.table(data, ["Command", "Description"]))
    # Workflows
    wfs = config.get("workflows", {}).items()
    data = [(md.code(n), " &rarr; ".join(md.code(w) for w in stp)) for n, stp in wfs]
    if data:
        md.add(md.title(3, "Workflows", ""))
        md.add(INTRO_WORKFLOWS)
        md.add(md.table(data, ["Workflow", "Steps"]))
    # Assets
    assets = config.get("assets", [])
    data = []
    for a in assets:
        # Source label: "Git" wins over "URL"; assets with neither are "Local".
        source = "Git" if a.get("git") else "URL" if a.get("url") else "Local"
        dest_path = a["dest"]
        dest = md.code(dest_path)
        if source == "Local":
            # Only link assets if they're in the repo
            with working_dir(project_dir) as p:
                if (p / dest_path).exists():
                    dest = md.link(dest, dest_path)
        data.append((dest, source, a.get("description", "")))
    if data:
        md.add(md.title(3, "Assets", "🗂"))
        md.add(INTRO_ASSETS)
        md.add(md.table(data, ["File", "Source", "Description"]))
    md.add(MARKER_END)
    # Output result
    if is_stdout:
        print(md.text)
    else:
        content = md.text
        if output_file.exists():
            with output_file.open("r", encoding="utf8") as f:
                existing = f.read()
            if MARKER_IGNORE in existing:
                msg.warn("Found ignore marker in existing file: skipping", output_file)
                return
            if MARKER_START in existing and MARKER_END in existing:
                msg.info("Found existing file: only replacing auto-generated docs")
                # Keep the user's text before the start marker and after the
                # end marker, and swap in the freshly generated section.
                before = existing.split(MARKER_START)[0]
                # NOTE(review): split() without maxsplit means that if the end
                # marker occurs more than once, only the text between the first
                # and second occurrence is kept -- confirm this is acceptable.
                after = existing.split(MARKER_END)[1]
                content = f"{before}{content}{after}"
            else:
                msg.warn("Replacing existing file")
        with output_file.open("w", encoding="utf8") as f:
            f.write(content)
        msg.good("Saved project documentation", output_file)

View File

@ -1,207 +0,0 @@
"""This module contains helpers and subcommands for integrating spaCy projects
with Data Version Control (DVC). https://dvc.org"""
from typing import Dict, Any, List, Optional, Iterable
import subprocess
from pathlib import Path
from wasabi import msg
from .._util import PROJECT_FILE, load_project_config, get_hash, project_cli
from .._util import Arg, Opt, NAME, COMMAND
from ...util import working_dir, split_command, join_command, run_command
from ...util import SimpleFrozenList
# File name of the DVC config, looked up relative to the project directory.
DVC_CONFIG = "dvc.yaml"
# NOTE(review): not referenced in the visible code -- ensure_dvc() checks the
# literal ".dvc" instead; confirm whether it should use this constant.
DVC_DIR = ".dvc"
# Name of the project subcommand registered for updating the DVC config.
UPDATE_COMMAND = "dvc"
# Comment block written near the top of the generated DVC config, directly
# after the config hash line.
DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. If you've
# edited your {PROJECT_FILE}, you can regenerate this file by running:
# {COMMAND} project {UPDATE_COMMAND}"""
@project_cli.command(UPDATE_COMMAND)
def project_update_dvc_cli(
    # fmt: off
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
    verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
    quiet: bool = Opt(False, "--quiet", "-q", help="Print less info"),
    force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
    # fmt: on
):
    """Auto-generate Data Version Control (DVC) config. A DVC
    project can only define one pipeline, so you need to specify one workflow
    defined in the project.yml. If no workflow is specified, the first defined
    workflow is used. The DVC config will only be updated if the project.yml
    changed.

    Thin CLI wrapper around project_update_dvc().

    DOCS: https://spacy.io/api/cli#project-dvc
    """
    project_update_dvc(project_dir, workflow, verbose=verbose, quiet=quiet, force=force)
def project_update_dvc(
    project_dir: Path,
    workflow: Optional[str] = None,
    *,
    verbose: bool = False,
    quiet: bool = False,
    force: bool = False,
) -> None:
    """Regenerate the DVC config for one workflow of the project and report
    whether the file was actually rewritten.

    project_dir (Path): The project directory.
    workflow (Optional[str]): Optional name of workflow defined in project.yml.
        If not set, the first workflow will be used.
    verbose (bool): Print more info.
    quiet (bool): Print less info.
    force (bool): Force update DVC config.
    """
    repro_hint = "To execute the workflow with DVC, run: dvc repro"
    was_updated = update_dvc_config(
        project_dir,
        load_project_config(project_dir),
        workflow,
        verbose=verbose,
        quiet=quiet,
        force=force,
    )
    if not was_updated:
        msg.info(f"No changes found in {PROJECT_FILE}, no update needed", repro_hint)
        return
    msg.good(f"Updated DVC config from {PROJECT_FILE}", repro_hint)
def update_dvc_config(
    path: Path,
    config: Dict[str, Any],
    workflow: Optional[str] = None,
    verbose: bool = False,
    quiet: bool = False,
    force: bool = False,
) -> bool:
    """Regenerate the dvc.yaml file in the project directory by running one
    "dvc run ... --no-exec" command per workflow step. The file is
    auto-generated based on the config. The first line of the auto-generated
    file specifies the hash of the config dict, so if any of the config values
    change, the DVC config is regenerated.

    path (Path): The path to the project directory.
    config (Dict[str, Any]): The loaded project.yml.
    workflow (Optional[str]): Name of the workflow to convert. Falls back to
        the first workflow in the config if not set.
    verbose (bool): Whether to print additional info (via DVC).
    quiet (bool): Don't output anything (via DVC).
    force (bool): Force update, even if hashes match.
    RETURNS (bool): Whether the DVC config file was updated.
    """
    ensure_dvc(path)
    workflows = config.get("workflows", {})
    workflow_names = list(workflows.keys())
    check_workflows(workflow_names, workflow)
    if not workflow:
        workflow = workflow_names[0]
    config_hash = get_hash(config)
    path = path.resolve()
    dvc_config_path = path / DVC_CONFIG
    if dvc_config_path.exists():
        # Check if the file was generated using the current config, if not, redo
        with dvc_config_path.open("r", encoding="utf8") as f:
            # The first line has the form "# <hash>" (see the write at the end).
            ref_hash = f.readline().strip().replace("# ", "")
        if ref_hash == config_hash and not force:
            return False  # Nothing has changed in project.yml, don't need to update
        # Outdated (or forced): delete the file before the commands below run.
        dvc_config_path.unlink()
    dvc_commands = []
    config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}

    # some flags that apply to every command
    flags = []
    if verbose:
        flags.append("--verbose")
    if quiet:
        flags.append("--quiet")

    for name in workflows[workflow]:
        command = config_commands[name]
        deps = command.get("deps", [])
        outputs = command.get("outputs", [])
        outputs_no_cache = command.get("outputs_no_cache", [])
        # Steps that declare no deps and no outputs are left out of the DVC config.
        if not deps and not outputs and not outputs_no_cache:
            continue
        # Default to the working dir as the project path since dvc.yaml is auto-generated
        # and we don't want arbitrary paths in there
        project_cmd = ["python", "-m", NAME, "project", "run", name]
        # Flatten [["-d", p1], ["-d", p2], ...] into ["-d", p1, "-d", p2, ...]
        deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
        outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
        outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]

        dvc_cmd = ["run", *flags, "-n", name, "-w", str(path), "--no-exec"]
        if command.get("no_skip"):
            dvc_cmd.append("--always-changed")
        full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
        dvc_commands.append(join_command(full_cmd))

    if not dvc_commands:
        # If we don't check for this, then there will be an error when reading the
        # config, since DVC wouldn't create it.
        msg.fail(
            "No usable commands for DVC found. This can happen if none of your "
            "commands have dependencies or outputs.",
            exits=1,
        )

    with working_dir(path):
        for c in dvc_commands:
            dvc_command = "dvc " + c
            run_command(dvc_command)
    # Prepend the config hash and the explanatory comment to the file written
    # by the commands above: read everything, rewind, then write header + content.
    with dvc_config_path.open("r+", encoding="utf8") as f:
        content = f.read()
        f.seek(0, 0)
        f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
    return True
def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
    """Check that the project defines workflows and that the requested one,
    if any, is among them. Problems are reported via msg.fail with exits=1;
    a missing selection only produces a warning naming the first workflow.

    workflows (List[str]): Names of the available workflows.
    workflow (Optional[str]): The name of the workflow to convert.
    """
    if not workflows:
        msg.fail(
            f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, "
            f"define at least one list of commands.",
            exits=1,
        )
    is_unknown = workflow is not None and workflow not in workflows
    if is_unknown:
        available = ", ".join(workflows)
        msg.fail(
            f"Workflow '{workflow}' not defined in {PROJECT_FILE}. "
            f"Available workflows: {available}",
            exits=1,
        )
    if not workflow:
        first_workflow = workflows[0]
        msg.warn(
            f"No workflow specified for DVC pipeline. Using the first workflow "
            f"defined in {PROJECT_FILE}: '{first_workflow}'"
        )
def ensure_dvc(project_dir: Path) -> None:
    """Verify that "dvc --version" can be launched and that the project
    directory contains a ".dvc" entry. Each failed check is reported via
    msg.fail with exits=1.

    project_dir (Path): The project directory to check.
    """
    try:
        # Only whether the command can be started matters; its output is discarded.
        subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
    except Exception:
        msg.fail(
            "To use spaCy projects with DVC (Data Version Control), DVC needs "
            "to be installed and the 'dvc' command needs to be available",
            "You can install the Python package from pip (pip install dvc) or "
            "conda (conda install -c conda-forge dvc). For more details, see the "
            "documentation: https://dvc.org/doc/install",
            exits=1,
        )
    dvc_marker = project_dir / ".dvc"
    if dvc_marker.exists():
        return
    msg.fail(
        "Project not initialized as a DVC project",
        "To initialize a DVC project, you can run 'dvc init' in the project "
        "directory. For more details, see the documentation: "
        "https://dvc.org/doc/command-reference/init",
        exits=1,
    )

View File

@ -1,67 +0,0 @@
from pathlib import Path
from wasabi import msg
from .remote_storage import RemoteStorage
from .remote_storage import get_command_hash
from .._util import project_cli, Arg, logger
from .._util import load_project_config
from .run import update_lockfile
@project_cli.command("pull")
def project_pull_cli(
    # fmt: off
    remote: str = Arg("default", help="Name or path of remote storage"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    # fmt: on
):
    """Retrieve available precomputed outputs from a remote storage.
    You can alias remotes in your project.yml by mapping them to storage paths.
    A storage can be anything that the smart-open library can upload to, e.g.
    AWS, Google Cloud Storage, SSH, local directories etc.

    DOCS: https://spacy.io/api/cli#project-pull
    """
    for url, output_path in project_pull(project_dir, remote):
        # Outputs for which nothing was pulled are yielded with url=None and
        # are not reported.
        if url is None:
            continue
        msg.good(f"Pulled {output_path} from {url}")
def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
    """Pull the outputs of the project's commands from a remote storage,
    yielding one (url, output_path) pair per output that was attempted. The
    url is whatever storage.pull() returned for that output.

    project_dir (Path): The project directory.
    remote (str): Name of a remote defined under "remotes" in the project
        config, or a storage location used as-is if no such name exists.
    verbose (bool): Not referenced in the body of this function.
    YIELDS (tuple): The (url, output_path) pairs.
    """
    # TODO: We don't have tests for this :(. It would take a bit of mockery to
    # set up. I guess see if it breaks first?
    config = load_project_config(project_dir)
    if remote in config.get("remotes", {}):
        remote = config["remotes"][remote]
    storage = RemoteStorage(project_dir, remote)
    commands = list(config.get("commands", []))
    # We use a while loop here because we don't know how the commands
    # will be ordered. A command might need dependencies from one that's later
    # in the list.
    while commands:
        # Iterate over a copy, since a command is popped from the list below.
        for i, cmd in enumerate(list(commands)):
            logger.debug("CMD: %s.", cmd["name"])
            deps = [project_dir / dep for dep in cmd.get("deps", [])]
            if all(dep.exists() for dep in deps):
                cmd_hash = get_command_hash("", "", deps, cmd["script"])
                for output_path in cmd.get("outputs", []):
                    url = storage.pull(output_path, command_hash=cmd_hash)
                    logger.debug(
                        "URL: %s for %s with command hash %s",
                        url,
                        output_path,
                        cmd_hash,
                    )
                    yield url, output_path

                # Only record the command in the lockfile once all of its
                # outputs are present locally.
                out_locs = [project_dir / out for out in cmd.get("outputs", [])]
                if all(loc.exists() for loc in out_locs):
                    update_lockfile(project_dir, cmd)
                # We remove the command from the list here, and break, so that
                # we iterate over the loop again.
                commands.pop(i)
                break
            else:
                logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"])
        else:
            # If we didn't break the for loop, break the while loop.
            break

View File

@ -1,69 +0,0 @@
from pathlib import Path
from wasabi import msg
from .remote_storage import RemoteStorage
from .remote_storage import get_content_hash, get_command_hash
from .._util import load_project_config
from .._util import project_cli, Arg, logger
@project_cli.command("push")
def project_push_cli(
    # fmt: off
    remote: str = Arg("default", help="Name or path of remote storage"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    # fmt: on
):
    """Persist outputs to a remote storage. You can alias remotes in your
    project.yml by mapping them to storage paths. A storage can be anything that
    the smart-open library can upload to, e.g. AWS, Google Cloud Storage, SSH,
    local directories etc.

    DOCS: https://spacy.io/api/cli#project-push
    """
    for output_path, url in project_push(project_dir, remote):
        if url is not None:
            msg.good(f"Pushed {output_path} to {url}")
        else:
            msg.info(f"Skipping {output_path}")
def project_push(project_dir: Path, remote: str):
    """Push the existing, non-empty outputs of every command whose
    dependencies are all present to a remote storage, yielding one
    (output_path, url) pair per pushed output.

    project_dir (Path): The project directory.
    remote (str): Name of a remote defined under "remotes" in the project
        config, or a storage location used as-is if no such name exists.
    YIELDS (tuple): The (output_path, url) pairs.
    """
    config = load_project_config(project_dir)
    remotes = config.get("remotes", {})
    if remote in remotes:
        remote = config["remotes"][remote]
    storage = RemoteStorage(project_dir, remote)
    for cmd in config.get("commands", []):
        logger.debug("CMD: %s", cmd["name"])
        dep_paths = [project_dir / dep for dep in cmd.get("deps", [])]
        if not all(dep.exists() for dep in dep_paths):
            logger.debug("Dependency missing. Skipping %s outputs", cmd["name"])
            continue
        # The command hash is computed from the dependency paths and the script.
        cmd_hash = get_command_hash("", "", dep_paths, cmd["script"])
        logger.debug("CMD_HASH: %s", cmd_hash)
        for output_path in cmd.get("outputs", []):
            output_loc = project_dir / output_path
            # Missing outputs and directories without any files are skipped.
            if not (output_loc.exists() and _is_not_empty_dir(output_loc)):
                continue
            url = storage.push(
                output_path,
                command_hash=cmd_hash,
                content_hash=get_content_hash(output_loc),
            )
            logger.debug(
                "URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash
            )
            yield output_path, url
def _is_not_empty_dir(loc: Path):
if not loc.is_dir():
return True
elif any(_is_not_empty_dir(child) for child in loc.iterdir()):
return True
else:
return False

View File

@ -1,205 +0,0 @@
from typing import Optional, List, Dict, TYPE_CHECKING
import os
import site
import hashlib
import urllib.parse
import tarfile
from pathlib import Path
from wasabi import msg
from .._util import get_hash, get_checksum, upload_file, download_file
from .._util import ensure_pathy, make_tempdir
from ...util import get_minor_version, ENV_VARS, check_bool_env_var
from ...git_info import GIT_VERSION
from ... import about
from ...errors import Errors
if TYPE_CHECKING:
from pathy import FluidPath # noqa: F401
class RemoteStorage:
"""Push and pull outputs to and from a remote file storage.
Remotes can be anything that `smart-open` can support: AWS, GCS, file system,
ssh, etc.
"""
def __init__(self, project_root: Path, url: str, *, compression="gz"):
    """Set up a storage for one project.

    project_root (Path): Project directory; pushed and pulled paths are
        joined onto it.
    url (str): Location of the remote storage, converted with ensure_pathy().
    compression (str): Suffix for the tarfile mode, e.g. "gz" gives "w:gz" and
        "r:gz"; a falsy value makes push/pull use plain "w"/"r".
    """
    self.root = project_root
    # NOTE(review): presumably yields a path-like object for local and remote
    # locations alike -- confirm against ensure_pathy.
    self.url = ensure_pathy(url)
    self.compression = compression
def push(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
    """Archive a file or directory of the project and upload it to the remote
    storage, unless an object already exists at the target URL.

    The target URL is built by make_url() from the project path and the two
    hashes. The path is packed into a tar archive (compressed according to
    self.compression) in a temporary directory before it is uploaded.

    path (Path): Path of the file or directory, relative to the project root.
    command_hash (str): Hash describing how the output was created.
    content_hash (str): Hash of the output's contents.
    RETURNS (FluidPath): The URL of the stored archive.
    RAISES (IOError): If the path does not exist in the project.
    """
    source = self.root / path
    if not source.exists():
        raise IOError(f"Cannot push {source}: does not exist.")
    remote_url = self.make_url(path, command_hash, content_hash)
    if not remote_url.exists():
        write_mode = "w" if not self.compression else f"w:{self.compression}"
        workdir: Path
        with make_tempdir() as workdir:
            archive = workdir / self.encode_name(str(path))
            # Store the entry under its project-relative name so that it can be
            # extracted relative to the project root again.
            with tarfile.open(archive, mode=write_mode) as tar_file:
                tar_file.add(str(source), arcname=str(path))
            upload_file(archive, remote_url)
    return remote_url
def pull(
self,
path: Path,
*,
command_hash: Optional[str] = None,
content_hash: Optional[str] = None,
) -> Optional["FluidPath"]:
"""Retrieve a file from the remote cache. If the file already exists,
nothing is done.
If the command_hash and/or content_hash are specified, only matching
results are returned. If no results are available, an error is raised.
"""
dest = self.root / path
if dest.exists():
return None
url = self.find(path, command_hash=command_hash, content_hash=content_hash)
if url is None:
return url
else:
# Make sure the destination exists
if not dest.parent.exists():
dest.parent.mkdir(parents=True)
tmp: Path
with make_tempdir() as tmp:
tar_loc = tmp / url.parts[-1]
download_file(url, tar_loc)
mode_string = f"r:{self.compression}" if self.compression else "r"
with tarfile.open(tar_loc, mode=mode_string) as tar_file:
# This requires that the path is added correctly, relative
# to root. This is how we set things up in push()
# Disallow paths outside the current directory for the tar
# file (CVE-2007-4559, directory traversal vulnerability)
def is_within_directory(directory, target):
abs_directory = os.path.abspath(directory)
abs_target = os.path.abspath(target)
prefix = os.path.commonprefix([abs_directory, abs_target])
return prefix == abs_directory
def safe_extract(tar, path):
for member in tar.getmembers():
member_path = os.path.join(path, member.name)
if not is_within_directory(path, member_path):
raise ValueError(Errors.E852)
tar.extractall(path)
safe_extract(tar_file, self.root)
return url
def find(
self,
path: Path,
*,
command_hash: Optional[str] = None,
content_hash: Optional[str] = None,
) -> Optional["FluidPath"]:
"""Find the best matching version of a file within the storage,
or `None` if no match can be found. If both the creation and content hash
are specified, only exact matches will be returned. Otherwise, the most
recent matching file is preferred.
"""
name = self.encode_name(str(path))
urls = []
if command_hash is not None and content_hash is not None:
url = self.url / name / command_hash / content_hash
urls = [url] if url.exists() else []
elif command_hash is not None:
if (self.url / name / command_hash).exists():
urls = list((self.url / name / command_hash).iterdir())
else:
if (self.url / name).exists():
for sub_dir in (self.url / name).iterdir():
urls.extend(sub_dir.iterdir())
if content_hash is not None:
urls = [url for url in urls if url.parts[-1] == content_hash]
if len(urls) >= 2:
try:
urls.sort(key=lambda x: x.stat().last_modified) # type: ignore
except Exception:
msg.warn(
"Unable to sort remote files by last modified. The file(s) "
"pulled from the cache may not be the most recent."
)
return urls[-1] if urls else None
def make_url(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
"""Construct a URL from a subpath, a creation hash and a content hash."""
return self.url / self.encode_name(str(path)) / command_hash / content_hash
def encode_name(self, name: str) -> str:
"""Encode a subpath into a URL-safe name."""
return urllib.parse.quote_plus(name)
def get_content_hash(loc: Path) -> str:
    """Return the checksum that `get_checksum` computes for `loc`."""
    content_hash = get_checksum(loc)
    return content_hash
def get_command_hash(
    site_hash: str, env_hash: str, deps: List[Path], cmd: List[str]
) -> str:
    """Build an MD5 digest that identifies one execution of a command.

    The digest covers, in this order: the spaCy version (the git version if
    the PROJECT_USE_GIT_VERSION env var is enabled, otherwise the minor
    version), the site hash, the env hash, the checksums of the sorted
    dependencies, and the command strings themselves.

    site_hash (str): Hash of the installed packages.
    env_hash (str): Hash of the relevant environment variables.
    deps (List[Path]): Dependency paths; sorted before being checksummed.
    cmd (List[str]): The command strings.
    RETURNS (str): Hex digest of the concatenated parts.
    """
    use_git_version = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
    if use_git_version:
        version_part = GIT_VERSION
    else:
        version_part = str(get_minor_version(about.__version__) or "")
    parts = [version_part, site_hash, env_hash]
    parts += [get_checksum(dep) for dep in sorted(deps)]
    parts += cmd
    # The parts are joined without a separator before hashing.
    return hashlib.md5("".join(parts).encode("utf8")).hexdigest()
def get_site_hash() -> str:
    """Hash the current Python environment's site-packages contents, including
    the name and version of the libraries. The list we're hashing is what
    `pip freeze` would output.

    RETURNS (str): MD5 hex digest over the sorted names of all `*dist-info`
        entries (with the ".dist-info" suffix removed) found in the site
        directories.
    """
    # Copy so the list returned by the site module is never mutated.
    site_dirs = list(site.getsitepackages())
    if site.ENABLE_USER_SITE:
        # getusersitepackages() returns a single path string, so it has to
        # be appended; extend() would add one entry per character.
        site_dirs.append(site.getusersitepackages())
    packages = set()
    for site_dir in site_dirs:
        site_dir = Path(site_dir)
        # A configured site directory (notably the user site) is not
        # guaranteed to exist on disk.
        if not site_dir.is_dir():
            continue
        for subpath in site_dir.iterdir():
            if subpath.parts[-1].endswith("dist-info"):
                packages.add(subpath.parts[-1].replace(".dist-info", ""))
    package_bytes = "".join(sorted(packages)).encode("utf8")
    return hashlib.md5(package_bytes).hexdigest()
def get_env_hash(env: Dict[str, str]) -> str:
    """Hash the environment variables that will be handed to the commands.

    A value beginning with "$" is treated as a reference to the current
    process environment: "$ENV_VAR" is replaced by os.environ["ENV_VAR"],
    or by an empty string when that variable is not set. Any other value
    is used as given.

    env (Dict[str, str]): Variable names mapped to literal values or
        "$"-prefixed references.
    RETURNS (str): The hash `get_hash` computes for the resolved mapping.
    """
    resolved = {
        name: os.environ.get(raw[1:], "") if raw.startswith("$") else raw
        for name, raw in env.items()
    }
    return get_hash(resolved)

View File

@ -1,360 +0,0 @@
from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple
import os.path
from pathlib import Path
from wasabi import msg
from wasabi.util import locale_escape
import sys
import srsly
import typer
from ... import about
from ...git_info import GIT_VERSION
from ...util import working_dir, run_command, split_command, is_cwd, join_command
from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS
from ...util import check_bool_env_var, SimpleFrozenDict
from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
from .._util import get_checksum, project_cli, Arg, Opt, COMMAND, parse_config_overrides
@project_cli.command(
    "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
)
def project_run_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
    dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute scripts"),
    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
    # fmt: on
):
    """Run a named command or workflow defined in the project.yml. If a workflow
    name is specified, all commands in the workflow are run, in order. If
    commands define dependencies and/or outputs, they will only be re-run if
    state has changed.

    DOCS: https://spacy.io/api/cli#project-run
    """
    if subcommand and not show_help:
        # Extra CLI arguments are interpreted as project config overrides.
        config_overrides = parse_config_overrides(ctx.args)
        project_run(
            project_dir, subcommand, overrides=config_overrides, force=force, dry=dry
        )
        return
    # Help was requested explicitly, or there is no subcommand to run.
    print_run_help(project_dir, subcommand)
def project_run(
    project_dir: Path,
    subcommand: str,
    *,
    overrides: Dict[str, Any] = SimpleFrozenDict(),
    force: bool = False,
    dry: bool = False,
    capture: bool = False,
    skip_requirements_check: bool = False,
) -> None:
    """Run a named command or workflow defined in the project config.

    A workflow is executed by running each of its commands in order through
    this same function. A single command is skipped when `check_rerun` reports
    that nothing changed (unless `force` is set); after it has actually run,
    the lockfile is updated.

    project_dir (Path): Path to project directory.
    subcommand (str): Name of command or workflow to run.
    overrides (Dict[str, Any]): Optional config overrides.
    force (bool): Force re-running, even if nothing changed.
    dry (bool): Perform a dry run and don't execute commands.
    capture (bool): Whether to capture the output and errors of individual commands.
        If False, the stdout and stderr will not be redirected, and if there's an error,
        sys.exit will be called with the return code. You should use capture=False
        when you want to turn over execution to the command, and capture=True
        when you want to run the command more like a function.
    skip_requirements_check (bool): Whether to skip the requirements check.
    """
    config = load_project_config(project_dir, overrides=overrides)
    commands = {}
    for command in config.get("commands", []):
        commands[command["name"]] = command
    workflows = config.get("workflows", {})
    validate_subcommand(list(commands), list(workflows.keys()), subcommand)

    # Optionally verify the project's requirements.txt against the current
    # environment. Commands launched from a workflow skip this, because the
    # workflow call itself has already passed this point.
    req_path = project_dir / "requirements.txt"
    should_check_requirements = (
        not skip_requirements_check
        and config.get("check_requirements", True)
        and os.path.exists(req_path)
    )
    if should_check_requirements:
        with req_path.open() as requirements_file:
            _check_requirements([line.strip() for line in requirements_file])

    if subcommand in workflows:
        msg.info(f"Running workflow '{subcommand}'")
        for step in workflows[subcommand]:
            project_run(
                project_dir,
                step,
                overrides=overrides,
                force=force,
                dry=dry,
                capture=capture,
                skip_requirements_check=True,
            )
        return

    cmd = commands[subcommand]
    # In a dry run a missing dependency is reported without exiting.
    fail_exit_code = None if dry else 1
    for dep in cmd.get("deps", []):
        if (project_dir / dep).exists():
            continue
        err = f"Missing dependency specified by command '{subcommand}': {dep}"
        err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
        msg.fail(err, err_help, exits=fail_exit_code)
    check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
    with working_dir(project_dir) as current_dir:
        msg.divider(subcommand)
        rerun = check_rerun(current_dir, cmd, check_spacy_commit=check_spacy_commit)
        if rerun or force:
            run_commands(cmd["script"], dry=dry, capture=capture)
            if not dry:
                update_lockfile(current_dir, cmd)
        else:
            msg.info(f"Skipping '{cmd['name']}': nothing changed")
def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
    """Print CLI-style help assembled from the project config.

    project_dir (Path): The project directory.
    subcommand (Optional[str]): The subcommand or None. With a subcommand,
        the help for that command or workflow is printed. Without one, the
        project title (if set) and tables of the available commands and
        workflows are printed.
    """
    config = load_project_config(project_dir)
    config_commands = config.get("commands", [])
    commands = {}
    for command in config_commands:
        commands[command["name"]] = command
    workflows = config.get("workflows", {})
    # The project location is only shown in usage lines when it is not the
    # current working directory.
    if is_cwd(project_dir):
        project_loc = ""
    else:
        project_loc = project_dir

    if not subcommand:
        # Top-level help: title, then commands, then workflows.
        print("")
        title = config.get("title")
        if title:
            print(f"{locale_escape(title)}\n")
        if config_commands:
            print(f"Available commands in {PROJECT_FILE}")
            print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}")
            command_rows = [(cmd["name"], cmd.get("help", "")) for cmd in config_commands]
            msg.table(command_rows)
        if workflows:
            print(f"Available workflows in {PROJECT_FILE}")
            print(f"Usage: {COMMAND} project run [WORKFLOW] {project_loc}")
            workflow_rows = [
                (name, " -> ".join(steps)) for name, steps in workflows.items()
            ]
            msg.table(workflow_rows)
        return

    validate_subcommand(list(commands), list(workflows.keys()), subcommand)
    print(f"Usage: {COMMAND} project run {subcommand} {project_loc}")
    if subcommand in commands:
        help_text = commands[subcommand].get("help")
        if help_text:
            print(f"\n{help_text}\n")
    elif subcommand in workflows:
        steps = workflows[subcommand]
        print(f"\nWorkflow consisting of {len(steps)} commands:")
        # One numbered row per step, with that command's help text.
        steps_data = []
        for number, step in enumerate(steps, start=1):
            steps_data.append((f"{number}. {step}", commands[step].get("help", "")))
        msg.table(steps_data)
        help_cmd = f"{COMMAND} project run [COMMAND] {project_loc} --help"
        print(f"For command details, run: {help_cmd}")
def run_commands(
    commands: Iterable[str] = SimpleFrozenList(),
    silent: bool = False,
    dry: bool = False,
    capture: bool = False,
) -> None:
    """Run a sequence of commands in a subprocess, in order.

    commands (List[str]): The string commands.
    silent (bool): Don't print the commands.
    dry (bool): Perform a dry run and don't execute anything.
    capture (bool): Whether to capture the output and errors of individual commands.
        If False, the stdout and stderr will not be redirected, and if there's an error,
        sys.exit will be called with the return code. You should use capture=False
        when you want to turn over execution to the command, and capture=True
        when you want to run the command more like a function.
    """
    for raw_command in commands:
        argv = split_command(raw_command)
        program = argv[0] if len(argv) else None
        # Commands in a project config often say "python"/"pip" (or the "3"
        # variants). Point them at the interpreter running this process, so
        # the same Python and the pip of the same environment are used no
        # matter how those names resolve on the user's system.
        if program in ("python", "python3"):
            argv[0] = sys.executable
        elif program in ("pip", "pip3"):
            argv = [sys.executable, "-m", "pip"] + list(argv[1:])
        if not silent:
            print(f"Running command: {join_command(argv)}")
        if dry:
            continue
        run_command(argv, capture=capture)
def validate_subcommand(
    commands: Sequence[str], workflows: Sequence[str], subcommand: str
) -> None:
    """Check that a subcommand names a defined command or workflow. Reports a
    failure with exit code 1 via `msg.fail` otherwise.

    commands (Sequence[str]): The available commands.
    workflows (Sequence[str]): The available workflows.
    subcommand (str): The subcommand.
    """
    if not (commands or workflows):
        msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1)
    if subcommand in commands or subcommand in workflows:
        return
    hints = []
    # "assets" is a separate CLI command that is easily mistaken for a
    # project command, so point the user to it.
    if subcommand in ("assets", "asset"):
        hints.append("Did you mean to run: python -m spacy project assets?")
    if commands:
        hints.append(f"Available commands: {', '.join(commands)}")
    if workflows:
        hints.append(f"Available workflows: {', '.join(workflows)}")
    msg.fail(
        f"Can't find command or workflow '{subcommand}' in {PROJECT_FILE}",
        ". ".join(hints),
        exits=1,
    )
def check_rerun(
    project_dir: Path,
    command: Dict[str, Any],
    *,
    check_spacy_version: bool = True,
    check_spacy_commit: bool = False,
) -> bool:
    """Decide whether a command has to be run again because its settings or
    inputs/outputs changed.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    check_spacy_version (bool): Re-run if the spaCy version recorded in the
        lockfile is not a minor-version match for the current one.
    check_spacy_commit (bool): Re-run if the spaCy git version recorded in
        the lockfile differs from the current one.
    RETURNS (bool): Whether to re-run the command.
    """
    # Commands flagged with no_skip are always re-run
    if command.get("no_skip", False):
        return True
    lock_path = project_dir / PROJECT_LOCK
    # Without a lockfile there is nothing to compare against
    if not lock_path.exists():
        return True
    locked = srsly.read_yaml(lock_path)
    name = command["name"]
    # The lockfile has no record of this command
    if name not in locked:
        return True
    entry = locked[name]
    # Always run commands with no outputs (otherwise they'd always be skipped)
    if not entry.get("outs", []):
        return True
    # Always rerun if spaCy version or commit hash changed
    locked_version = entry.get("spacy_version")
    locked_commit = entry.get("spacy_git_version")
    if check_spacy_version:
        if not is_minor_version_match(locked_version, about.__version__):
            info = f"({locked_version} in {PROJECT_LOCK}, {about.__version__} current)"
            msg.info(f"Re-running '{name}': spaCy minor version changed {info}")
            return True
    if check_spacy_commit:
        if locked_commit != GIT_VERSION:
            info = f"({locked_commit} in {PROJECT_LOCK}, {GIT_VERSION} current)"
            msg.info(f"Re-running '{name}': spaCy commit changed {info}")
            return True
    # Compare the stored entry with the entry the current command would
    # produce, ignoring the version fields handled above. Equal hashes mean
    # that inputs/outputs, file hashes and scripts are all unchanged.
    ignored = ["spacy_version", "spacy_git_version"]
    current_hash = get_hash(get_lock_entry(project_dir, command), exclude=ignored)
    locked_hash = get_hash(entry, exclude=ignored)
    return current_hash != locked_hash
def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None:
    """Record a command in the lockfile after it has run.

    If no lockfile exists yet, an empty one is created first. The entry for
    the command (its script and dependencies/outputs) is then set and the
    lockfile is written back.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    """
    lock_path = project_dir / PROJECT_LOCK
    if lock_path.exists():
        data = srsly.read_yaml(lock_path)
    else:
        srsly.write_yaml(lock_path, {})
        data = {}
    entry = get_lock_entry(project_dir, command)
    data[command["name"]] = entry
    srsly.write_yaml(lock_path, data)
def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]:
    """Build the lockfile entry for a given command.

    The entry holds the command line, the script (command steps), the file
    info of the dependencies, the file info of the outputs ("outputs"
    followed by "outputs_no_cache") and the current spaCy version and git
    version. The format is based on the dvc.lock files, to keep things
    consistent.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    RETURNS (Dict[str, Any]): The lockfile entry.
    """
    deps, outs, outs_nc = [
        get_fileinfo(project_dir, command.get(key, []))
        for key in ("deps", "outputs", "outputs_no_cache")
    ]
    entry: Dict[str, Any] = {}
    entry["cmd"] = f"{COMMAND} run {command['name']}"
    entry["script"] = command["script"]
    entry["deps"] = deps
    entry["outs"] = list(outs) + list(outs_nc)
    entry["spacy_version"] = about.__version__
    entry["spacy_git_version"] = GIT_VERSION
    return entry
def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, Optional[str]]]:
    """Collect path and checksum for a list of paths (dependencies, outputs).

    project_dir (Path): The current project directory.
    paths (List[str]): The file paths, relative to the project directory.
    RETURNS (List[Dict[str, Optional[str]]]): One dict per path with the keys
        "path" and "md5". The "md5" value is None when the path does not
        exist.
    """

    def describe(path: str) -> Dict[str, Optional[str]]:
        file_path = project_dir / path
        # Only existing paths are checksummed.
        md5 = get_checksum(file_path) if file_path.exists() else None
        return {"path": path, "md5": md5}

    return [describe(path) for path in paths]
def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
    """Check whether requirements are installed and free of version conflicts.

    Problems are only reported as warnings; nothing is raised for a missing
    or conflicting package.

    requirements (List[str]): List of requirements.
    RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be found,
        (2) any packages with version conflicts exist.
    """
    import pkg_resources

    missing_reports: List[str] = []
    conflict_reports: List[str] = []

    for requirement in requirements:
        try:
            pkg_resources.require(requirement)
        except pkg_resources.DistributionNotFound as not_found:
            missing_reports.append(not_found.report())
        except pkg_resources.VersionConflict as conflict:
            conflict_reports.append(conflict.report())
        except Exception:
            # Anything else means the line could not be checked at all.
            msg.warn(
                f"Unable to check requirement: {requirement} "
                "Checks are currently limited to requirement specifiers "
                "(PEP 508)"
            )

    all_reports = missing_reports + conflict_reports
    if all_reports:
        msg.warn(
            title="Missing requirements or requirement conflicts detected. Make sure your Python environment is set up "
            "correctly and you installed all requirements specified in your project's requirements.txt: "
        )
        for report in all_reports:
            msg.text(report)

    return bool(missing_reports), bool(conflict_reports)

View File

@ -3,7 +3,7 @@ the docs and the init config command. It encodes various best practices and
can help generate the best possible configuration, given a user's requirements. #} can help generate the best possible configuration, given a user's requirements. #}
{%- set use_transformer = hardware != "cpu" and transformer_data -%} {%- set use_transformer = hardware != "cpu" and transformer_data -%}
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%} {%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%} {%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "span_finder", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
[paths] [paths]
train = null train = null
dev = null dev = null
@ -28,7 +28,7 @@ lang = "{{ lang }}"
tok2vec/transformer. #} tok2vec/transformer. #}
{%- set with_accuracy_or_transformer = (use_transformer or with_accuracy) -%} {%- set with_accuracy_or_transformer = (use_transformer or with_accuracy) -%}
{%- set textcat_needs_features = has_textcat and with_accuracy_or_transformer -%} {%- set textcat_needs_features = has_textcat and with_accuracy_or_transformer -%}
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%} {%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "span_finder" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%} {%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%}
{%- else -%} {%- else -%}
{%- set full_pipeline = components -%} {%- set full_pipeline = components -%}
@ -127,6 +127,30 @@ grad_factor = 1.0
@layers = "reduce_mean.v1" @layers = "reduce_mean.v1"
{% endif -%} {% endif -%}
{% if "span_finder" in components -%}
[components.span_finder]
factory = "span_finder"
max_length = 25
min_length = null
scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
spans_key = "sc"
threshold = 0.5
[components.span_finder.model]
@architectures = "spacy.SpanFinder.v1"
[components.span_finder.model.scorer]
@layers = "spacy.LinearLogistic.v1"
nO = 2
[components.span_finder.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.span_finder.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
{% endif -%}
{% if "spancat" in components -%} {% if "spancat" in components -%}
[components.spancat] [components.spancat]
factory = "spancat" factory = "spancat"
@ -392,6 +416,27 @@ nO = null
width = ${components.tok2vec.model.encode.width} width = ${components.tok2vec.model.encode.width}
{% endif %} {% endif %}
{% if "span_finder" in components %}
[components.span_finder]
factory = "span_finder"
max_length = 25
min_length = null
scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
spans_key = "sc"
threshold = 0.5
[components.span_finder.model]
@architectures = "spacy.SpanFinder.v1"
[components.span_finder.model.scorer]
@layers = "spacy.LinearLogistic.v1"
nO = 2
[components.span_finder.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
{% endif %}
{% if "spancat" in components %} {% if "spancat" in components %}
[components.spancat] [components.spancat]
factory = "spancat" factory = "spancat"

View File

@ -1,15 +1,23 @@
from typing import Optional, Dict, Any, Union
from pathlib import Path
from wasabi import msg
import typer
import logging import logging
import sys import sys
from pathlib import Path
from typing import Any, Dict, Optional, Union
import typer
from wasabi import msg
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code, setup_gpu
from ..training.loop import train as train_nlp
from ..training.initialize import init_nlp
from .. import util from .. import util
from ..training.initialize import init_nlp
from ..training.loop import train as train_nlp
from ._util import (
Arg,
Opt,
app,
import_code,
parse_config_overrides,
setup_gpu,
show_validation_error,
)
@app.command( @app.command(

View File

@ -1,14 +1,21 @@
from typing import Tuple
from pathlib import Path
import sys import sys
import requests
from wasabi import msg, Printer
import warnings import warnings
from pathlib import Path
from typing import Tuple
import requests
from wasabi import Printer, msg
from ._util import app
from .. import about from .. import about
from ..util import get_package_version, get_installed_models, get_minor_version from ..util import (
from ..util import get_package_path, get_model_meta, is_compatible_version get_installed_models,
get_minor_version,
get_model_meta,
get_package_path,
get_package_version,
is_compatible_version,
)
from ._util import app
@app.command("validate") @app.command("validate")

View File

@ -1,5 +1,6 @@
"""Helpers for Python and platform compatibility.""" """Helpers for Python and platform compatibility."""
import sys import sys
from thinc.util import copy_array from thinc.util import copy_array
try: try:

View File

@ -26,6 +26,9 @@ batch_size = 1000
[nlp.tokenizer] [nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1" @tokenizers = "spacy.Tokenizer.v1"
[nlp.vectors]
@vectors = "spacy.Vectors.v1"
# The pipeline components and their models # The pipeline components and their models
[components] [components]

View File

@ -4,15 +4,13 @@ spaCy's built in visualization suite for dependencies and named entities.
DOCS: https://spacy.io/api/top-level#displacy DOCS: https://spacy.io/api/top-level#displacy
USAGE: https://spacy.io/usage/visualizers USAGE: https://spacy.io/usage/visualizers
""" """
from typing import Union, Iterable, Optional, Dict, Any, Callable
import warnings import warnings
from typing import Any, Callable, Dict, Iterable, Optional, Union
from .render import DependencyRenderer, EntityRenderer, SpanRenderer
from ..tokens import Doc, Span
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
from ..util import is_in_jupyter from ..tokens import Doc, Span
from ..util import find_available_port from ..util import find_available_port, is_in_jupyter
from .render import DependencyRenderer, EntityRenderer, SpanRenderer
_html = {} _html = {}
RENDER_WRAPPER = None RENDER_WRAPPER = None
@ -68,7 +66,7 @@ def render(
if jupyter or (jupyter is None and is_in_jupyter()): if jupyter or (jupyter is None and is_in_jupyter()):
# return HTML rendered by IPython display() # return HTML rendered by IPython display()
# See #4840 for details on span wrapper to disable mathjax # See #4840 for details on span wrapper to disable mathjax
from IPython.core.display import display, HTML from IPython.core.display import HTML, display
return display(HTML('<span class="tex2jax_ignore">{}</span>'.format(html))) return display(HTML('<span class="tex2jax_ignore">{}</span>'.format(html)))
return html return html

View File

@ -1,15 +1,28 @@
from typing import Any, Dict, List, Optional, Tuple, Union
import uuid import uuid
import itertools from typing import Any, Dict, List, Optional, Tuple, Union
from ..errors import Errors from ..errors import Errors
from ..util import escape_html, minify_html, registry from ..util import escape_html, minify_html, registry
from .templates import TPL_DEP_ARCS, TPL_DEP_SVG, TPL_DEP_WORDS from .templates import (
from .templates import TPL_DEP_WORDS_LEMMA, TPL_ENT, TPL_ENT_RTL, TPL_ENTS TPL_DEP_ARCS,
from .templates import TPL_FIGURE, TPL_KB_LINK, TPL_PAGE, TPL_SPAN TPL_DEP_SVG,
from .templates import TPL_SPAN_RTL, TPL_SPAN_SLICE, TPL_SPAN_SLICE_RTL TPL_DEP_WORDS,
from .templates import TPL_SPAN_START, TPL_SPAN_START_RTL, TPL_SPANS TPL_DEP_WORDS_LEMMA,
from .templates import TPL_TITLE TPL_ENT,
TPL_ENT_RTL,
TPL_ENTS,
TPL_FIGURE,
TPL_KB_LINK,
TPL_PAGE,
TPL_SPAN,
TPL_SPAN_RTL,
TPL_SPAN_SLICE,
TPL_SPAN_SLICE_RTL,
TPL_SPAN_START,
TPL_SPAN_START_RTL,
TPL_SPANS,
TPL_TITLE,
)
DEFAULT_LANG = "en" DEFAULT_LANG = "en"
DEFAULT_DIR = "ltr" DEFAULT_DIR = "ltr"
@ -204,7 +217,7 @@ class SpanRenderer:
+ (self.offset_step * (len(entities) - 1)) + (self.offset_step * (len(entities) - 1))
) )
markup += self.span_template.format( markup += self.span_template.format(
text=token["text"], text=escape_html(token["text"]),
span_slices=slices, span_slices=slices,
span_starts=starts, span_starts=starts,
total_height=total_height, total_height=total_height,

View File

@ -1,4 +1,5 @@
import warnings import warnings
from .compat import Literal from .compat import Literal
@ -215,6 +216,9 @@ class Warnings(metaclass=ErrorsWithCodes):
W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option " W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
"`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.") "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.") W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
W125 = ("The StaticVectors key_attr is no longer used. To set a custom "
"key attribute for vectors, configure it through Vectors(attr=) or "
"'spacy init vectors --attr'")
class Errors(metaclass=ErrorsWithCodes): class Errors(metaclass=ErrorsWithCodes):
@ -549,12 +553,12 @@ class Errors(metaclass=ErrorsWithCodes):
"during training, make sure to include it in 'annotating components'") "during training, make sure to include it in 'annotating components'")
# New errors added in v3.x # New errors added in v3.x
E849 = ("The vocab only supports {method} for vectors of type "
"spacy.vectors.Vectors, not {vectors_type}.")
E850 = ("The PretrainVectors objective currently only supports default or " E850 = ("The PretrainVectors objective currently only supports default or "
"floret vectors, not {mode} vectors.") "floret vectors, not {mode} vectors.")
E851 = ("The 'textcat' component labels should only have values of 0 or 1, " E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
"but found value of '{val}'.") "but found value of '{val}'.")
E852 = ("The tar file pulled from the remote attempted an unsafe path "
"traversal.")
E853 = ("Unsupported component factory name '{name}'. The character '.' is " E853 = ("Unsupported component factory name '{name}'. The character '.' is "
"not permitted in factory names.") "not permitted in factory names.")
E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not " E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not "
@ -738,8 +742,8 @@ class Errors(metaclass=ErrorsWithCodes):
"model from a shortcut, which is obsolete as of spaCy v3.0. To " "model from a shortcut, which is obsolete as of spaCy v3.0. To "
"load the model, use its full name instead:\n\n" "load the model, use its full name instead:\n\n"
"nlp = spacy.load(\"{full}\")\n\nFor more details on the available " "nlp = spacy.load(\"{full}\")\n\nFor more details on the available "
"models, see the models directory: https://spacy.io/models. If you " "models, see the models directory: https://spacy.io/models and if "
"want to create a blank model, use spacy.blank: " "you want to create a blank model, use spacy.blank: "
"nlp = spacy.blank(\"{name}\")") "nlp = spacy.blank(\"{name}\")")
E942 = ("Executing `after_{name}` callback failed. Expected the function to " E942 = ("Executing `after_{name}` callback failed. Expected the function to "
"return an initialized nlp object but got: {value}. Maybe " "return an initialized nlp object but got: {value}. Maybe "
@ -970,6 +974,15 @@ class Errors(metaclass=ErrorsWithCodes):
E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` " E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
"or use `auto_select_port=True` to pick an available port automatically.") "or use `auto_select_port=True` to pick an available port automatically.")
E1051 = ("'allow_overlap' can only be False when max_positive is 1, but found 'max_positive': {max_positive}.") E1051 = ("'allow_overlap' can only be False when max_positive is 1, but found 'max_positive': {max_positive}.")
E1052 = ("Unable to copy spans: the character offsets for the span at "
"index {i} in the span group do not align with the tokenization "
"in the target doc.")
E1053 = ("Both 'min_length' and 'max_length' should be larger than 0, but found"
" 'min_length': {min_length}, 'max_length': {max_length}")
E1054 = ("The text, including whitespace, must match between reference and "
"predicted docs when training {component}.")
E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
"but only callbacks with one or three parameters are supported")
# Deprecated model shortcuts, only used in errors and warnings # Deprecated model shortcuts, only used in errors and warnings

View File

@ -1,4 +1,5 @@
import warnings import warnings
from .errors import Warnings from .errors import Warnings

View File

@ -1,3 +1,3 @@
from .candidate import Candidate, get_candidates, get_candidates_batch
from .kb import KnowledgeBase from .kb import KnowledgeBase
from .kb_in_memory import InMemoryLookupKB from .kb_in_memory import InMemoryLookupKB
from .candidate import Candidate, get_candidates, get_candidates_batch

View File

@ -1,8 +1,11 @@
from .kb cimport KnowledgeBase
from libcpp.vector cimport vector from libcpp.vector cimport vector
from ..typedefs cimport hash_t
# Object used by the Entity Linker that summarizes one entity-alias candidate combination. from ..typedefs cimport hash_t
from .kb cimport KnowledgeBase
# Object used by the Entity Linker that summarizes one entity-alias candidate
# combination.
cdef class Candidate: cdef class Candidate:
cdef readonly KnowledgeBase kb cdef readonly KnowledgeBase kb
cdef hash_t entity_hash cdef hash_t entity_hash

View File

@ -1,19 +1,31 @@
# cython: infer_types=True, profile=True # cython: infer_types=True, profile=True
from typing import Iterable from typing import Iterable
from .kb cimport KnowledgeBase from .kb cimport KnowledgeBase
from ..tokens import Span from ..tokens import Span
cdef class Candidate: cdef class Candidate:
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved """A `Candidate` object refers to a textual mention (`alias`) that may or
to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking may not be resolved to a specific `entity` from a Knowledge Base. This
algorithm which will disambiguate the various candidates to the correct one. will be used as input for the entity linking algorithm which will
disambiguate the various candidates to the correct one.
Each candidate (alias, entity) pair is assigned a certain prior probability. Each candidate (alias, entity) pair is assigned a certain prior probability.
DOCS: https://spacy.io/api/kb/#candidate-init DOCS: https://spacy.io/api/kb/#candidate-init
""" """
def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob): def __init__(
self,
KnowledgeBase kb,
entity_hash,
entity_freq,
entity_vector,
alias_hash,
prior_prob
):
self.kb = kb self.kb = kb
self.entity_hash = entity_hash self.entity_hash = entity_hash
self.entity_freq = entity_freq self.entity_freq = entity_freq
@ -56,7 +68,8 @@ cdef class Candidate:
def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]: def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
""" """
Return candidate entities for a given mention and fetching appropriate entries from the index. Return candidate entities for a given mention and fetching appropriate
entries from the index.
kb (KnowledgeBase): Knowledge base to query. kb (KnowledgeBase): Knowledge base to query.
mention (Span): Entity mention for which to identify candidates. mention (Span): Entity mention for which to identify candidates.
RETURNS (Iterable[Candidate]): Identified candidates. RETURNS (Iterable[Candidate]): Identified candidates.
@ -64,9 +77,12 @@ def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
return kb.get_candidates(mention) return kb.get_candidates(mention)
def get_candidates_batch(kb: KnowledgeBase, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]: def get_candidates_batch(
kb: KnowledgeBase, mentions: Iterable[Span]
) -> Iterable[Iterable[Candidate]]:
""" """
Return candidate entities for the given mentions and fetching appropriate entries from the index. Return candidate entities for the given mentions and fetching appropriate entries
from the index.
kb (KnowledgeBase): Knowledge base to query. kb (KnowledgeBase): Knowledge base to query.
mentions (Iterable[Span]): Entity mentions for which to identify candidates. mentions (Iterable[Span]): Entity mentions for which to identify candidates.
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates. RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.

View File

@ -2,8 +2,10 @@
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from libc.stdint cimport int64_t from libc.stdint cimport int64_t
from ..vocab cimport Vocab from ..vocab cimport Vocab
cdef class KnowledgeBase: cdef class KnowledgeBase:
cdef Pool mem cdef Pool mem
cdef readonly Vocab vocab cdef readonly Vocab vocab

View File

@ -2,17 +2,19 @@
from pathlib import Path from pathlib import Path
from typing import Iterable, Tuple, Union from typing import Iterable, Tuple, Union
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from .candidate import Candidate from ..errors import Errors
from ..tokens import Span from ..tokens import Span
from ..util import SimpleFrozenList from ..util import SimpleFrozenList
from ..errors import Errors from .candidate import Candidate
cdef class KnowledgeBase: cdef class KnowledgeBase:
"""A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases, """A `KnowledgeBase` instance stores unique identifiers for entities and
to support entity linking of named entities to real-world concepts. their textual aliases, to support entity linking of named entities to
real-world concepts.
This is an abstract class and requires its operations to be implemented. This is an abstract class and requires its operations to be implemented.
DOCS: https://spacy.io/api/kb DOCS: https://spacy.io/api/kb
@ -30,10 +32,13 @@ cdef class KnowledgeBase:
self.entity_vector_length = entity_vector_length self.entity_vector_length = entity_vector_length
self.mem = Pool() self.mem = Pool()
def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]: def get_candidates_batch(
self, mentions: Iterable[Span]
) -> Iterable[Iterable[Candidate]]:
""" """
Return candidate entities for specified texts. Each candidate defines the entity, the original alias, Return candidate entities for specified texts. Each candidate defines
and the prior probability of that alias resolving to that entity. the entity, the original alias, and the prior probability of that
alias resolving to that entity.
If no candidate is found for a given text, an empty list is returned. If no candidate is found for a given text, an empty list is returned.
mentions (Iterable[Span]): Mentions for which to get candidates. mentions (Iterable[Span]): Mentions for which to get candidates.
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates. RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
@ -42,14 +47,17 @@ cdef class KnowledgeBase:
def get_candidates(self, mention: Span) -> Iterable[Candidate]: def get_candidates(self, mention: Span) -> Iterable[Candidate]:
""" """
Return candidate entities for specified text. Each candidate defines the entity, the original alias, Return candidate entities for specified text. Each candidate defines
the entity, the original alias,
and the prior probability of that alias resolving to that entity. and the prior probability of that alias resolving to that entity.
If no candidate is found for a given text, an empty list is returned. If no candidate is found for a given text, an empty list is returned.
mention (Span): Mention for which to get candidates. mention (Span): Mention for which to get candidates.
RETURNS (Iterable[Candidate]): Identified candidates. RETURNS (Iterable[Candidate]): Identified candidates.
""" """
raise NotImplementedError( raise NotImplementedError(
Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__) Errors.E1045.format(
parent="KnowledgeBase", method="get_candidates", name=self.__name__
)
) )
def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]: def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]:
@ -67,7 +75,9 @@ cdef class KnowledgeBase:
RETURNS (Iterable[float]): Vector for specified entity. RETURNS (Iterable[float]): Vector for specified entity.
""" """
raise NotImplementedError( raise NotImplementedError(
Errors.E1045.format(parent="KnowledgeBase", method="get_vector", name=self.__name__) Errors.E1045.format(
parent="KnowledgeBase", method="get_vector", name=self.__name__
)
) )
def to_bytes(self, **kwargs) -> bytes: def to_bytes(self, **kwargs) -> bytes:
@ -75,7 +85,9 @@ cdef class KnowledgeBase:
RETURNS (bytes): Current state as binary string. RETURNS (bytes): Current state as binary string.
""" """
raise NotImplementedError( raise NotImplementedError(
Errors.E1045.format(parent="KnowledgeBase", method="to_bytes", name=self.__name__) Errors.E1045.format(
parent="KnowledgeBase", method="to_bytes", name=self.__name__
)
) )
def from_bytes(self, bytes_data: bytes, *, exclude: Tuple[str] = tuple()): def from_bytes(self, bytes_data: bytes, *, exclude: Tuple[str] = tuple()):
@ -84,25 +96,35 @@ cdef class KnowledgeBase:
exclude (Tuple[str]): Properties to exclude when restoring KB. exclude (Tuple[str]): Properties to exclude when restoring KB.
""" """
raise NotImplementedError( raise NotImplementedError(
Errors.E1045.format(parent="KnowledgeBase", method="from_bytes", name=self.__name__) Errors.E1045.format(
parent="KnowledgeBase", method="from_bytes", name=self.__name__
)
) )
def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None: def to_disk(
self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()
) -> None:
""" """
Write KnowledgeBase content to disk. Write KnowledgeBase content to disk.
path (Union[str, Path]): Target file path. path (Union[str, Path]): Target file path.
exclude (Iterable[str]): List of components to exclude. exclude (Iterable[str]): List of components to exclude.
""" """
raise NotImplementedError( raise NotImplementedError(
Errors.E1045.format(parent="KnowledgeBase", method="to_disk", name=self.__name__) Errors.E1045.format(
parent="KnowledgeBase", method="to_disk", name=self.__name__
)
) )
def from_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None: def from_disk(
self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()
) -> None:
""" """
Load KnowledgeBase content from disk. Load KnowledgeBase content from disk.
path (Union[str, Path]): Target file path. path (Union[str, Path]): Target file path.
exclude (Iterable[str]): List of components to exclude. exclude (Iterable[str]): List of components to exclude.
""" """
raise NotImplementedError( raise NotImplementedError(
Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__) Errors.E1045.format(
parent="KnowledgeBase", method="from_disk", name=self.__name__
)
) )

View File

@ -1,11 +1,11 @@
"""Knowledge-base for entity or concept linking.""" """Knowledge-base for entity or concept linking."""
from preshed.maps cimport PreshMap
from libcpp.vector cimport vector
from libc.stdint cimport int32_t, int64_t from libc.stdint cimport int32_t, int64_t
from libc.stdio cimport FILE from libc.stdio cimport FILE
from libcpp.vector cimport vector
from preshed.maps cimport PreshMap
from ..structs cimport AliasC, KBEntryC
from ..typedefs cimport hash_t from ..typedefs cimport hash_t
from ..structs cimport KBEntryC, AliasC
from .kb cimport KnowledgeBase from .kb cimport KnowledgeBase
ctypedef vector[KBEntryC] entry_vec ctypedef vector[KBEntryC] entry_vec
@ -55,23 +55,28 @@ cdef class InMemoryLookupKB(KnowledgeBase):
# optional data, we can let users configure a DB as the backend for this. # optional data, we can let users configure a DB as the backend for this.
cdef object _features_table cdef object _features_table
cdef inline int64_t c_add_vector(self, vector[float] entity_vector) nogil: cdef inline int64_t c_add_vector(self, vector[float] entity_vector) nogil:
"""Add an entity vector to the vectors table.""" """Add an entity vector to the vectors table."""
cdef int64_t new_index = self._vectors_table.size() cdef int64_t new_index = self._vectors_table.size()
self._vectors_table.push_back(entity_vector) self._vectors_table.push_back(entity_vector)
return new_index return new_index
cdef inline int64_t c_add_entity(
cdef inline int64_t c_add_entity(self, hash_t entity_hash, float freq, self,
int32_t vector_index, int feats_row) nogil: hash_t entity_hash,
float freq,
int32_t vector_index,
int feats_row
) nogil:
"""Add an entry to the vector of entries. """Add an entry to the vector of entries.
After calling this method, make sure to update also the _entry_index using the return value""" After calling this method, make sure to update also the _entry_index
using the return value"""
# This is what we'll map the entity hash key to. It's where the entry will sit # This is what we'll map the entity hash key to. It's where the entry will sit
# in the vector of entries, so we can get it later. # in the vector of entries, so we can get it later.
cdef int64_t new_index = self._entries.size() cdef int64_t new_index = self._entries.size()
# Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642 # Avoid struct initializer to enable nogil, cf.
# https://github.com/cython/cython/issues/1642
cdef KBEntryC entry cdef KBEntryC entry
entry.entity_hash = entity_hash entry.entity_hash = entity_hash
entry.vector_index = vector_index entry.vector_index = vector_index
@ -81,11 +86,17 @@ cdef class InMemoryLookupKB(KnowledgeBase):
self._entries.push_back(entry) self._entries.push_back(entry)
return new_index return new_index
cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs) nogil: cdef inline int64_t c_add_aliases(
"""Connect a mention to a list of potential entities with their prior probabilities . self,
After calling this method, make sure to update also the _alias_index using the return value""" hash_t alias_hash,
# This is what we'll map the alias hash key to. It's where the alias will be defined vector[int64_t] entry_indices,
# in the vector of aliases. vector[float] probs
) nogil:
"""Connect a mention to a list of potential entities with their prior
probabilities. After calling this method, make sure to update also the
_alias_index using the return value"""
# This is what we'll map the alias hash key to. It's where the alias will be
# defined in the vector of aliases.
cdef int64_t new_index = self._aliases_table.size() cdef int64_t new_index = self._aliases_table.size()
# Avoid struct initializer to enable nogil # Avoid struct initializer to enable nogil
@ -98,8 +109,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
cdef inline void _create_empty_vectors(self, hash_t dummy_hash) nogil: cdef inline void _create_empty_vectors(self, hash_t dummy_hash) nogil:
""" """
Initializing the vectors and making sure the first element of each vector is a dummy, Initializing the vectors and making sure the first element of each vector is a
because the PreshMap maps pointing to indices in these vectors can not contain 0 as value dummy, because the PreshMap maps pointing to indices in these vectors can not
contain 0 as value.
cf. https://github.com/explosion/preshed/issues/17 cf. https://github.com/explosion/preshed/issues/17
""" """
cdef int32_t dummy_value = 0 cdef int32_t dummy_value = 0
@ -130,12 +142,18 @@ cdef class InMemoryLookupKB(KnowledgeBase):
cdef class Writer: cdef class Writer:
cdef FILE* _fp cdef FILE* _fp
cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1 cdef int write_header(
self, int64_t nr_entries, int64_t entity_vector_length
) except -1
cdef int write_vector_element(self, float element) except -1 cdef int write_vector_element(self, float element) except -1
cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1 cdef int write_entry(
self, hash_t entry_hash, float entry_freq, int32_t vector_index
) except -1
cdef int write_alias_length(self, int64_t alias_length) except -1 cdef int write_alias_length(self, int64_t alias_length) except -1
cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1 cdef int write_alias_header(
self, hash_t alias_hash, int64_t candidate_length
) except -1
cdef int write_alias(self, int64_t entry_index, float prob) except -1 cdef int write_alias(self, int64_t entry_index, float prob) except -1
cdef int _write(self, void* value, size_t size) except -1 cdef int _write(self, void* value, size_t size) except -1
@ -143,12 +161,18 @@ cdef class Writer:
cdef class Reader: cdef class Reader:
cdef FILE* _fp cdef FILE* _fp
cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1 cdef int read_header(
self, int64_t* nr_entries, int64_t* entity_vector_length
) except -1
cdef int read_vector_element(self, float* element) except -1 cdef int read_vector_element(self, float* element) except -1
cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1 cdef int read_entry(
self, hash_t* entity_hash, float* freq, int32_t* vector_index
) except -1
cdef int read_alias_length(self, int64_t* alias_length) except -1 cdef int read_alias_length(self, int64_t* alias_length) except -1
cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1 cdef int read_alias_header(
self, hash_t* alias_hash, int64_t* candidate_length
) except -1
cdef int read_alias(self, int64_t* entry_index, float* prob) except -1 cdef int read_alias(self, int64_t* entry_index, float* prob) except -1
cdef int _read(self, void* value, size_t size) except -1 cdef int _read(self, void* value, size_t size) except -1

View File

@ -1,29 +1,35 @@
# cython: infer_types=True, profile=True # cython: infer_types=True, profile=True
from typing import Iterable, Callable, Dict, Any, Union from typing import Any, Callable, Dict, Iterable
import srsly import srsly
from preshed.maps cimport PreshMap
from cpython.exc cimport PyErr_SetFromErrno
from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
from libc.stdint cimport int32_t, int64_t
from libcpp.vector cimport vector
from pathlib import Path from cpython.exc cimport PyErr_SetFromErrno
from libc.stdint cimport int32_t, int64_t
from libc.stdio cimport fclose, feof, fopen, fread, fseek, fwrite
from libcpp.vector cimport vector
from preshed.maps cimport PreshMap
import warnings import warnings
from pathlib import Path
from ..tokens import Span from ..tokens import Span
from ..typedefs cimport hash_t from ..typedefs cimport hash_t
from ..errors import Errors, Warnings
from .. import util from .. import util
from ..errors import Errors, Warnings
from ..util import SimpleFrozenList, ensure_path from ..util import SimpleFrozenList, ensure_path
from ..vocab cimport Vocab from ..vocab cimport Vocab
from .kb cimport KnowledgeBase from .kb cimport KnowledgeBase
from .candidate import Candidate as Candidate from .candidate import Candidate as Candidate
cdef class InMemoryLookupKB(KnowledgeBase): cdef class InMemoryLookupKB(KnowledgeBase):
"""An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases, """An `InMemoryLookupKB` instance stores unique identifiers for entities
to support entity linking of named entities to real-world concepts. and their textual aliases, to support entity linking of named entities to
real-world concepts.
DOCS: https://spacy.io/api/inmemorylookupkb DOCS: https://spacy.io/api/inmemorylookupkb
""" """
@ -66,7 +72,8 @@ cdef class InMemoryLookupKB(KnowledgeBase):
def add_entity(self, str entity, float freq, vector[float] entity_vector): def add_entity(self, str entity, float freq, vector[float] entity_vector):
""" """
Add an entity to the KB, optionally specifying its log probability based on corpus frequency Add an entity to the KB, optionally specifying its log probability
based on corpus frequency.
Return the hash of the entity ID/name at the end. Return the hash of the entity ID/name at the end.
""" """
cdef hash_t entity_hash = self.vocab.strings.add(entity) cdef hash_t entity_hash = self.vocab.strings.add(entity)
@ -78,14 +85,20 @@ cdef class InMemoryLookupKB(KnowledgeBase):
# Raise an error if the provided entity vector is not of the correct length # Raise an error if the provided entity vector is not of the correct length
if len(entity_vector) != self.entity_vector_length: if len(entity_vector) != self.entity_vector_length:
raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length)) raise ValueError(
Errors.E141.format(
found=len(entity_vector), required=self.entity_vector_length
)
)
vector_index = self.c_add_vector(entity_vector=entity_vector) vector_index = self.c_add_vector(entity_vector=entity_vector)
new_index = self.c_add_entity(entity_hash=entity_hash, new_index = self.c_add_entity(
entity_hash=entity_hash,
freq=freq, freq=freq,
vector_index=vector_index, vector_index=vector_index,
feats_row=-1) # Features table currently not implemented feats_row=-1
) # Features table currently not implemented
self._entry_index[entity_hash] = new_index self._entry_index[entity_hash] = new_index
return entity_hash return entity_hash
@ -110,7 +123,12 @@ cdef class InMemoryLookupKB(KnowledgeBase):
else: else:
entity_vector = vector_list[i] entity_vector = vector_list[i]
if len(entity_vector) != self.entity_vector_length: if len(entity_vector) != self.entity_vector_length:
raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length)) raise ValueError(
Errors.E141.format(
found=len(entity_vector),
required=self.entity_vector_length
)
)
entry.entity_hash = entity_hash entry.entity_hash = entity_hash
entry.freq = freq_list[i] entry.freq = freq_list[i]
@ -144,11 +162,15 @@ cdef class InMemoryLookupKB(KnowledgeBase):
previous_alias_nr = self.get_size_aliases() previous_alias_nr = self.get_size_aliases()
# Throw an error if the length of entities and probabilities are not the same # Throw an error if the length of entities and probabilities are not the same
if not len(entities) == len(probabilities): if not len(entities) == len(probabilities):
raise ValueError(Errors.E132.format(alias=alias, raise ValueError(
Errors.E132.format(
alias=alias,
entities_length=len(entities), entities_length=len(entities),
probabilities_length=len(probabilities))) probabilities_length=len(probabilities))
)
# Throw an error if the probabilities sum up to more than 1 (allow for some rounding errors) # Throw an error if the probabilities sum up to more than 1 (allow for
# some rounding errors)
prob_sum = sum(probabilities) prob_sum = sum(probabilities)
if prob_sum > 1.00001: if prob_sum > 1.00001:
raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum)) raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum))
@ -165,40 +187,47 @@ cdef class InMemoryLookupKB(KnowledgeBase):
for entity, prob in zip(entities, probabilities): for entity, prob in zip(entities, probabilities):
entity_hash = self.vocab.strings[entity] entity_hash = self.vocab.strings[entity]
if not entity_hash in self._entry_index: if entity_hash not in self._entry_index:
raise ValueError(Errors.E134.format(entity=entity)) raise ValueError(Errors.E134.format(entity=entity))
entry_index = <int64_t>self._entry_index.get(entity_hash) entry_index = <int64_t>self._entry_index.get(entity_hash)
entry_indices.push_back(int(entry_index)) entry_indices.push_back(int(entry_index))
probs.push_back(float(prob)) probs.push_back(float(prob))
new_index = self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs) new_index = self.c_add_aliases(
alias_hash=alias_hash, entry_indices=entry_indices, probs=probs
)
self._alias_index[alias_hash] = new_index self._alias_index[alias_hash] = new_index
if previous_alias_nr + 1 != self.get_size_aliases(): if previous_alias_nr + 1 != self.get_size_aliases():
raise RuntimeError(Errors.E891.format(alias=alias)) raise RuntimeError(Errors.E891.format(alias=alias))
return alias_hash return alias_hash
def append_alias(self, str alias, str entity, float prior_prob, ignore_warnings=False): def append_alias(
self, str alias, str entity, float prior_prob, ignore_warnings=False
):
""" """
For an alias already existing in the KB, extend its potential entities with one more. For an alias already existing in the KB, extend its potential entities
with one more.
Throw a warning if either the alias or the entity is unknown, Throw a warning if either the alias or the entity is unknown,
or when the combination is already previously recorded. or when the combination is already previously recorded.
Throw an error if this entity+prior prob would exceed the sum of 1. Throw an error if this entity+prior prob would exceed the sum of 1.
For efficiency, it's best to use the method `add_alias` as much as possible instead of this one. For efficiency, it's best to use the method `add_alias` as much as
possible instead of this one.
""" """
# Check if the alias exists in the KB # Check if the alias exists in the KB
cdef hash_t alias_hash = self.vocab.strings[alias] cdef hash_t alias_hash = self.vocab.strings[alias]
if not alias_hash in self._alias_index: if alias_hash not in self._alias_index:
raise ValueError(Errors.E176.format(alias=alias)) raise ValueError(Errors.E176.format(alias=alias))
# Check if the entity exists in the KB # Check if the entity exists in the KB
cdef hash_t entity_hash = self.vocab.strings[entity] cdef hash_t entity_hash = self.vocab.strings[entity]
if not entity_hash in self._entry_index: if entity_hash not in self._entry_index:
raise ValueError(Errors.E134.format(entity=entity)) raise ValueError(Errors.E134.format(entity=entity))
entry_index = <int64_t>self._entry_index.get(entity_hash) entry_index = <int64_t>self._entry_index.get(entity_hash)
# Throw an error if the prior probabilities (including the new one) sum up to more than 1 # Throw an error if the prior probabilities (including the new one)
# sum up to more than 1
alias_index = <int64_t>self._alias_index.get(alias_hash) alias_index = <int64_t>self._alias_index.get(alias_hash)
alias_entry = self._aliases_table[alias_index] alias_entry = self._aliases_table[alias_index]
current_sum = sum([p for p in alias_entry.probs]) current_sum = sum([p for p in alias_entry.probs])
@ -231,12 +260,13 @@ cdef class InMemoryLookupKB(KnowledgeBase):
def get_alias_candidates(self, str alias) -> Iterable[Candidate]: def get_alias_candidates(self, str alias) -> Iterable[Candidate]:
""" """
Return candidate entities for an alias. Each candidate defines the entity, the original alias, Return candidate entities for an alias. Each candidate defines the
and the prior probability of that alias resolving to that entity. entity, the original alias, and the prior probability of that alias
resolving to that entity.
If the alias is not known in the KB, an empty list is returned. If the alias is not known in the KB, an empty list is returned.
""" """
cdef hash_t alias_hash = self.vocab.strings[alias] cdef hash_t alias_hash = self.vocab.strings[alias]
if not alias_hash in self._alias_index: if alias_hash not in self._alias_index:
return [] return []
alias_index = <int64_t>self._alias_index.get(alias_hash) alias_index = <int64_t>self._alias_index.get(alias_hash)
alias_entry = self._aliases_table[alias_index] alias_entry = self._aliases_table[alias_index]
@ -244,10 +274,14 @@ cdef class InMemoryLookupKB(KnowledgeBase):
return [Candidate(kb=self, return [Candidate(kb=self,
entity_hash=self._entries[entry_index].entity_hash, entity_hash=self._entries[entry_index].entity_hash,
entity_freq=self._entries[entry_index].freq, entity_freq=self._entries[entry_index].freq,
entity_vector=self._vectors_table[self._entries[entry_index].vector_index], entity_vector=self._vectors_table[
self._entries[entry_index].vector_index
],
alias_hash=alias_hash, alias_hash=alias_hash,
prior_prob=prior_prob) prior_prob=prior_prob)
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs) for (entry_index, prior_prob) in zip(
alias_entry.entry_indices, alias_entry.probs
)
if entry_index != 0] if entry_index != 0]
def get_vector(self, str entity): def get_vector(self, str entity):
@ -261,8 +295,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
return self._vectors_table[self._entries[entry_index].vector_index] return self._vectors_table[self._entries[entry_index].vector_index]
def get_prior_prob(self, str entity, str alias): def get_prior_prob(self, str entity, str alias):
""" Return the prior probability of a given alias being linked to a given entity, """ Return the prior probability of a given alias being linked to a
or return 0.0 when this combination is not known in the knowledge base""" given entity, or return 0.0 when this combination is not known in the
knowledge base."""
cdef hash_t alias_hash = self.vocab.strings[alias] cdef hash_t alias_hash = self.vocab.strings[alias]
cdef hash_t entity_hash = self.vocab.strings[entity] cdef hash_t entity_hash = self.vocab.strings[entity]
@ -273,7 +308,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
entry_index = self._entry_index[entity_hash] entry_index = self._entry_index[entity_hash]
alias_entry = self._aliases_table[alias_index] alias_entry = self._aliases_table[alias_index]
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs): for (entry_index, prior_prob) in zip(
alias_entry.entry_indices, alias_entry.probs
):
if self._entries[entry_index].entity_hash == entity_hash: if self._entries[entry_index].entity_hash == entity_hash:
return prior_prob return prior_prob
@ -283,13 +320,19 @@ cdef class InMemoryLookupKB(KnowledgeBase):
"""Serialize the current state to a binary string. """Serialize the current state to a binary string.
""" """
def serialize_header(): def serialize_header():
header = (self.get_size_entities(), self.get_size_aliases(), self.entity_vector_length) header = (
self.get_size_entities(),
self.get_size_aliases(),
self.entity_vector_length
)
return srsly.json_dumps(header) return srsly.json_dumps(header)
def serialize_entries(): def serialize_entries():
i = 1 i = 1
tuples = [] tuples = []
for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]): for entry_hash, entry_index in sorted(
self._entry_index.items(), key=lambda x: x[1]
):
entry = self._entries[entry_index] entry = self._entries[entry_index]
assert entry.entity_hash == entry_hash assert entry.entity_hash == entry_hash
assert entry_index == i assert entry_index == i
@ -302,7 +345,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
headers = [] headers = []
indices_lists = [] indices_lists = []
probs_lists = [] probs_lists = []
for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]): for alias_hash, alias_index in sorted(
self._alias_index.items(), key=lambda x: x[1]
):
alias = self._aliases_table[alias_index] alias = self._aliases_table[alias_index]
assert alias_index == i assert alias_index == i
candidate_length = len(alias.entry_indices) candidate_length = len(alias.entry_indices)
@ -360,7 +405,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
indices = srsly.json_loads(all_data[1]) indices = srsly.json_loads(all_data[1])
probs = srsly.json_loads(all_data[2]) probs = srsly.json_loads(all_data[2])
for header, indices, probs in zip(headers, indices, probs): for header, indices, probs in zip(headers, indices, probs):
alias_hash, candidate_length = header alias_hash, _candidate_length = header
alias.entry_indices = indices alias.entry_indices = indices
alias.probs = probs alias.probs = probs
self._aliases_table[i] = alias self._aliases_table[i] = alias
@ -409,10 +454,14 @@ cdef class InMemoryLookupKB(KnowledgeBase):
writer.write_vector_element(element) writer.write_vector_element(element)
i = i+1 i = i+1
# dumping the entry records in the order in which they are in the _entries vector. # dumping the entry records in the order in which they are in the
# index 0 is a dummy object not stored in the _entry_index and can be ignored. # _entries vector.
# index 0 is a dummy object not stored in the _entry_index and can
# be ignored.
i = 1 i = 1
for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]): for entry_hash, entry_index in sorted(
self._entry_index.items(), key=lambda x: x[1]
):
entry = self._entries[entry_index] entry = self._entries[entry_index]
assert entry.entity_hash == entry_hash assert entry.entity_hash == entry_hash
assert entry_index == i assert entry_index == i
@ -424,7 +473,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
# dumping the aliases in the order in which they are in the _alias_index vector. # dumping the aliases in the order in which they are in the _alias_index vector.
# index 0 is a dummy object not stored in the _aliases_table and can be ignored. # index 0 is a dummy object not stored in the _aliases_table and can be ignored.
i = 1 i = 1
for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]): for alias_hash, alias_index in sorted(
self._alias_index.items(), key=lambda x: x[1]
):
alias = self._aliases_table[alias_index] alias = self._aliases_table[alias_index]
assert alias_index == i assert alias_index == i
@ -530,7 +581,8 @@ cdef class Writer:
def __init__(self, path): def __init__(self, path):
assert isinstance(path, Path) assert isinstance(path, Path)
content = bytes(path) content = bytes(path)
cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content cdef bytes bytes_loc = content.encode('utf8') \
if type(content) == str else content
self._fp = fopen(<char*>bytes_loc, 'wb') self._fp = fopen(<char*>bytes_loc, 'wb')
if not self._fp: if not self._fp:
raise IOError(Errors.E146.format(path=path)) raise IOError(Errors.E146.format(path=path))
@ -540,14 +592,18 @@ cdef class Writer:
cdef size_t status = fclose(self._fp) cdef size_t status = fclose(self._fp)
assert status == 0 assert status == 0
cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1: cdef int write_header(
self, int64_t nr_entries, int64_t entity_vector_length
) except -1:
self._write(&nr_entries, sizeof(nr_entries)) self._write(&nr_entries, sizeof(nr_entries))
self._write(&entity_vector_length, sizeof(entity_vector_length)) self._write(&entity_vector_length, sizeof(entity_vector_length))
cdef int write_vector_element(self, float element) except -1: cdef int write_vector_element(self, float element) except -1:
self._write(&element, sizeof(element)) self._write(&element, sizeof(element))
cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1: cdef int write_entry(
self, hash_t entry_hash, float entry_freq, int32_t vector_index
) except -1:
self._write(&entry_hash, sizeof(entry_hash)) self._write(&entry_hash, sizeof(entry_hash))
self._write(&entry_freq, sizeof(entry_freq)) self._write(&entry_freq, sizeof(entry_freq))
self._write(&vector_index, sizeof(vector_index)) self._write(&vector_index, sizeof(vector_index))
@ -556,7 +612,9 @@ cdef class Writer:
cdef int write_alias_length(self, int64_t alias_length) except -1: cdef int write_alias_length(self, int64_t alias_length) except -1:
self._write(&alias_length, sizeof(alias_length)) self._write(&alias_length, sizeof(alias_length))
cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1: cdef int write_alias_header(
self, hash_t alias_hash, int64_t candidate_length
) except -1:
self._write(&alias_hash, sizeof(alias_hash)) self._write(&alias_hash, sizeof(alias_hash))
self._write(&candidate_length, sizeof(candidate_length)) self._write(&candidate_length, sizeof(candidate_length))
@ -572,16 +630,19 @@ cdef class Writer:
cdef class Reader: cdef class Reader:
def __init__(self, path): def __init__(self, path):
content = bytes(path) content = bytes(path)
cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content cdef bytes bytes_loc = content.encode('utf8') \
if type(content) == str else content
self._fp = fopen(<char*>bytes_loc, 'rb') self._fp = fopen(<char*>bytes_loc, 'rb')
if not self._fp: if not self._fp:
PyErr_SetFromErrno(IOError) PyErr_SetFromErrno(IOError)
status = fseek(self._fp, 0, 0) # this can be 0 if there is no header fseek(self._fp, 0, 0) # this can be 0 if there is no header
def __dealloc__(self): def __dealloc__(self):
fclose(self._fp) fclose(self._fp)
cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1: cdef int read_header(
self, int64_t* nr_entries, int64_t* entity_vector_length
) except -1:
status = self._read(nr_entries, sizeof(int64_t)) status = self._read(nr_entries, sizeof(int64_t))
if status < 1: if status < 1:
if feof(self._fp): if feof(self._fp):
@ -601,7 +662,9 @@ cdef class Reader:
return 0 # end of file return 0 # end of file
raise IOError(Errors.E145.format(param="vector element")) raise IOError(Errors.E145.format(param="vector element"))
cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1: cdef int read_entry(
self, hash_t* entity_hash, float* freq, int32_t* vector_index
) except -1:
status = self._read(entity_hash, sizeof(hash_t)) status = self._read(entity_hash, sizeof(hash_t))
if status < 1: if status < 1:
if feof(self._fp): if feof(self._fp):
@ -632,7 +695,9 @@ cdef class Reader:
return 0 # end of file return 0 # end of file
raise IOError(Errors.E145.format(param="alias length")) raise IOError(Errors.E145.format(param="alias length"))
cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1: cdef int read_alias_header(
self, hash_t* alias_hash, int64_t* candidate_length
) except -1:
status = self._read(alias_hash, sizeof(hash_t)) status = self._read(alias_hash, sizeof(hash_t))
if status < 1: if status < 1:
if feof(self._fp): if feof(self._fp):

View File

@ -1,5 +1,5 @@
from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language, BaseDefaults
class AfrikaansDefaults(BaseDefaults): class AfrikaansDefaults(BaseDefaults):

View File

@ -1,12 +1,11 @@
from .stop_words import STOP_WORDS from ...attrs import LANG
from ...language import BaseDefaults, Language
from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...language import Language, BaseDefaults
from ...attrs import LANG
from ...util import update_exc
class AmharicDefaults(BaseDefaults): class AmharicDefaults(BaseDefaults):

View File

@ -1,5 +1,11 @@
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY from ..char_classes import (
from ..char_classes import UNITS, ALPHA_UPPER ALPHA_UPPER,
CURRENCY,
LIST_ELLIPSES,
LIST_PUNCT,
LIST_QUOTES,
UNITS,
)
_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split() _list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()

View File

@ -1,5 +1,4 @@
from ...symbols import ORTH, NORM from ...symbols import NORM, ORTH
_exc = {} _exc = {}

View File

@ -1,8 +1,8 @@
from .stop_words import STOP_WORDS from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from ...language import Language, BaseDefaults
class ArabicDefaults(BaseDefaults): class ArabicDefaults(BaseDefaults):

View File

@ -1,5 +1,11 @@
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY from ..char_classes import (
from ..char_classes import UNITS, ALPHA_UPPER ALPHA_UPPER,
CURRENCY,
LIST_ELLIPSES,
LIST_PUNCT,
LIST_QUOTES,
UNITS,
)
_suffixes = ( _suffixes = (
LIST_PUNCT LIST_PUNCT

View File

@ -1,7 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import NORM, ORTH
from ...symbols import ORTH, NORM
from ...util import update_exc from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {} _exc = {}

View File

@ -1,6 +1,6 @@
from .stop_words import STOP_WORDS from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language, BaseDefaults from .stop_words import STOP_WORDS
class AzerbaijaniDefaults(BaseDefaults): class AzerbaijaniDefaults(BaseDefaults):

View File

@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM from ...attrs import LIKE_NUM
# Eleven, twelve etc. are written separate: on bir, on iki # Eleven, twelve etc. are written separate: on bir, on iki
_num_words = [ _num_words = [

View File

@ -1,12 +1,14 @@
from ...attrs import LANG
from ...language import BaseDefaults, Language
from ...util import update_exc
from ..punctuation import (
COMBINING_DIACRITICS_TOKENIZER_INFIXES,
COMBINING_DIACRITICS_TOKENIZER_SUFFIXES,
)
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
from ...language import Language, BaseDefaults
from ...attrs import LANG
from ...util import update_exc
class BulgarianDefaults(BaseDefaults): class BulgarianDefaults(BaseDefaults):

View File

@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM from ...attrs import LIKE_NUM
_num_words = [ _num_words = [
"нула", "нула",
"едно", "едно",

View File

@ -4,8 +4,7 @@ References:
(countries, occupations, fields of studies and more). (countries, occupations, fields of studies and more).
""" """
from ...symbols import ORTH, NORM from ...symbols import NORM, ORTH
_exc = {} _exc = {}

View File

@ -1,10 +1,12 @@
from typing import Optional, Callable from typing import Callable, Optional
from thinc.api import Model from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
from ...language import Language, BaseDefaults
from ...pipeline import Lemmatizer from ...pipeline import Lemmatizer
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class BengaliDefaults(BaseDefaults): class BengaliDefaults(BaseDefaults):

View File

@ -1,6 +1,14 @@
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS from ..char_classes import (
from ..char_classes import ALPHA_LOWER, ALPHA, HYPHENS, CONCAT_QUOTES, UNITS ALPHA,
ALPHA_LOWER,
CONCAT_QUOTES,
HYPHENS,
LIST_ELLIPSES,
LIST_ICONS,
LIST_PUNCT,
LIST_QUOTES,
UNITS,
)
_currency = r"\$¢£€¥฿৳" _currency = r"\$¢£€¥฿৳"
_quotes = CONCAT_QUOTES.replace("'", "") _quotes = CONCAT_QUOTES.replace("'", "")

View File

@ -1,7 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import NORM, ORTH
from ...symbols import ORTH, NORM
from ...util import update_exc from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {} _exc = {}

View File

@ -1,14 +1,14 @@
from typing import Optional, Callable from typing import Callable, Optional
from thinc.api import Model from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from ...language import BaseDefaults, Language
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language, BaseDefaults
from .lemmatizer import CatalanLemmatizer from .lemmatizer import CatalanLemmatizer
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class CatalanDefaults(BaseDefaults): class CatalanDefaults(BaseDefaults):

View File

@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM from ...attrs import LIKE_NUM
_num_words = [ _num_words = [
"zero", "zero",
"un", "un",

View File

@ -1,9 +1,18 @@
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS from ..char_classes import (
from ..char_classes import LIST_CURRENCY ALPHA,
from ..char_classes import CURRENCY ALPHA_LOWER,
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT ALPHA_UPPER,
from ..char_classes import merge_chars, _units CONCAT_QUOTES,
CURRENCY,
LIST_CURRENCY,
LIST_ELLIPSES,
LIST_ICONS,
LIST_PUNCT,
LIST_QUOTES,
PUNCT,
_units,
merge_chars,
)
ELISION = " ' ".strip().replace(" ", "").replace("\n", "") ELISION = " ' ".strip().replace(" ", "").replace("\n", "")

View File

@ -1,7 +1,8 @@
from typing import Union, Iterator, Tuple from typing import Iterator, Tuple, Union
from ...tokens import Doc, Span
from ...symbols import NOUN, PROPN
from ...errors import Errors from ...errors import Errors
from ...symbols import NOUN, PROPN
from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:

View File

@ -1,7 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import NORM, ORTH
from ...symbols import ORTH, NORM
from ...util import update_exc from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {} _exc = {}

View File

@ -1,6 +1,6 @@
from .stop_words import STOP_WORDS from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from ...language import Language, BaseDefaults from .stop_words import STOP_WORDS
class CzechDefaults(BaseDefaults): class CzechDefaults(BaseDefaults):

View File

@ -1,9 +1,9 @@
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language, BaseDefaults from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class DanishDefaults(BaseDefaults): class DanishDefaults(BaseDefaults):

View File

@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM from ...attrs import LIKE_NUM
# Source http://fjern-uv.dk/tal.php # Source http://fjern-uv.dk/tal.php
_num_words = """nul _num_words = """nul
en et to tre fire fem seks syv otte ni ti en et to tre fire fem seks syv otte ni ti

View File

@ -1,8 +1,13 @@
from ..char_classes import LIST_ELLIPSES, LIST_ICONS from ..char_classes import (
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER ALPHA,
ALPHA_LOWER,
ALPHA_UPPER,
CONCAT_QUOTES,
LIST_ELLIPSES,
LIST_ICONS,
)
from ..punctuation import TOKENIZER_SUFFIXES from ..punctuation import TOKENIZER_SUFFIXES
_quotes = CONCAT_QUOTES.replace("'", "") _quotes = CONCAT_QUOTES.replace("'", "")
_infixes = ( _infixes = (

View File

@ -1,7 +1,8 @@
from typing import Union, Iterator, Tuple from typing import Iterator, Tuple, Union
from ...tokens import Doc, Span
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
from ...errors import Errors from ...errors import Errors
from ...symbols import AUX, NOUN, PRON, PROPN, VERB
from ...tokens import Doc, Span
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:

View File

@ -2,10 +2,9 @@
Tokenizer Exceptions. Tokenizer Exceptions.
Source: https://forkortelse.dk/ and various others. Source: https://forkortelse.dk/ and various others.
""" """
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import NORM, ORTH
from ...symbols import ORTH, NORM
from ...util import update_exc from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {} _exc = {}

View File

@ -1,8 +1,8 @@
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from ...language import BaseDefaults, Language
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language, BaseDefaults from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class GermanDefaults(BaseDefaults): class GermanDefaults(BaseDefaults):

View File

@ -1,9 +1,18 @@
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES from ..char_classes import (
from ..char_classes import CURRENCY, UNITS, PUNCT ALPHA,
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER ALPHA_LOWER,
ALPHA_UPPER,
CONCAT_QUOTES,
CURRENCY,
LIST_ELLIPSES,
LIST_ICONS,
LIST_PUNCT,
LIST_QUOTES,
PUNCT,
UNITS,
)
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
_prefixes = ["``"] + BASE_TOKENIZER_PREFIXES _prefixes = ["``"] + BASE_TOKENIZER_PREFIXES
_suffixes = ( _suffixes = (

View File

@ -1,7 +1,7 @@
from typing import Union, Iterator, Tuple from typing import Iterator, Tuple, Union
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors from ...errors import Errors
from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span from ...tokens import Doc, Span

View File

@ -1,7 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import NORM, ORTH
from ...symbols import ORTH, NORM
from ...util import update_exc from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = { _exc = {
"auf'm": [{ORTH: "auf"}, {ORTH: "'m", NORM: "dem"}], "auf'm": [{ORTH: "auf"}, {ORTH: "'m", NORM: "dem"}],

View File

@ -1,6 +1,6 @@
from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from ...language import Language, BaseDefaults
class LowerSorbianDefaults(BaseDefaults): class LowerSorbianDefaults(BaseDefaults):

View File

@ -1,13 +1,14 @@
from typing import Optional, Callable from typing import Callable, Optional
from thinc.api import Model from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .lemmatizer import GreekLemmatizer from .lemmatizer import GreekLemmatizer
from ...language import Language, BaseDefaults from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class GreekDefaults(BaseDefaults): class GreekDefaults(BaseDefaults):

View File

@ -1,5 +1,6 @@
def get_pos_from_wiktionary(): def get_pos_from_wiktionary():
import re import re
from gensim.corpora.wikicorpus import extract_pages from gensim.corpora.wikicorpus import extract_pages
regex = re.compile(r"==={{(\w+)\|el}}===") regex = re.compile(r"==={{(\w+)\|el}}===")

View File

@ -1,6 +1,16 @@
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY from ..char_classes import (
from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS ALPHA,
from ..char_classes import CONCAT_QUOTES, CURRENCY ALPHA_LOWER,
ALPHA_UPPER,
CONCAT_QUOTES,
CURRENCY,
HYPHENS,
LIST_CURRENCY,
LIST_ELLIPSES,
LIST_ICONS,
LIST_PUNCT,
LIST_QUOTES,
)
_units = ( _units = (
"km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft " "km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft "

View File

@ -1,7 +1,7 @@
from typing import Union, Iterator, Tuple from typing import Iterator, Tuple, Union
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors from ...errors import Errors
from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span from ...tokens import Doc, Span

View File

@ -1,6 +1,6 @@
from ..tokenizer_exceptions import BASE_EXCEPTIONS from ...symbols import NORM, ORTH
from ...symbols import ORTH, NORM
from ...util import update_exc from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc = {} _exc = {}

View File

@ -1,13 +1,14 @@
from typing import Optional, Callable from typing import Callable, Optional
from thinc.api import Model from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_INFIXES
from .lemmatizer import EnglishLemmatizer from .lemmatizer import EnglishLemmatizer
from ...language import Language, BaseDefaults from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class EnglishDefaults(BaseDefaults): class EnglishDefaults(BaseDefaults):

View File

@ -1,5 +1,12 @@
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS from ..char_classes import (
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA ALPHA,
ALPHA_LOWER,
ALPHA_UPPER,
CONCAT_QUOTES,
HYPHENS,
LIST_ELLIPSES,
LIST_ICONS,
)
_infixes = ( _infixes = (
LIST_ELLIPSES LIST_ELLIPSES

View File

@ -1,7 +1,7 @@
from typing import Union, Iterator, Tuple from typing import Iterator, Tuple, Union
from ...symbols import NOUN, PROPN, PRON
from ...errors import Errors from ...errors import Errors
from ...symbols import NOUN, PRON, PROPN
from ...tokens import Doc, Span from ...tokens import Doc, Span

View File

@ -1,8 +1,8 @@
from typing import Dict, List from typing import Dict, List
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH, NORM
from ...util import update_exc
from ...symbols import NORM, ORTH
from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
_exc: Dict[str, List[Dict]] = {} _exc: Dict[str, List[Dict]] = {}
_exclude = [ _exclude = [

View File

@ -1,12 +1,14 @@
from typing import Optional, Callable from typing import Callable, Optional
from thinc.api import Model from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .lemmatizer import SpanishLemmatizer from .lemmatizer import SpanishLemmatizer
from .syntax_iterators import SYNTAX_ITERATORS from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from ...language import Language, BaseDefaults from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class SpanishDefaults(BaseDefaults): class SpanishDefaults(BaseDefaults):

View File

@ -1,5 +1,5 @@
from typing import List, Optional, Tuple
import re import re
from typing import List, Optional, Tuple
from ...pipeline import Lemmatizer from ...pipeline import Lemmatizer
from ...tokens import Token from ...tokens import Token

View File

@ -1,6 +1,5 @@
from ...attrs import LIKE_NUM from ...attrs import LIKE_NUM
_num_words = [ _num_words = [
"cero", "cero",
"uno", "uno",

Some files were not shown because too many files have changed in this diff Show More