mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 17:54:39 +03:00
Merge branch 'develop' into feature/pymorphy-lemmatizer-diacritics
This commit is contained in:
commit
135a28a89d
118
.github/azure-steps.yml
vendored
118
.github/azure-steps.yml
vendored
|
@ -1,118 +0,0 @@
|
||||||
parameters:
|
|
||||||
python_version: ''
|
|
||||||
architecture: 'x64'
|
|
||||||
num_build_jobs: 2
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- task: UsePythonVersion@0
|
|
||||||
inputs:
|
|
||||||
versionSpec: ${{ parameters.python_version }}
|
|
||||||
architecture: ${{ parameters.architecture }}
|
|
||||||
allowUnstable: true
|
|
||||||
|
|
||||||
- bash: |
|
|
||||||
echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"
|
|
||||||
displayName: 'Set variables'
|
|
||||||
|
|
||||||
- script: |
|
|
||||||
python -m pip install -U build pip setuptools
|
|
||||||
python -m pip install -U -r requirements.txt
|
|
||||||
displayName: "Install dependencies"
|
|
||||||
|
|
||||||
- script: |
|
|
||||||
python -m build --sdist
|
|
||||||
displayName: "Build sdist"
|
|
||||||
|
|
||||||
- script: |
|
|
||||||
python -m mypy spacy
|
|
||||||
displayName: 'Run mypy'
|
|
||||||
condition: ne(variables['python_version'], '3.6')
|
|
||||||
|
|
||||||
- task: DeleteFiles@1
|
|
||||||
inputs:
|
|
||||||
contents: "spacy"
|
|
||||||
displayName: "Delete source directory"
|
|
||||||
|
|
||||||
- task: DeleteFiles@1
|
|
||||||
inputs:
|
|
||||||
contents: "*.egg-info"
|
|
||||||
displayName: "Delete egg-info directory"
|
|
||||||
|
|
||||||
- script: |
|
|
||||||
python -m pip freeze > installed.txt
|
|
||||||
python -m pip uninstall -y -r installed.txt
|
|
||||||
displayName: "Uninstall all packages"
|
|
||||||
|
|
||||||
- bash: |
|
|
||||||
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
|
|
||||||
SPACY_NUM_BUILD_JOBS=${{ parameters.num_build_jobs }} python -m pip install dist/$SDIST
|
|
||||||
displayName: "Install from sdist"
|
|
||||||
|
|
||||||
- script: |
|
|
||||||
python -W error -c "import spacy"
|
|
||||||
displayName: "Test import"
|
|
||||||
|
|
||||||
- script: |
|
|
||||||
python -m spacy download ca_core_news_sm
|
|
||||||
python -m spacy download ca_core_news_md
|
|
||||||
python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
|
|
||||||
displayName: 'Test download CLI'
|
|
||||||
condition: eq(variables['python_version'], '3.9')
|
|
||||||
|
|
||||||
- script: |
|
|
||||||
python -W error -m spacy info ca_core_news_sm | grep -q download_url
|
|
||||||
displayName: 'Test download_url in info CLI'
|
|
||||||
condition: eq(variables['python_version'], '3.9')
|
|
||||||
|
|
||||||
- script: |
|
|
||||||
python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
|
|
||||||
displayName: 'Test no warnings on load (#11713)'
|
|
||||||
condition: eq(variables['python_version'], '3.9')
|
|
||||||
|
|
||||||
- script: |
|
|
||||||
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
|
|
||||||
displayName: 'Test convert CLI'
|
|
||||||
condition: eq(variables['python_version'], '3.9')
|
|
||||||
|
|
||||||
- script: |
|
|
||||||
python -m spacy init config -p ner -l ca ner.cfg
|
|
||||||
python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
|
|
||||||
displayName: 'Test debug config CLI'
|
|
||||||
condition: eq(variables['python_version'], '3.9')
|
|
||||||
|
|
||||||
- script: |
|
|
||||||
# will have errors due to sparse data, check for summary in output
|
|
||||||
python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
|
|
||||||
displayName: 'Test debug data CLI'
|
|
||||||
condition: eq(variables['python_version'], '3.9')
|
|
||||||
|
|
||||||
- script: |
|
|
||||||
python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
|
|
||||||
displayName: 'Test train CLI'
|
|
||||||
condition: eq(variables['python_version'], '3.9')
|
|
||||||
|
|
||||||
- script: |
|
|
||||||
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
|
|
||||||
PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
|
|
||||||
displayName: 'Test assemble CLI'
|
|
||||||
condition: eq(variables['python_version'], '3.9')
|
|
||||||
|
|
||||||
- script: |
|
|
||||||
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
|
|
||||||
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
|
|
||||||
displayName: 'Test assemble CLI vectors warning'
|
|
||||||
condition: eq(variables['python_version'], '3.9')
|
|
||||||
|
|
||||||
- script: |
|
|
||||||
python -m pip install -U -r requirements.txt
|
|
||||||
displayName: "Install test requirements"
|
|
||||||
|
|
||||||
- script: |
|
|
||||||
python -m pytest --pyargs spacy -W error
|
|
||||||
displayName: "Run CPU tests"
|
|
||||||
|
|
||||||
- script: |
|
|
||||||
python -m pip install 'spacy[apple]'
|
|
||||||
python -m pytest --pyargs spacy
|
|
||||||
displayName: "Run CPU tests with thinc-apple-ops"
|
|
||||||
condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
|
|
65
.github/workflows/tests.yml
vendored
65
.github/workflows/tests.yml
vendored
|
@ -37,10 +37,20 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
python -m pip install black -c requirements.txt
|
python -m pip install black -c requirements.txt
|
||||||
python -m black spacy --check
|
python -m black spacy --check
|
||||||
|
- name: isort
|
||||||
|
run: |
|
||||||
|
python -m pip install isort -c requirements.txt
|
||||||
|
python -m isort spacy --check
|
||||||
- name: flake8
|
- name: flake8
|
||||||
run: |
|
run: |
|
||||||
python -m pip install flake8==5.0.4
|
python -m pip install flake8==5.0.4
|
||||||
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
|
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
|
||||||
|
- name: cython-lint
|
||||||
|
run: |
|
||||||
|
python -m pip install cython-lint -c requirements.txt
|
||||||
|
# E501: line too long, W291: trailing whitespace, E266: too many leading '#' for block comment
|
||||||
|
cython-lint spacy --ignore E501,W291,E266
|
||||||
|
|
||||||
tests:
|
tests:
|
||||||
name: Test
|
name: Test
|
||||||
needs: Validate
|
needs: Validate
|
||||||
|
@ -107,22 +117,22 @@ jobs:
|
||||||
- name: Test import
|
- name: Test import
|
||||||
run: python -W error -c "import spacy"
|
run: python -W error -c "import spacy"
|
||||||
|
|
||||||
- name: "Test download CLI"
|
# - name: "Test download CLI"
|
||||||
run: |
|
# run: |
|
||||||
python -m spacy download ca_core_news_sm
|
# python -m spacy download ca_core_news_sm
|
||||||
python -m spacy download ca_core_news_md
|
# python -m spacy download ca_core_news_md
|
||||||
python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
|
# python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
|
||||||
if: matrix.python_version == '3.9'
|
# if: matrix.python_version == '3.9'
|
||||||
|
#
|
||||||
- name: "Test download_url in info CLI"
|
# - name: "Test download_url in info CLI"
|
||||||
run: |
|
# run: |
|
||||||
python -W error -m spacy info ca_core_news_sm | grep -q download_url
|
# python -W error -m spacy info ca_core_news_sm | grep -q download_url
|
||||||
if: matrix.python_version == '3.9'
|
# if: matrix.python_version == '3.9'
|
||||||
|
#
|
||||||
- name: "Test no warnings on load (#11713)"
|
# - name: "Test no warnings on load (#11713)"
|
||||||
run: |
|
# run: |
|
||||||
python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
|
# python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
|
||||||
if: matrix.python_version == '3.9'
|
# if: matrix.python_version == '3.9'
|
||||||
|
|
||||||
- name: "Test convert CLI"
|
- name: "Test convert CLI"
|
||||||
run: |
|
run: |
|
||||||
|
@ -146,17 +156,17 @@ jobs:
|
||||||
python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
|
python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
|
||||||
if: matrix.python_version == '3.9'
|
if: matrix.python_version == '3.9'
|
||||||
|
|
||||||
- name: "Test assemble CLI"
|
# - name: "Test assemble CLI"
|
||||||
run: |
|
# run: |
|
||||||
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
|
# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
|
||||||
PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
|
# PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
|
||||||
if: matrix.python_version == '3.9'
|
# if: matrix.python_version == '3.9'
|
||||||
|
#
|
||||||
- name: "Test assemble CLI vectors warning"
|
# - name: "Test assemble CLI vectors warning"
|
||||||
run: |
|
# run: |
|
||||||
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
|
# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
|
||||||
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
|
# python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
|
||||||
if: matrix.python_version == '3.9'
|
# if: matrix.python_version == '3.9'
|
||||||
|
|
||||||
- name: "Install test requirements"
|
- name: "Install test requirements"
|
||||||
run: |
|
run: |
|
||||||
|
@ -165,6 +175,7 @@ jobs:
|
||||||
- name: "Run CPU tests"
|
- name: "Run CPU tests"
|
||||||
run: |
|
run: |
|
||||||
python -m pytest --pyargs spacy -W error
|
python -m pytest --pyargs spacy -W error
|
||||||
|
if: "!(startsWith(matrix.os, 'macos') && matrix.python_version == '3.11')"
|
||||||
|
|
||||||
- name: "Run CPU tests with thinc-apple-ops"
|
- name: "Run CPU tests with thinc-apple-ops"
|
||||||
run: |
|
run: |
|
||||||
|
|
4
Makefile
4
Makefile
|
@ -1,11 +1,11 @@
|
||||||
SHELL := /bin/bash
|
SHELL := /bin/bash
|
||||||
|
|
||||||
ifndef SPACY_EXTRAS
|
ifndef SPACY_EXTRAS
|
||||||
override SPACY_EXTRAS = spacy-lookups-data==1.0.2 jieba spacy-pkuseg==0.0.28 sudachipy sudachidict_core pymorphy2
|
override SPACY_EXTRAS = spacy-lookups-data==1.0.3
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifndef PYVER
|
ifndef PYVER
|
||||||
override PYVER = 3.6
|
override PYVER = 3.8
|
||||||
endif
|
endif
|
||||||
|
|
||||||
VENV := ./env$(PYVER)
|
VENV := ./env$(PYVER)
|
||||||
|
|
|
@ -36,7 +36,7 @@ open-source software, released under the [MIT license](https://github.com/explos
|
||||||
## 📖 Documentation
|
## 📖 Documentation
|
||||||
|
|
||||||
| Documentation | |
|
| Documentation | |
|
||||||
| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------------------------- | ---------------------------------------------------------------------- |
|
||||||
| ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! |
|
| ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! |
|
||||||
| 📚 **[Usage Guides]** | How to use spaCy and its features. |
|
| 📚 **[Usage Guides]** | How to use spaCy and its features. |
|
||||||
| 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |
|
| 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |
|
||||||
|
@ -44,6 +44,7 @@ open-source software, released under the [MIT license](https://github.com/explos
|
||||||
| 🎛 **[API Reference]** | The detailed reference for spaCy's API. |
|
| 🎛 **[API Reference]** | The detailed reference for spaCy's API. |
|
||||||
| 📦 **[Models]** | Download trained pipelines for spaCy. |
|
| 📦 **[Models]** | Download trained pipelines for spaCy. |
|
||||||
| 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. |
|
| 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. |
|
||||||
|
| ⚙️ **[spaCy VS Code Extension]** | Additional tooling and features for working with spaCy's config files. |
|
||||||
| 👩🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
|
| 👩🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
|
||||||
| 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
|
| 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
|
||||||
| 🛠 **[Changelog]** | Changes and version history. |
|
| 🛠 **[Changelog]** | Changes and version history. |
|
||||||
|
@ -57,13 +58,13 @@ open-source software, released under the [MIT license](https://github.com/explos
|
||||||
[api reference]: https://spacy.io/api/
|
[api reference]: https://spacy.io/api/
|
||||||
[models]: https://spacy.io/models
|
[models]: https://spacy.io/models
|
||||||
[universe]: https://spacy.io/universe
|
[universe]: https://spacy.io/universe
|
||||||
|
[spaCy VS Code Extension]: https://github.com/explosion/spacy-vscode
|
||||||
[videos]: https://www.youtube.com/c/ExplosionAI
|
[videos]: https://www.youtube.com/c/ExplosionAI
|
||||||
[online course]: https://course.spacy.io
|
[online course]: https://course.spacy.io
|
||||||
[project templates]: https://github.com/explosion/projects
|
[project templates]: https://github.com/explosion/projects
|
||||||
[changelog]: https://spacy.io/usage#changelog
|
[changelog]: https://spacy.io/usage#changelog
|
||||||
[contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
|
[contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
|
||||||
|
|
||||||
|
|
||||||
## 💬 Where to ask questions
|
## 💬 Where to ask questions
|
||||||
|
|
||||||
The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).
|
The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).
|
||||||
|
|
|
@ -1,120 +0,0 @@
|
||||||
trigger:
|
|
||||||
batch: true
|
|
||||||
branches:
|
|
||||||
include:
|
|
||||||
- "*"
|
|
||||||
exclude:
|
|
||||||
- "spacy.io"
|
|
||||||
- "nightly.spacy.io"
|
|
||||||
- "v2.spacy.io"
|
|
||||||
paths:
|
|
||||||
exclude:
|
|
||||||
- "website/*"
|
|
||||||
- "*.md"
|
|
||||||
- "*.mdx"
|
|
||||||
- ".github/workflows/*"
|
|
||||||
pr:
|
|
||||||
paths:
|
|
||||||
exclude:
|
|
||||||
- "*.md"
|
|
||||||
- "*.mdx"
|
|
||||||
- "website/docs/*"
|
|
||||||
- "website/src/*"
|
|
||||||
- "website/meta/*.tsx"
|
|
||||||
- "website/meta/*.mjs"
|
|
||||||
- "website/meta/languages.json"
|
|
||||||
- "website/meta/site.json"
|
|
||||||
- "website/meta/sidebars.json"
|
|
||||||
- "website/meta/type-annotations.json"
|
|
||||||
- "website/pages/*"
|
|
||||||
- ".github/workflows/*"
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
# Check formatting and linting. Perform basic checks for most important errors
|
|
||||||
# (syntax etc.) Uses the config defined in setup.cfg and overwrites the
|
|
||||||
# selected codes.
|
|
||||||
- job: "Validate"
|
|
||||||
pool:
|
|
||||||
vmImage: "ubuntu-latest"
|
|
||||||
steps:
|
|
||||||
- task: UsePythonVersion@0
|
|
||||||
inputs:
|
|
||||||
versionSpec: "3.7"
|
|
||||||
- script: |
|
|
||||||
pip install black -c requirements.txt
|
|
||||||
python -m black spacy --check
|
|
||||||
displayName: "black"
|
|
||||||
- script: |
|
|
||||||
pip install flake8==5.0.4
|
|
||||||
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
|
|
||||||
displayName: "flake8"
|
|
||||||
- script: |
|
|
||||||
python .github/validate_universe_json.py website/meta/universe.json
|
|
||||||
displayName: 'Validate website/meta/universe.json'
|
|
||||||
|
|
||||||
- job: "Test"
|
|
||||||
dependsOn: "Validate"
|
|
||||||
strategy:
|
|
||||||
matrix:
|
|
||||||
# We're only running one platform per Python version to speed up builds
|
|
||||||
Python36Linux:
|
|
||||||
imageName: "ubuntu-20.04"
|
|
||||||
python.version: "3.6"
|
|
||||||
# Python36Windows:
|
|
||||||
# imageName: "windows-latest"
|
|
||||||
# python.version: "3.6"
|
|
||||||
# Python36Mac:
|
|
||||||
# imageName: "macos-latest"
|
|
||||||
# python.version: "3.6"
|
|
||||||
# Python37Linux:
|
|
||||||
# imageName: "ubuntu-20.04"
|
|
||||||
# python.version: "3.7"
|
|
||||||
Python37Windows:
|
|
||||||
imageName: "windows-latest"
|
|
||||||
python.version: "3.7"
|
|
||||||
# Python37Mac:
|
|
||||||
# imageName: "macos-latest"
|
|
||||||
# python.version: "3.7"
|
|
||||||
# Python38Linux:
|
|
||||||
# imageName: "ubuntu-latest"
|
|
||||||
# python.version: "3.8"
|
|
||||||
# Python38Windows:
|
|
||||||
# imageName: "windows-latest"
|
|
||||||
# python.version: "3.8"
|
|
||||||
Python38Mac:
|
|
||||||
imageName: "macos-latest"
|
|
||||||
python.version: "3.8"
|
|
||||||
Python39Linux:
|
|
||||||
imageName: "ubuntu-latest"
|
|
||||||
python.version: "3.9"
|
|
||||||
# Python39Windows:
|
|
||||||
# imageName: "windows-latest"
|
|
||||||
# python.version: "3.9"
|
|
||||||
# Python39Mac:
|
|
||||||
# imageName: "macos-latest"
|
|
||||||
# python.version: "3.9"
|
|
||||||
# Python310Linux:
|
|
||||||
# imageName: "ubuntu-latest"
|
|
||||||
# python.version: "3.10"
|
|
||||||
Python310Windows:
|
|
||||||
imageName: "windows-latest"
|
|
||||||
python.version: "3.10"
|
|
||||||
# Python310Mac:
|
|
||||||
# imageName: "macos-latest"
|
|
||||||
# python.version: "3.10"
|
|
||||||
Python311Linux:
|
|
||||||
imageName: 'ubuntu-latest'
|
|
||||||
python.version: '3.11'
|
|
||||||
Python311Windows:
|
|
||||||
imageName: 'windows-latest'
|
|
||||||
python.version: '3.11'
|
|
||||||
Python311Mac:
|
|
||||||
imageName: 'macos-latest'
|
|
||||||
python.version: '3.11'
|
|
||||||
maxParallel: 4
|
|
||||||
pool:
|
|
||||||
vmImage: $(imageName)
|
|
||||||
steps:
|
|
||||||
- template: .github/azure-steps.yml
|
|
||||||
parameters:
|
|
||||||
python_version: '$(python.version)'
|
|
|
@ -3,7 +3,4 @@ numpy==1.15.0; python_version<='3.7' and platform_machine!='aarch64'
|
||||||
numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64'
|
numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64'
|
||||||
numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
|
numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
|
||||||
numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
|
numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
|
||||||
numpy==1.19.3; python_version=='3.9'
|
numpy>=1.25.0; python_version>='3.9'
|
||||||
numpy==1.21.3; python_version=='3.10'
|
|
||||||
numpy==1.23.2; python_version=='3.11'
|
|
||||||
numpy; python_version>='3.12'
|
|
||||||
|
|
|
@ -1,14 +1,17 @@
|
||||||
# Listeners
|
# Listeners
|
||||||
|
|
||||||
1. [Overview](#1-overview)
|
- [1. Overview](#1-overview)
|
||||||
2. [Initialization](#2-initialization)
|
- [2. Initialization](#2-initialization)
|
||||||
- [A. Linking listeners to the embedding component](#2a-linking-listeners-to-the-embedding-component)
|
- [2A. Linking listeners to the embedding component](#2a-linking-listeners-to-the-embedding-component)
|
||||||
- [B. Shape inference](#2b-shape-inference)
|
- [2B. Shape inference](#2b-shape-inference)
|
||||||
3. [Internal communication](#3-internal-communication)
|
- [3. Internal communication](#3-internal-communication)
|
||||||
- [A. During prediction](#3a-during-prediction)
|
- [3A. During prediction](#3a-during-prediction)
|
||||||
- [B. During training](#3b-during-training)
|
- [3B. During training](#3b-during-training)
|
||||||
- [C. Frozen components](#3c-frozen-components)
|
- [Training with multiple listeners](#training-with-multiple-listeners)
|
||||||
4. [Replacing listener with standalone](#4-replacing-listener-with-standalone)
|
- [3C. Frozen components](#3c-frozen-components)
|
||||||
|
- [The Tok2Vec or Transformer is frozen](#the-tok2vec-or-transformer-is-frozen)
|
||||||
|
- [The upstream component is frozen](#the-upstream-component-is-frozen)
|
||||||
|
- [4. Replacing listener with standalone](#4-replacing-listener-with-standalone)
|
||||||
|
|
||||||
## 1. Overview
|
## 1. Overview
|
||||||
|
|
||||||
|
@ -218,3 +221,15 @@ new_model = tok2vec_model.attrs["replace_listener"](new_model)
|
||||||
The new config and model are then properly stored on the `nlp` object.
|
The new config and model are then properly stored on the `nlp` object.
|
||||||
Note that this functionality (running the replacement for a transformer listener) was broken prior to
|
Note that this functionality (running the replacement for a transformer listener) was broken prior to
|
||||||
`spacy-transformers` 1.0.5.
|
`spacy-transformers` 1.0.5.
|
||||||
|
|
||||||
|
In spaCy 3.7, `Language.replace_listeners` was updated to pass the following additional arguments to the `replace_listener` callback:
|
||||||
|
the listener to be replaced and the `tok2vec`/`transformer` pipe from which the new model was copied. To maintain backwards compatibility,
|
||||||
|
the method only passes these extra arguments for callbacks that support them:
|
||||||
|
|
||||||
|
```
|
||||||
|
def replace_listener_pre_37(copied_tok2vec_model):
|
||||||
|
...
|
||||||
|
|
||||||
|
def replace_listener_post_37(copied_tok2vec_model, replaced_listener, tok2vec_pipe):
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
|
@ -6,6 +6,10 @@ requires = [
|
||||||
"preshed>=3.0.2,<3.1.0",
|
"preshed>=3.0.2,<3.1.0",
|
||||||
"murmurhash>=0.28.0,<1.1.0",
|
"murmurhash>=0.28.0,<1.1.0",
|
||||||
"thinc>=8.1.8,<8.2.0",
|
"thinc>=8.1.8,<8.2.0",
|
||||||
"numpy>=1.15.0",
|
"numpy>=1.15.0; python_version < '3.9'",
|
||||||
|
"numpy>=1.25.0; python_version >= '3.9'",
|
||||||
]
|
]
|
||||||
build-backend = "setuptools.build_meta"
|
build-backend = "setuptools.build_meta"
|
||||||
|
|
||||||
|
[tool.isort]
|
||||||
|
profile = "black"
|
||||||
|
|
|
@ -9,11 +9,13 @@ murmurhash>=0.28.0,<1.1.0
|
||||||
wasabi>=0.9.1,<1.2.0
|
wasabi>=0.9.1,<1.2.0
|
||||||
srsly>=2.4.3,<3.0.0
|
srsly>=2.4.3,<3.0.0
|
||||||
catalogue>=2.0.6,<2.1.0
|
catalogue>=2.0.6,<2.1.0
|
||||||
typer>=0.3.0,<0.8.0
|
typer>=0.3.0,<0.10.0
|
||||||
pathy>=0.10.0
|
pathy>=0.10.0
|
||||||
smart-open>=5.2.1,<7.0.0
|
smart-open>=5.2.1,<7.0.0
|
||||||
|
weasel>=0.1.0,<0.2.0
|
||||||
# Third party dependencies
|
# Third party dependencies
|
||||||
numpy>=1.15.0
|
numpy>=1.15.0; python_version < "3.9"
|
||||||
|
numpy>=1.19.0; python_version >= "3.9"
|
||||||
requests>=2.13.0,<3.0.0
|
requests>=2.13.0,<3.0.0
|
||||||
tqdm>=4.38.0,<5.0.0
|
tqdm>=4.38.0,<5.0.0
|
||||||
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
|
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
|
||||||
|
@ -38,3 +40,5 @@ types-setuptools>=57.0.0
|
||||||
types-requests
|
types-requests
|
||||||
types-setuptools>=57.0.0
|
types-setuptools>=57.0.0
|
||||||
black==22.3.0
|
black==22.3.0
|
||||||
|
cython-lint>=0.15.0; python_version >= "3.7"
|
||||||
|
isort>=5.0,<6.0
|
||||||
|
|
13
setup.cfg
13
setup.cfg
|
@ -32,8 +32,13 @@ project_urls =
|
||||||
zip_safe = false
|
zip_safe = false
|
||||||
include_package_data = true
|
include_package_data = true
|
||||||
python_requires = >=3.6
|
python_requires = >=3.6
|
||||||
|
# NOTE: This section is superseded by pyproject.toml and will be removed in
|
||||||
|
# spaCy v4
|
||||||
setup_requires =
|
setup_requires =
|
||||||
cython>=0.25,<3.0
|
cython>=0.25,<3.0
|
||||||
|
# The newest supported pip for python 3.6 has bugs related to markers in
|
||||||
|
# this section, so this does not contain the same constraints as
|
||||||
|
# pyproject.toml
|
||||||
numpy>=1.15.0
|
numpy>=1.15.0
|
||||||
# We also need our Cython packages here to compile against
|
# We also need our Cython packages here to compile against
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
|
@ -51,12 +56,14 @@ install_requires =
|
||||||
wasabi>=0.9.1,<1.2.0
|
wasabi>=0.9.1,<1.2.0
|
||||||
srsly>=2.4.3,<3.0.0
|
srsly>=2.4.3,<3.0.0
|
||||||
catalogue>=2.0.6,<2.1.0
|
catalogue>=2.0.6,<2.1.0
|
||||||
|
weasel>=0.1.0,<0.2.0
|
||||||
# Third-party dependencies
|
# Third-party dependencies
|
||||||
typer>=0.3.0,<0.8.0
|
typer>=0.3.0,<0.10.0
|
||||||
pathy>=0.10.0
|
pathy>=0.10.0
|
||||||
smart-open>=5.2.1,<7.0.0
|
smart-open>=5.2.1,<7.0.0
|
||||||
tqdm>=4.38.0,<5.0.0
|
tqdm>=4.38.0,<5.0.0
|
||||||
numpy>=1.15.0
|
numpy>=1.15.0; python_version < "3.9"
|
||||||
|
numpy>=1.19.0; python_version >= "3.9"
|
||||||
requests>=2.13.0,<3.0.0
|
requests>=2.13.0,<3.0.0
|
||||||
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
|
pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0
|
||||||
jinja2
|
jinja2
|
||||||
|
@ -75,8 +82,6 @@ lookups =
|
||||||
spacy_lookups_data>=1.0.3,<1.1.0
|
spacy_lookups_data>=1.0.3,<1.1.0
|
||||||
transformers =
|
transformers =
|
||||||
spacy_transformers>=1.1.2,<1.3.0
|
spacy_transformers>=1.1.2,<1.3.0
|
||||||
ray =
|
|
||||||
spacy_ray>=0.1.0,<1.0.0
|
|
||||||
cuda =
|
cuda =
|
||||||
cupy>=5.0.0b4,<13.0.0
|
cupy>=5.0.0b4,<13.0.0
|
||||||
cuda80 =
|
cuda80 =
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
from typing import Union, Iterable, Dict, Any
|
|
||||||
from pathlib import Path
|
|
||||||
import sys
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, Iterable, Union
|
||||||
|
|
||||||
# set library-specific custom warning handling before doing anything else
|
# set library-specific custom warning handling before doing anything else
|
||||||
from .errors import setup_default_warnings
|
from .errors import setup_default_warnings
|
||||||
|
@ -8,20 +8,17 @@ from .errors import setup_default_warnings
|
||||||
setup_default_warnings() # noqa: E402
|
setup_default_warnings() # noqa: E402
|
||||||
|
|
||||||
# These are imported as part of the API
|
# These are imported as part of the API
|
||||||
from thinc.api import prefer_gpu, require_gpu, require_cpu # noqa: F401
|
from thinc.api import Config, prefer_gpu, require_cpu, require_gpu # noqa: F401
|
||||||
from thinc.api import Config
|
|
||||||
|
|
||||||
from . import pipeline # noqa: F401
|
from . import pipeline # noqa: F401
|
||||||
from .cli.info import info # noqa: F401
|
|
||||||
from .glossary import explain # noqa: F401
|
|
||||||
from .about import __version__ # noqa: F401
|
|
||||||
from .util import registry, logger # noqa: F401
|
|
||||||
|
|
||||||
from .errors import Errors
|
|
||||||
from .language import Language
|
|
||||||
from .vocab import Vocab
|
|
||||||
from . import util
|
from . import util
|
||||||
|
from .about import __version__ # noqa: F401
|
||||||
|
from .cli.info import info # noqa: F401
|
||||||
|
from .errors import Errors
|
||||||
|
from .glossary import explain # noqa: F401
|
||||||
|
from .language import Language
|
||||||
|
from .util import logger, registry # noqa: F401
|
||||||
|
from .vocab import Vocab
|
||||||
|
|
||||||
if sys.maxunicode == 65535:
|
if sys.maxunicode == 65535:
|
||||||
raise SystemError(Errors.E130)
|
raise SystemError(Errors.E130)
|
||||||
|
|
|
@ -1,7 +1,5 @@
|
||||||
# fmt: off
|
# fmt: off
|
||||||
__title__ = "spacy"
|
__title__ = "spacy"
|
||||||
__version__ = "3.5.0"
|
__version__ = "3.7.0.dev0"
|
||||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||||
__projects__ = "https://github.com/explosion/projects"
|
|
||||||
__projects_branch__ = "v3"
|
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
# Reserve 64 values for flag features
|
# Reserve 64 values for flag features
|
||||||
from . cimport symbols
|
from . cimport symbols
|
||||||
|
|
||||||
|
|
||||||
cdef enum attr_id_t:
|
cdef enum attr_id_t:
|
||||||
NULL_ATTR
|
NULL_ATTR
|
||||||
IS_ALPHA
|
IS_ALPHA
|
||||||
|
|
|
@ -117,7 +117,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
|
||||||
if "pos" in stringy_attrs:
|
if "pos" in stringy_attrs:
|
||||||
stringy_attrs["TAG"] = stringy_attrs.pop("pos")
|
stringy_attrs["TAG"] = stringy_attrs.pop("pos")
|
||||||
if "morph" in stringy_attrs:
|
if "morph" in stringy_attrs:
|
||||||
morphs = stringy_attrs.pop("morph")
|
morphs = stringy_attrs.pop("morph") # no-cython-lint
|
||||||
if "number" in stringy_attrs:
|
if "number" in stringy_attrs:
|
||||||
stringy_attrs.pop("number")
|
stringy_attrs.pop("number")
|
||||||
if "tenspect" in stringy_attrs:
|
if "tenspect" in stringy_attrs:
|
||||||
|
|
|
@ -1,35 +1,28 @@
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
|
|
||||||
from ._util import app, setup_cli # noqa: F401
|
from ._util import app, setup_cli # noqa: F401
|
||||||
|
from .apply import apply # noqa: F401
|
||||||
|
from .assemble import assemble_cli # noqa: F401
|
||||||
|
|
||||||
# These are the actual functions, NOT the wrapped CLI commands. The CLI commands
|
# These are the actual functions, NOT the wrapped CLI commands. The CLI commands
|
||||||
# are registered automatically and won't have to be imported here.
|
# are registered automatically and won't have to be imported here.
|
||||||
from .benchmark_speed import benchmark_speed_cli # noqa: F401
|
from .benchmark_speed import benchmark_speed_cli # noqa: F401
|
||||||
|
from .convert import convert # noqa: F401
|
||||||
|
from .debug_config import debug_config # noqa: F401
|
||||||
|
from .debug_data import debug_data # noqa: F401
|
||||||
|
from .debug_diff import debug_diff # noqa: F401
|
||||||
|
from .debug_model import debug_model # noqa: F401
|
||||||
from .download import download # noqa: F401
|
from .download import download # noqa: F401
|
||||||
|
from .evaluate import evaluate # noqa: F401
|
||||||
|
from .find_threshold import find_threshold # noqa: F401
|
||||||
from .info import info # noqa: F401
|
from .info import info # noqa: F401
|
||||||
|
from .init_config import fill_config, init_config # noqa: F401
|
||||||
|
from .init_pipeline import init_pipeline_cli # noqa: F401
|
||||||
from .package import package # noqa: F401
|
from .package import package # noqa: F401
|
||||||
|
from .pretrain import pretrain # noqa: F401
|
||||||
from .profile import profile # noqa: F401
|
from .profile import profile # noqa: F401
|
||||||
from .train import train_cli # noqa: F401
|
from .train import train_cli # noqa: F401
|
||||||
from .assemble import assemble_cli # noqa: F401
|
|
||||||
from .pretrain import pretrain # noqa: F401
|
|
||||||
from .debug_data import debug_data # noqa: F401
|
|
||||||
from .debug_config import debug_config # noqa: F401
|
|
||||||
from .debug_model import debug_model # noqa: F401
|
|
||||||
from .debug_diff import debug_diff # noqa: F401
|
|
||||||
from .evaluate import evaluate # noqa: F401
|
|
||||||
from .apply import apply # noqa: F401
|
|
||||||
from .convert import convert # noqa: F401
|
|
||||||
from .init_pipeline import init_pipeline_cli # noqa: F401
|
|
||||||
from .init_config import init_config, fill_config # noqa: F401
|
|
||||||
from .validate import validate # noqa: F401
|
from .validate import validate # noqa: F401
|
||||||
from .project.clone import project_clone # noqa: F401
|
|
||||||
from .project.assets import project_assets # noqa: F401
|
|
||||||
from .project.run import project_run # noqa: F401
|
|
||||||
from .project.dvc import project_update_dvc # noqa: F401
|
|
||||||
from .project.push import project_push # noqa: F401
|
|
||||||
from .project.pull import project_pull # noqa: F401
|
|
||||||
from .project.document import project_document # noqa: F401
|
|
||||||
from .find_threshold import find_threshold # noqa: F401
|
|
||||||
|
|
||||||
|
|
||||||
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
|
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
|
||||||
|
|
|
@ -1,26 +1,45 @@
|
||||||
from typing import Dict, Any, Union, List, Optional, Tuple, Iterable
|
|
||||||
from typing import TYPE_CHECKING, overload
|
|
||||||
import sys
|
|
||||||
import shutil
|
|
||||||
from pathlib import Path
|
|
||||||
from wasabi import msg, Printer
|
|
||||||
import srsly
|
|
||||||
import hashlib
|
import hashlib
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import sys
|
||||||
|
from configparser import InterpolationError
|
||||||
|
from contextlib import contextmanager
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import (
|
||||||
|
TYPE_CHECKING,
|
||||||
|
Any,
|
||||||
|
Dict,
|
||||||
|
Iterable,
|
||||||
|
List,
|
||||||
|
Optional,
|
||||||
|
Tuple,
|
||||||
|
Union,
|
||||||
|
overload,
|
||||||
|
)
|
||||||
|
|
||||||
|
import srsly
|
||||||
import typer
|
import typer
|
||||||
from click import NoSuchOption
|
from click import NoSuchOption
|
||||||
from click.parser import split_arg_string
|
from click.parser import split_arg_string
|
||||||
from typer.main import get_command
|
|
||||||
from contextlib import contextmanager
|
|
||||||
from thinc.api import Config, ConfigValidationError, require_gpu
|
from thinc.api import Config, ConfigValidationError, require_gpu
|
||||||
from thinc.util import gpu_is_available
|
from thinc.util import gpu_is_available
|
||||||
from configparser import InterpolationError
|
from typer.main import get_command
|
||||||
import os
|
from wasabi import Printer, msg
|
||||||
|
from weasel import app as project_cli
|
||||||
|
|
||||||
from ..compat import Literal
|
|
||||||
from ..schemas import ProjectConfigSchema, validate
|
|
||||||
from ..util import import_file, run_command, make_tempdir, registry, logger
|
|
||||||
from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
|
|
||||||
from .. import about
|
from .. import about
|
||||||
|
from ..compat import Literal
|
||||||
|
from ..schemas import validate
|
||||||
|
from ..util import (
|
||||||
|
ENV_VARS,
|
||||||
|
SimpleFrozenDict,
|
||||||
|
import_file,
|
||||||
|
is_compatible_version,
|
||||||
|
logger,
|
||||||
|
make_tempdir,
|
||||||
|
registry,
|
||||||
|
run_command,
|
||||||
|
)
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from pathy import FluidPath # noqa: F401
|
from pathy import FluidPath # noqa: F401
|
||||||
|
@ -30,7 +49,6 @@ SDIST_SUFFIX = ".tar.gz"
|
||||||
WHEEL_SUFFIX = "-py3-none-any.whl"
|
WHEEL_SUFFIX = "-py3-none-any.whl"
|
||||||
|
|
||||||
PROJECT_FILE = "project.yml"
|
PROJECT_FILE = "project.yml"
|
||||||
PROJECT_LOCK = "project.lock"
|
|
||||||
COMMAND = "python -m spacy"
|
COMMAND = "python -m spacy"
|
||||||
NAME = "spacy"
|
NAME = "spacy"
|
||||||
HELP = """spaCy Command-line Interface
|
HELP = """spaCy Command-line Interface
|
||||||
|
@ -56,11 +74,10 @@ Opt = typer.Option
|
||||||
|
|
||||||
app = typer.Typer(name=NAME, help=HELP)
|
app = typer.Typer(name=NAME, help=HELP)
|
||||||
benchmark_cli = typer.Typer(name="benchmark", help=BENCHMARK_HELP, no_args_is_help=True)
|
benchmark_cli = typer.Typer(name="benchmark", help=BENCHMARK_HELP, no_args_is_help=True)
|
||||||
project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
|
|
||||||
debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True)
|
debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True)
|
||||||
init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True)
|
init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True)
|
||||||
|
|
||||||
app.add_typer(project_cli)
|
app.add_typer(project_cli, name="project", help=PROJECT_HELP, no_args_is_help=True)
|
||||||
app.add_typer(debug_cli)
|
app.add_typer(debug_cli)
|
||||||
app.add_typer(benchmark_cli)
|
app.add_typer(benchmark_cli)
|
||||||
app.add_typer(init_cli)
|
app.add_typer(init_cli)
|
||||||
|
@ -135,148 +152,6 @@ def _parse_override(value: Any) -> Any:
|
||||||
return str(value)
|
return str(value)
|
||||||
|
|
||||||
|
|
||||||
def load_project_config(
|
|
||||||
path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict()
|
|
||||||
) -> Dict[str, Any]:
|
|
||||||
"""Load the project.yml file from a directory and validate it. Also make
|
|
||||||
sure that all directories defined in the config exist.
|
|
||||||
|
|
||||||
path (Path): The path to the project directory.
|
|
||||||
interpolate (bool): Whether to substitute project variables.
|
|
||||||
overrides (Dict[str, Any]): Optional config overrides.
|
|
||||||
RETURNS (Dict[str, Any]): The loaded project.yml.
|
|
||||||
"""
|
|
||||||
config_path = path / PROJECT_FILE
|
|
||||||
if not config_path.exists():
|
|
||||||
msg.fail(f"Can't find {PROJECT_FILE}", config_path, exits=1)
|
|
||||||
invalid_err = f"Invalid {PROJECT_FILE}. Double-check that the YAML is correct."
|
|
||||||
try:
|
|
||||||
config = srsly.read_yaml(config_path)
|
|
||||||
except ValueError as e:
|
|
||||||
msg.fail(invalid_err, e, exits=1)
|
|
||||||
errors = validate(ProjectConfigSchema, config)
|
|
||||||
if errors:
|
|
||||||
msg.fail(invalid_err)
|
|
||||||
print("\n".join(errors))
|
|
||||||
sys.exit(1)
|
|
||||||
validate_project_version(config)
|
|
||||||
validate_project_commands(config)
|
|
||||||
if interpolate:
|
|
||||||
err = f"{PROJECT_FILE} validation error"
|
|
||||||
with show_validation_error(title=err, hint_fill=False):
|
|
||||||
config = substitute_project_variables(config, overrides)
|
|
||||||
# Make sure directories defined in config exist
|
|
||||||
for subdir in config.get("directories", []):
|
|
||||||
dir_path = path / subdir
|
|
||||||
if not dir_path.exists():
|
|
||||||
dir_path.mkdir(parents=True)
|
|
||||||
return config
|
|
||||||
|
|
||||||
|
|
||||||
def substitute_project_variables(
|
|
||||||
config: Dict[str, Any],
|
|
||||||
overrides: Dict[str, Any] = SimpleFrozenDict(),
|
|
||||||
key: str = "vars",
|
|
||||||
env_key: str = "env",
|
|
||||||
) -> Dict[str, Any]:
|
|
||||||
"""Interpolate variables in the project file using the config system.
|
|
||||||
|
|
||||||
config (Dict[str, Any]): The project config.
|
|
||||||
overrides (Dict[str, Any]): Optional config overrides.
|
|
||||||
key (str): Key containing variables in project config.
|
|
||||||
env_key (str): Key containing environment variable mapping in project config.
|
|
||||||
RETURNS (Dict[str, Any]): The interpolated project config.
|
|
||||||
"""
|
|
||||||
config.setdefault(key, {})
|
|
||||||
config.setdefault(env_key, {})
|
|
||||||
# Substitute references to env vars with their values
|
|
||||||
for config_var, env_var in config[env_key].items():
|
|
||||||
config[env_key][config_var] = _parse_override(os.environ.get(env_var, ""))
|
|
||||||
# Need to put variables in the top scope again so we can have a top-level
|
|
||||||
# section "project" (otherwise, a list of commands in the top scope wouldn't
|
|
||||||
# be allowed by Thinc's config system)
|
|
||||||
cfg = Config({"project": config, key: config[key], env_key: config[env_key]})
|
|
||||||
cfg = Config().from_str(cfg.to_str(), overrides=overrides)
|
|
||||||
interpolated = cfg.interpolate()
|
|
||||||
return dict(interpolated["project"])
|
|
||||||
|
|
||||||
|
|
||||||
def validate_project_version(config: Dict[str, Any]) -> None:
|
|
||||||
"""If the project defines a compatible spaCy version range, check that it's
|
|
||||||
compatible with the current version of spaCy.
|
|
||||||
|
|
||||||
config (Dict[str, Any]): The loaded config.
|
|
||||||
"""
|
|
||||||
spacy_version = config.get("spacy_version", None)
|
|
||||||
if spacy_version and not is_compatible_version(about.__version__, spacy_version):
|
|
||||||
err = (
|
|
||||||
f"The {PROJECT_FILE} specifies a spaCy version range ({spacy_version}) "
|
|
||||||
f"that's not compatible with the version of spaCy you're running "
|
|
||||||
f"({about.__version__}). You can edit version requirement in the "
|
|
||||||
f"{PROJECT_FILE} to load it, but the project may not run as expected."
|
|
||||||
)
|
|
||||||
msg.fail(err, exits=1)
|
|
||||||
|
|
||||||
|
|
||||||
def validate_project_commands(config: Dict[str, Any]) -> None:
|
|
||||||
"""Check that project commands and workflows are valid, don't contain
|
|
||||||
duplicates, don't clash and only refer to commands that exist.
|
|
||||||
|
|
||||||
config (Dict[str, Any]): The loaded config.
|
|
||||||
"""
|
|
||||||
command_names = [cmd["name"] for cmd in config.get("commands", [])]
|
|
||||||
workflows = config.get("workflows", {})
|
|
||||||
duplicates = set([cmd for cmd in command_names if command_names.count(cmd) > 1])
|
|
||||||
if duplicates:
|
|
||||||
err = f"Duplicate commands defined in {PROJECT_FILE}: {', '.join(duplicates)}"
|
|
||||||
msg.fail(err, exits=1)
|
|
||||||
for workflow_name, workflow_steps in workflows.items():
|
|
||||||
if workflow_name in command_names:
|
|
||||||
err = f"Can't use workflow name '{workflow_name}': name already exists as a command"
|
|
||||||
msg.fail(err, exits=1)
|
|
||||||
for step in workflow_steps:
|
|
||||||
if step not in command_names:
|
|
||||||
msg.fail(
|
|
||||||
f"Unknown command specified in workflow '{workflow_name}': {step}",
|
|
||||||
f"Workflows can only refer to commands defined in the 'commands' "
|
|
||||||
f"section of the {PROJECT_FILE}.",
|
|
||||||
exits=1,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def get_hash(data, exclude: Iterable[str] = tuple()) -> str:
|
|
||||||
"""Get the hash for a JSON-serializable object.
|
|
||||||
|
|
||||||
data: The data to hash.
|
|
||||||
exclude (Iterable[str]): Top-level keys to exclude if data is a dict.
|
|
||||||
RETURNS (str): The hash.
|
|
||||||
"""
|
|
||||||
if isinstance(data, dict):
|
|
||||||
data = {k: v for k, v in data.items() if k not in exclude}
|
|
||||||
data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8")
|
|
||||||
return hashlib.md5(data_str).hexdigest()
|
|
||||||
|
|
||||||
|
|
||||||
def get_checksum(path: Union[Path, str]) -> str:
|
|
||||||
"""Get the checksum for a file or directory given its file path. If a
|
|
||||||
directory path is provided, this uses all files in that directory.
|
|
||||||
|
|
||||||
path (Union[Path, str]): The file or directory path.
|
|
||||||
RETURNS (str): The checksum.
|
|
||||||
"""
|
|
||||||
path = Path(path)
|
|
||||||
if not (path.is_file() or path.is_dir()):
|
|
||||||
msg.fail(f"Can't get checksum for {path}: not a file or directory", exits=1)
|
|
||||||
if path.is_file():
|
|
||||||
return hashlib.md5(Path(path).read_bytes()).hexdigest()
|
|
||||||
else:
|
|
||||||
# TODO: this is currently pretty slow
|
|
||||||
dir_checksum = hashlib.md5()
|
|
||||||
for sub_file in sorted(fp for fp in path.rglob("*") if fp.is_file()):
|
|
||||||
dir_checksum.update(sub_file.read_bytes())
|
|
||||||
return dir_checksum.hexdigest()
|
|
||||||
|
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
def show_validation_error(
|
def show_validation_error(
|
||||||
file_path: Optional[Union[str, Path]] = None,
|
file_path: Optional[Union[str, Path]] = None,
|
||||||
|
@ -334,166 +209,10 @@ def import_code(code_path: Optional[Union[Path, str]]) -> None:
|
||||||
msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
|
msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
|
||||||
|
|
||||||
|
|
||||||
def upload_file(src: Path, dest: Union[str, "FluidPath"]) -> None:
|
|
||||||
"""Upload a file.
|
|
||||||
|
|
||||||
src (Path): The source path.
|
|
||||||
dest (Union[str, FluidPath]): The destination path or URL to upload to.
|
|
||||||
"""
|
|
||||||
import smart_open
|
|
||||||
|
|
||||||
# Create parent directories for local paths
|
|
||||||
if isinstance(dest, Path):
|
|
||||||
if not dest.parent.exists():
|
|
||||||
dest.parent.mkdir(parents=True)
|
|
||||||
|
|
||||||
dest = str(dest)
|
|
||||||
with smart_open.open(dest, mode="wb") as output_file:
|
|
||||||
with src.open(mode="rb") as input_file:
|
|
||||||
output_file.write(input_file.read())
|
|
||||||
|
|
||||||
|
|
||||||
def download_file(
|
|
||||||
src: Union[str, "FluidPath"], dest: Path, *, force: bool = False
|
|
||||||
) -> None:
|
|
||||||
"""Download a file using smart_open.
|
|
||||||
|
|
||||||
src (Union[str, FluidPath]): The source URL or path of the file.
|
|
||||||
dest (Path): The destination path.
|
|
||||||
force (bool): Whether to force download even if file exists.
|
|
||||||
If False, the download will be skipped.
|
|
||||||
"""
|
|
||||||
import smart_open
|
|
||||||
|
|
||||||
if dest.exists() and not force:
|
|
||||||
return None
|
|
||||||
src = str(src)
|
|
||||||
with smart_open.open(src, mode="rb", compression="disable") as input_file:
|
|
||||||
with dest.open(mode="wb") as output_file:
|
|
||||||
shutil.copyfileobj(input_file, output_file)
|
|
||||||
|
|
||||||
|
|
||||||
def ensure_pathy(path):
|
|
||||||
"""Temporary helper to prevent importing Pathy globally (which can cause
|
|
||||||
slow and annoying Google Cloud warning)."""
|
|
||||||
from pathy import Pathy # noqa: F811
|
|
||||||
|
|
||||||
return Pathy.fluid(path)
|
|
||||||
|
|
||||||
|
|
||||||
def git_checkout(
|
|
||||||
repo: str, subpath: str, dest: Path, *, branch: str = "master", sparse: bool = False
|
|
||||||
):
|
|
||||||
git_version = get_git_version()
|
|
||||||
if dest.exists():
|
|
||||||
msg.fail("Destination of checkout must not exist", exits=1)
|
|
||||||
if not dest.parent.exists():
|
|
||||||
msg.fail("Parent of destination of checkout must exist", exits=1)
|
|
||||||
if sparse and git_version >= (2, 22):
|
|
||||||
return git_sparse_checkout(repo, subpath, dest, branch)
|
|
||||||
elif sparse:
|
|
||||||
# Only show warnings if the user explicitly wants sparse checkout but
|
|
||||||
# the Git version doesn't support it
|
|
||||||
err_old = (
|
|
||||||
f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
|
|
||||||
f"that doesn't fully support sparse checkout yet."
|
|
||||||
)
|
|
||||||
err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled."
|
|
||||||
msg.warn(
|
|
||||||
f"{err_unk if git_version == (0, 0) else err_old} "
|
|
||||||
f"This means that more files than necessary may be downloaded "
|
|
||||||
f"temporarily. To only download the files needed, make sure "
|
|
||||||
f"you're using Git v2.22 or above."
|
|
||||||
)
|
|
||||||
with make_tempdir() as tmp_dir:
|
|
||||||
cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}"
|
|
||||||
run_command(cmd, capture=True)
|
|
||||||
# We need Path(name) to make sure we also support subdirectories
|
|
||||||
try:
|
|
||||||
source_path = tmp_dir / Path(subpath)
|
|
||||||
if not is_subpath_of(tmp_dir, source_path):
|
|
||||||
err = f"'{subpath}' is a path outside of the cloned repository."
|
|
||||||
msg.fail(err, repo, exits=1)
|
|
||||||
shutil.copytree(str(source_path), str(dest))
|
|
||||||
except FileNotFoundError:
|
|
||||||
err = f"Can't clone {subpath}. Make sure the directory exists in the repo (branch '{branch}')"
|
|
||||||
msg.fail(err, repo, exits=1)
|
|
||||||
|
|
||||||
|
|
||||||
def git_sparse_checkout(repo, subpath, dest, branch):
|
|
||||||
# We're using Git, partial clone and sparse checkout to
|
|
||||||
# only clone the files we need
|
|
||||||
# This ends up being RIDICULOUS. omg.
|
|
||||||
# So, every tutorial and SO post talks about 'sparse checkout'...But they
|
|
||||||
# go and *clone* the whole repo. Worthless. And cloning part of a repo
|
|
||||||
# turns out to be completely broken. The only way to specify a "path" is..
|
|
||||||
# a path *on the server*? The contents of which, specifies the paths. Wat.
|
|
||||||
# Obviously this is hopelessly broken and insecure, because you can query
|
|
||||||
# arbitrary paths on the server! So nobody enables this.
|
|
||||||
# What we have to do is disable *all* files. We could then just checkout
|
|
||||||
# the path, and it'd "work", but be hopelessly slow...Because it goes and
|
|
||||||
# transfers every missing object one-by-one. So the final piece is that we
|
|
||||||
# need to use some weird git internals to fetch the missings in bulk, and
|
|
||||||
# *that* we can do by path.
|
|
||||||
# We're using Git and sparse checkout to only clone the files we need
|
|
||||||
with make_tempdir() as tmp_dir:
|
|
||||||
# This is the "clone, but don't download anything" part.
|
|
||||||
cmd = (
|
|
||||||
f"git clone {repo} {tmp_dir} --no-checkout --depth 1 "
|
|
||||||
f"-b {branch} --filter=blob:none"
|
|
||||||
)
|
|
||||||
run_command(cmd)
|
|
||||||
# Now we need to find the missing filenames for the subpath we want.
|
|
||||||
# Looking for this 'rev-list' command in the git --help? Hah.
|
|
||||||
cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
|
|
||||||
ret = run_command(cmd, capture=True)
|
|
||||||
git_repo = _http_to_git(repo)
|
|
||||||
# Now pass those missings into another bit of git internals
|
|
||||||
missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
|
|
||||||
if not missings:
|
|
||||||
err = (
|
|
||||||
f"Could not find any relevant files for '{subpath}'. "
|
|
||||||
f"Did you specify a correct and complete path within repo '{repo}' "
|
|
||||||
f"and branch {branch}?"
|
|
||||||
)
|
|
||||||
msg.fail(err, exits=1)
|
|
||||||
cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
|
|
||||||
run_command(cmd, capture=True)
|
|
||||||
# And finally, we can checkout our subpath
|
|
||||||
cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
|
|
||||||
run_command(cmd, capture=True)
|
|
||||||
|
|
||||||
# Get a subdirectory of the cloned path, if appropriate
|
|
||||||
source_path = tmp_dir / Path(subpath)
|
|
||||||
if not is_subpath_of(tmp_dir, source_path):
|
|
||||||
err = f"'{subpath}' is a path outside of the cloned repository."
|
|
||||||
msg.fail(err, repo, exits=1)
|
|
||||||
|
|
||||||
shutil.move(str(source_path), str(dest))
|
|
||||||
|
|
||||||
|
|
||||||
def git_repo_branch_exists(repo: str, branch: str) -> bool:
|
|
||||||
"""Uses 'git ls-remote' to check if a repository and branch exist
|
|
||||||
|
|
||||||
repo (str): URL to get repo.
|
|
||||||
branch (str): Branch on repo to check.
|
|
||||||
RETURNS (bool): True if repo:branch exists.
|
|
||||||
"""
|
|
||||||
get_git_version()
|
|
||||||
cmd = f"git ls-remote {repo} {branch}"
|
|
||||||
# We might be tempted to use `--exit-code` with `git ls-remote`, but
|
|
||||||
# `run_command` handles the `returncode` for us, so we'll rely on
|
|
||||||
# the fact that stdout returns '' if the requested branch doesn't exist
|
|
||||||
ret = run_command(cmd, capture=True)
|
|
||||||
exists = ret.stdout != ""
|
|
||||||
return exists
|
|
||||||
|
|
||||||
|
|
||||||
def get_git_version(
|
def get_git_version(
|
||||||
error: str = "Could not run 'git'. Make sure it's installed and the executable is available.",
|
error: str = "Could not run 'git'. Make sure it's installed and the executable is available.",
|
||||||
) -> Tuple[int, int]:
|
) -> Tuple[int, int]:
|
||||||
"""Get the version of git and raise an error if calling 'git --version' fails.
|
"""Get the version of git and raise an error if calling 'git --version' fails.
|
||||||
|
|
||||||
error (str): The error message to show.
|
error (str): The error message to show.
|
||||||
RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns
|
RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns
|
||||||
(0, 0) if the version couldn't be determined.
|
(0, 0) if the version couldn't be determined.
|
||||||
|
@ -509,30 +228,6 @@ def get_git_version(
|
||||||
return int(version[0]), int(version[1])
|
return int(version[0]), int(version[1])
|
||||||
|
|
||||||
|
|
||||||
def _http_to_git(repo: str) -> str:
|
|
||||||
if repo.startswith("http://"):
|
|
||||||
repo = repo.replace(r"http://", r"https://")
|
|
||||||
if repo.startswith(r"https://"):
|
|
||||||
repo = repo.replace("https://", "git@").replace("/", ":", 1)
|
|
||||||
if repo.endswith("/"):
|
|
||||||
repo = repo[:-1]
|
|
||||||
repo = f"{repo}.git"
|
|
||||||
return repo
|
|
||||||
|
|
||||||
|
|
||||||
def is_subpath_of(parent, child):
|
|
||||||
"""
|
|
||||||
Check whether `child` is a path contained within `parent`.
|
|
||||||
"""
|
|
||||||
# Based on https://stackoverflow.com/a/37095733 .
|
|
||||||
|
|
||||||
# In Python 3.9, the `Path.is_relative_to()` method will supplant this, so
|
|
||||||
# we can stop using crusty old os.path functions.
|
|
||||||
parent_realpath = os.path.realpath(parent)
|
|
||||||
child_realpath = os.path.realpath(child)
|
|
||||||
return os.path.commonpath([parent_realpath, child_realpath]) == parent_realpath
|
|
||||||
|
|
||||||
|
|
||||||
@overload
|
@overload
|
||||||
def string_to_list(value: str, intify: Literal[False] = ...) -> List[str]:
|
def string_to_list(value: str, intify: Literal[False] = ...) -> List[str]:
|
||||||
...
|
...
|
||||||
|
|
|
@ -1,18 +1,15 @@
|
||||||
import tqdm
|
|
||||||
import srsly
|
|
||||||
|
|
||||||
from itertools import chain
|
from itertools import chain
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional, List, Iterable, cast, Union
|
from typing import Iterable, List, Optional, Union, cast
|
||||||
|
|
||||||
|
import srsly
|
||||||
|
import tqdm
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
|
|
||||||
from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory
|
|
||||||
|
|
||||||
from ..tokens import Doc, DocBin
|
from ..tokens import Doc, DocBin
|
||||||
from ..vocab import Vocab
|
|
||||||
from ..util import ensure_path, load_model
|
from ..util import ensure_path, load_model
|
||||||
|
from ..vocab import Vocab
|
||||||
|
from ._util import Arg, Opt, app, import_code, setup_gpu, walk_directory
|
||||||
|
|
||||||
path_help = """Location of the documents to predict on.
|
path_help = """Location of the documents to predict on.
|
||||||
Can be a single file in .spacy format or a .jsonl file.
|
Can be a single file in .spacy format or a .jsonl file.
|
||||||
|
|
|
@ -1,13 +1,20 @@
|
||||||
from typing import Optional
|
|
||||||
from pathlib import Path
|
|
||||||
from wasabi import msg
|
|
||||||
import typer
|
|
||||||
import logging
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import typer
|
||||||
|
from wasabi import msg
|
||||||
|
|
||||||
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
|
|
||||||
from ._util import import_code
|
|
||||||
from .. import util
|
from .. import util
|
||||||
from ..util import get_sourced_components, load_model_from_config
|
from ..util import get_sourced_components, load_model_from_config
|
||||||
|
from ._util import (
|
||||||
|
Arg,
|
||||||
|
Opt,
|
||||||
|
app,
|
||||||
|
import_code,
|
||||||
|
parse_config_overrides,
|
||||||
|
show_validation_error,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@app.command(
|
@app.command(
|
||||||
|
|
|
@ -1,11 +1,12 @@
|
||||||
from typing import Iterable, List, Optional
|
|
||||||
import random
|
import random
|
||||||
from itertools import islice
|
|
||||||
import numpy
|
|
||||||
from pathlib import Path
|
|
||||||
import time
|
import time
|
||||||
from tqdm import tqdm
|
from itertools import islice
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Iterable, List, Optional
|
||||||
|
|
||||||
|
import numpy
|
||||||
import typer
|
import typer
|
||||||
|
from tqdm import tqdm
|
||||||
from wasabi import msg
|
from wasabi import msg
|
||||||
|
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
|
@ -1,18 +1,22 @@
|
||||||
from typing import Callable, Iterable, Mapping, Optional, Any, Union
|
import itertools
|
||||||
from enum import Enum
|
|
||||||
from pathlib import Path
|
|
||||||
from wasabi import Printer
|
|
||||||
import srsly
|
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import itertools
|
from enum import Enum
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Callable, Iterable, Mapping, Optional, Union
|
||||||
|
|
||||||
|
import srsly
|
||||||
|
from wasabi import Printer
|
||||||
|
|
||||||
from ._util import app, Arg, Opt, walk_directory
|
|
||||||
from ..training import docs_to_json
|
|
||||||
from ..tokens import Doc, DocBin
|
from ..tokens import Doc, DocBin
|
||||||
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
|
from ..training import docs_to_json
|
||||||
from ..training.converters import conllu_to_docs
|
from ..training.converters import (
|
||||||
|
conll_ner_to_docs,
|
||||||
|
conllu_to_docs,
|
||||||
|
iob_to_docs,
|
||||||
|
json_to_docs,
|
||||||
|
)
|
||||||
|
from ._util import Arg, Opt, app, walk_directory
|
||||||
|
|
||||||
# Converters are matched by file extension except for ner/iob, which are
|
# Converters are matched by file extension except for ner/iob, which are
|
||||||
# matched by file extension and content. To add a converter, add a new
|
# matched by file extension and content. To add a converter, add a new
|
||||||
|
|
|
@ -1,15 +1,22 @@
|
||||||
from typing import Optional, Dict, Any, Union, List
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import msg, table
|
from typing import Any, Dict, List, Optional, Union
|
||||||
|
|
||||||
|
import typer
|
||||||
from thinc.api import Config
|
from thinc.api import Config
|
||||||
from thinc.config import VARIABLE_RE
|
from thinc.config import VARIABLE_RE
|
||||||
import typer
|
from wasabi import msg, table
|
||||||
|
|
||||||
from ._util import Arg, Opt, show_validation_error, parse_config_overrides
|
from .. import util
|
||||||
from ._util import import_code, debug_cli
|
|
||||||
from ..schemas import ConfigSchemaInit, ConfigSchemaTraining
|
from ..schemas import ConfigSchemaInit, ConfigSchemaTraining
|
||||||
from ..util import registry
|
from ..util import registry
|
||||||
from .. import util
|
from ._util import (
|
||||||
|
Arg,
|
||||||
|
Opt,
|
||||||
|
debug_cli,
|
||||||
|
import_code,
|
||||||
|
parse_config_overrides,
|
||||||
|
show_validation_error,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@debug_cli.command(
|
@debug_cli.command(
|
||||||
|
|
|
@ -1,31 +1,49 @@
|
||||||
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
|
|
||||||
from typing import cast, overload
|
|
||||||
from pathlib import Path
|
|
||||||
from collections import Counter
|
|
||||||
import sys
|
|
||||||
import srsly
|
|
||||||
from wasabi import Printer, MESSAGES, msg
|
|
||||||
import typer
|
|
||||||
import math
|
import math
|
||||||
import numpy
|
import sys
|
||||||
|
from collections import Counter
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import (
|
||||||
|
Any,
|
||||||
|
Dict,
|
||||||
|
Iterable,
|
||||||
|
List,
|
||||||
|
Optional,
|
||||||
|
Sequence,
|
||||||
|
Set,
|
||||||
|
Tuple,
|
||||||
|
Union,
|
||||||
|
cast,
|
||||||
|
overload,
|
||||||
|
)
|
||||||
|
|
||||||
from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
|
import numpy
|
||||||
from ._util import import_code, debug_cli, _format_number
|
import srsly
|
||||||
from ..training import Example, remove_bilu_prefix
|
import typer
|
||||||
from ..training.initialize import get_sourced_components
|
from wasabi import MESSAGES, Printer, msg
|
||||||
from ..schemas import ConfigSchemaTraining
|
|
||||||
from ..pipeline import TrainablePipe
|
from .. import util
|
||||||
|
from ..compat import Literal
|
||||||
|
from ..language import Language
|
||||||
|
from ..morphology import Morphology
|
||||||
|
from ..pipeline import Morphologizer, SpanCategorizer, TrainablePipe
|
||||||
|
from ..pipeline._edit_tree_internals.edit_trees import EditTrees
|
||||||
from ..pipeline._parser_internals import nonproj
|
from ..pipeline._parser_internals import nonproj
|
||||||
from ..pipeline._parser_internals.nonproj import DELIMITER
|
from ..pipeline._parser_internals.nonproj import DELIMITER
|
||||||
from ..pipeline import Morphologizer, SpanCategorizer
|
from ..schemas import ConfigSchemaTraining
|
||||||
from ..pipeline._edit_tree_internals.edit_trees import EditTrees
|
from ..training import Example, remove_bilu_prefix
|
||||||
from ..morphology import Morphology
|
from ..training.initialize import get_sourced_components
|
||||||
from ..language import Language
|
|
||||||
from ..util import registry, resolve_dot_names
|
from ..util import registry, resolve_dot_names
|
||||||
from ..compat import Literal
|
|
||||||
from ..vectors import Mode as VectorsMode
|
from ..vectors import Mode as VectorsMode
|
||||||
from .. import util
|
from ._util import (
|
||||||
|
Arg,
|
||||||
|
Opt,
|
||||||
|
_format_number,
|
||||||
|
app,
|
||||||
|
debug_cli,
|
||||||
|
import_code,
|
||||||
|
parse_config_overrides,
|
||||||
|
show_validation_error,
|
||||||
|
)
|
||||||
|
|
||||||
# Minimum number of expected occurrences of NER label in data to train new label
|
# Minimum number of expected occurrences of NER label in data to train new label
|
||||||
NEW_LABEL_THRESHOLD = 50
|
NEW_LABEL_THRESHOLD = 50
|
||||||
|
@ -212,7 +230,7 @@ def debug_data(
|
||||||
else:
|
else:
|
||||||
msg.info("No word vectors present in the package")
|
msg.info("No word vectors present in the package")
|
||||||
|
|
||||||
if "spancat" in factory_names:
|
if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
|
||||||
model_labels_spancat = _get_labels_from_spancat(nlp)
|
model_labels_spancat = _get_labels_from_spancat(nlp)
|
||||||
has_low_data_warning = False
|
has_low_data_warning = False
|
||||||
has_no_neg_warning = False
|
has_no_neg_warning = False
|
||||||
|
@ -830,7 +848,7 @@ def _compile_gold(
|
||||||
data["boundary_cross_ents"] += 1
|
data["boundary_cross_ents"] += 1
|
||||||
elif label == "-":
|
elif label == "-":
|
||||||
data["ner"]["-"] += 1
|
data["ner"]["-"] += 1
|
||||||
if "spancat" in factory_names:
|
if "spancat" in factory_names or "spancat_singlelabel" in factory_names:
|
||||||
for spans_key in list(eg.reference.spans.keys()):
|
for spans_key in list(eg.reference.spans.keys()):
|
||||||
# Obtain the span frequency
|
# Obtain the span frequency
|
||||||
if spans_key not in data["spancat"]:
|
if spans_key not in data["spancat"]:
|
||||||
|
@ -1028,7 +1046,7 @@ def _get_labels_from_spancat(nlp: Language) -> Dict[str, Set[str]]:
|
||||||
pipe_names = [
|
pipe_names = [
|
||||||
pipe_name
|
pipe_name
|
||||||
for pipe_name in nlp.pipe_names
|
for pipe_name in nlp.pipe_names
|
||||||
if nlp.get_pipe_meta(pipe_name).factory == "spancat"
|
if nlp.get_pipe_meta(pipe_name).factory in ("spancat", "spancat_singlelabel")
|
||||||
]
|
]
|
||||||
labels: Dict[str, Set[str]] = {}
|
labels: Dict[str, Set[str]] = {}
|
||||||
for pipe_name in pipe_names:
|
for pipe_name in pipe_names:
|
||||||
|
|
|
@ -1,13 +1,13 @@
|
||||||
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import typer
|
import typer
|
||||||
from wasabi import Printer, diff_strings, MarkdownRenderer
|
|
||||||
from pathlib import Path
|
|
||||||
from thinc.api import Config
|
from thinc.api import Config
|
||||||
|
from wasabi import MarkdownRenderer, Printer, diff_strings
|
||||||
|
|
||||||
from ._util import debug_cli, Arg, Opt, show_validation_error, parse_config_overrides
|
|
||||||
from ..util import load_config
|
from ..util import load_config
|
||||||
from .init_config import init_config, Optimizations
|
from ._util import Arg, Opt, debug_cli, parse_config_overrides, show_validation_error
|
||||||
|
from .init_config import Optimizations, init_config
|
||||||
|
|
||||||
|
|
||||||
@debug_cli.command(
|
@debug_cli.command(
|
||||||
|
|
|
@ -1,19 +1,32 @@
|
||||||
from typing import Dict, Any, Optional
|
|
||||||
from pathlib import Path
|
|
||||||
import itertools
|
import itertools
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, Optional
|
||||||
|
|
||||||
|
import typer
|
||||||
|
from thinc.api import (
|
||||||
|
Model,
|
||||||
|
data_validation,
|
||||||
|
fix_random_seed,
|
||||||
|
set_dropout_rate,
|
||||||
|
set_gpu_allocator,
|
||||||
|
)
|
||||||
|
from wasabi import msg
|
||||||
|
|
||||||
from spacy.training import Example
|
from spacy.training import Example
|
||||||
from spacy.util import resolve_dot_names
|
from spacy.util import resolve_dot_names
|
||||||
from wasabi import msg
|
|
||||||
from thinc.api import fix_random_seed, set_dropout_rate
|
|
||||||
from thinc.api import Model, data_validation, set_gpu_allocator
|
|
||||||
import typer
|
|
||||||
|
|
||||||
from ._util import Arg, Opt, debug_cli, show_validation_error
|
from .. import util
|
||||||
from ._util import parse_config_overrides, string_to_list, setup_gpu
|
|
||||||
from ..schemas import ConfigSchemaTraining
|
from ..schemas import ConfigSchemaTraining
|
||||||
from ..util import registry
|
from ..util import registry
|
||||||
from .. import util
|
from ._util import (
|
||||||
|
Arg,
|
||||||
|
Opt,
|
||||||
|
debug_cli,
|
||||||
|
parse_config_overrides,
|
||||||
|
setup_gpu,
|
||||||
|
show_validation_error,
|
||||||
|
string_to_list,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@debug_cli.command(
|
@debug_cli.command(
|
||||||
|
|
|
@ -1,14 +1,14 @@
|
||||||
from typing import Optional, Sequence
|
|
||||||
import requests
|
|
||||||
import sys
|
import sys
|
||||||
from wasabi import msg
|
from typing import Optional, Sequence
|
||||||
import typer
|
|
||||||
|
import requests
|
||||||
|
import typer
|
||||||
|
from wasabi import msg
|
||||||
|
|
||||||
from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX
|
|
||||||
from .. import about
|
from .. import about
|
||||||
from ..util import is_package, get_minor_version, run_command
|
|
||||||
from ..util import is_prerelease_version
|
|
||||||
from ..errors import OLD_MODEL_SHORTCUTS
|
from ..errors import OLD_MODEL_SHORTCUTS
|
||||||
|
from ..util import get_minor_version, is_package, is_prerelease_version, run_command
|
||||||
|
from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app
|
||||||
|
|
||||||
|
|
||||||
@app.command(
|
@app.command(
|
||||||
|
@ -81,11 +81,8 @@ def download(
|
||||||
|
|
||||||
def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
|
def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
|
||||||
dl_tpl = "{m}-{v}/{m}-{v}{s}"
|
dl_tpl = "{m}-{v}/{m}-{v}{s}"
|
||||||
egg_tpl = "#egg={m}=={v}"
|
|
||||||
suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
|
suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
|
||||||
filename = dl_tpl.format(m=model_name, v=version, s=suffix)
|
filename = dl_tpl.format(m=model_name, v=version, s=suffix)
|
||||||
if sdist:
|
|
||||||
filename += egg_tpl.format(m=model_name, v=version)
|
|
||||||
return filename
|
return filename
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,16 +1,16 @@
|
||||||
from typing import Optional, List, Dict, Any, Union
|
|
||||||
from wasabi import Printer
|
|
||||||
from pathlib import Path
|
|
||||||
import re
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, List, Optional, Union
|
||||||
|
|
||||||
import srsly
|
import srsly
|
||||||
from thinc.api import fix_random_seed
|
from thinc.api import fix_random_seed
|
||||||
|
from wasabi import Printer
|
||||||
|
|
||||||
from ..training import Corpus
|
from .. import displacy, util
|
||||||
from ..tokens import Doc
|
|
||||||
from ._util import app, Arg, Opt, setup_gpu, import_code, benchmark_cli
|
|
||||||
from ..scorer import Scorer
|
from ..scorer import Scorer
|
||||||
from .. import util
|
from ..tokens import Doc
|
||||||
from .. import displacy
|
from ..training import Corpus
|
||||||
|
from ._util import Arg, Opt, app, benchmark_cli, import_code, setup_gpu
|
||||||
|
|
||||||
|
|
||||||
@benchmark_cli.command(
|
@benchmark_cli.command(
|
||||||
|
@ -27,6 +27,7 @@ def evaluate_cli(
|
||||||
gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
|
gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
|
||||||
displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
|
displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
|
||||||
displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
|
displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
|
||||||
|
per_component: bool = Opt(False, "--per-component", "-P", help="Return scores per component, only applicable when an output JSON file is specified."),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
|
@ -50,6 +51,7 @@ def evaluate_cli(
|
||||||
gold_preproc=gold_preproc,
|
gold_preproc=gold_preproc,
|
||||||
displacy_path=displacy_path,
|
displacy_path=displacy_path,
|
||||||
displacy_limit=displacy_limit,
|
displacy_limit=displacy_limit,
|
||||||
|
per_component=per_component,
|
||||||
silent=False,
|
silent=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -64,6 +66,7 @@ def evaluate(
|
||||||
displacy_limit: int = 25,
|
displacy_limit: int = 25,
|
||||||
silent: bool = True,
|
silent: bool = True,
|
||||||
spans_key: str = "sc",
|
spans_key: str = "sc",
|
||||||
|
per_component: bool = False,
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
msg = Printer(no_print=silent, pretty=not silent)
|
msg = Printer(no_print=silent, pretty=not silent)
|
||||||
fix_random_seed()
|
fix_random_seed()
|
||||||
|
@ -78,7 +81,16 @@ def evaluate(
|
||||||
corpus = Corpus(data_path, gold_preproc=gold_preproc)
|
corpus = Corpus(data_path, gold_preproc=gold_preproc)
|
||||||
nlp = util.load_model(model)
|
nlp = util.load_model(model)
|
||||||
dev_dataset = list(corpus(nlp))
|
dev_dataset = list(corpus(nlp))
|
||||||
scores = nlp.evaluate(dev_dataset)
|
scores = nlp.evaluate(dev_dataset, per_component=per_component)
|
||||||
|
if per_component:
|
||||||
|
data = scores
|
||||||
|
if output is None:
|
||||||
|
msg.warn(
|
||||||
|
"The per-component option is enabled but there is no output JSON file provided to save the scores to."
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
msg.info("Per-component scores will be saved to output JSON file.")
|
||||||
|
else:
|
||||||
metrics = {
|
metrics = {
|
||||||
"TOK": "token_acc",
|
"TOK": "token_acc",
|
||||||
"TAG": "tag_acc",
|
"TAG": "tag_acc",
|
||||||
|
@ -122,6 +134,8 @@ def evaluate(
|
||||||
docs = list(nlp.pipe(ex.reference.text for ex in dev_dataset[:displacy_limit]))
|
docs = list(nlp.pipe(ex.reference.text for ex in dev_dataset[:displacy_limit]))
|
||||||
render_deps = "parser" in factory_names
|
render_deps = "parser" in factory_names
|
||||||
render_ents = "ner" in factory_names
|
render_ents = "ner" in factory_names
|
||||||
|
render_spans = "spancat" in factory_names
|
||||||
|
|
||||||
render_parses(
|
render_parses(
|
||||||
docs,
|
docs,
|
||||||
displacy_path,
|
displacy_path,
|
||||||
|
@ -129,6 +143,7 @@ def evaluate(
|
||||||
limit=displacy_limit,
|
limit=displacy_limit,
|
||||||
deps=render_deps,
|
deps=render_deps,
|
||||||
ents=render_ents,
|
ents=render_ents,
|
||||||
|
spans=render_spans,
|
||||||
)
|
)
|
||||||
msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
|
msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
|
||||||
|
|
||||||
|
@ -182,6 +197,7 @@ def render_parses(
|
||||||
limit: int = 250,
|
limit: int = 250,
|
||||||
deps: bool = True,
|
deps: bool = True,
|
||||||
ents: bool = True,
|
ents: bool = True,
|
||||||
|
spans: bool = True,
|
||||||
):
|
):
|
||||||
docs[0].user_data["title"] = model_name
|
docs[0].user_data["title"] = model_name
|
||||||
if ents:
|
if ents:
|
||||||
|
@ -195,6 +211,11 @@ def render_parses(
|
||||||
with (output_path / "parses.html").open("w", encoding="utf8") as file_:
|
with (output_path / "parses.html").open("w", encoding="utf8") as file_:
|
||||||
file_.write(html)
|
file_.write(html)
|
||||||
|
|
||||||
|
if spans:
|
||||||
|
html = displacy.render(docs[:limit], style="span", page=True)
|
||||||
|
with (output_path / "spans.html").open("w", encoding="utf8") as file_:
|
||||||
|
file_.write(html)
|
||||||
|
|
||||||
|
|
||||||
def print_prf_per_type(
|
def print_prf_per_type(
|
||||||
msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str
|
msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str
|
||||||
|
|
|
@ -1,17 +1,17 @@
|
||||||
import functools
|
import functools
|
||||||
|
import logging
|
||||||
import operator
|
import operator
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import logging
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
from typing import Optional, Tuple, Any, Dict, List
|
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
import wasabi.tables
|
import wasabi.tables
|
||||||
|
|
||||||
from ..pipeline import TextCategorizer, MultiLabel_TextCategorizer
|
|
||||||
from ..errors import Errors
|
|
||||||
from ..training import Corpus
|
|
||||||
from ._util import app, Arg, Opt, import_code, setup_gpu
|
|
||||||
from .. import util
|
from .. import util
|
||||||
|
from ..errors import Errors
|
||||||
|
from ..pipeline import MultiLabel_TextCategorizer, TextCategorizer
|
||||||
|
from ..training import Corpus
|
||||||
|
from ._util import Arg, Opt, app, import_code, setup_gpu
|
||||||
|
|
||||||
_DEFAULTS = {
|
_DEFAULTS = {
|
||||||
"n_trials": 11,
|
"n_trials": 11,
|
||||||
|
|
|
@ -1,15 +1,15 @@
|
||||||
from typing import Optional, Dict, Any, Union, List
|
|
||||||
import platform
|
|
||||||
import json
|
import json
|
||||||
|
import platform
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import Printer, MarkdownRenderer
|
from typing import Any, Dict, List, Optional, Union
|
||||||
import srsly
|
|
||||||
|
|
||||||
from ._util import app, Arg, Opt, string_to_list
|
import srsly
|
||||||
from .download import get_model_filename, get_latest_version
|
from wasabi import MarkdownRenderer, Printer
|
||||||
from .. import util
|
|
||||||
from .. import about
|
from .. import about, util
|
||||||
from ..compat import importlib_metadata
|
from ..compat import importlib_metadata
|
||||||
|
from ._util import Arg, Opt, app, string_to_list
|
||||||
|
from .download import get_latest_version, get_model_filename
|
||||||
|
|
||||||
|
|
||||||
@app.command("info")
|
@app.command("info")
|
||||||
|
|
|
@ -1,19 +1,26 @@
|
||||||
from typing import Optional, List, Tuple
|
import re
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import Printer, diff_strings
|
from typing import List, Optional, Tuple
|
||||||
from thinc.api import Config
|
|
||||||
import srsly
|
import srsly
|
||||||
import re
|
|
||||||
from jinja2 import Template
|
from jinja2 import Template
|
||||||
|
from thinc.api import Config
|
||||||
|
from wasabi import Printer, diff_strings
|
||||||
|
|
||||||
from .. import util
|
from .. import util
|
||||||
from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
|
from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
|
||||||
from ..schemas import RecommendationSchema
|
from ..schemas import RecommendationSchema
|
||||||
from ..util import SimpleFrozenList
|
from ..util import SimpleFrozenList
|
||||||
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
|
from ._util import (
|
||||||
from ._util import string_to_list, import_code
|
COMMAND,
|
||||||
|
Arg,
|
||||||
|
Opt,
|
||||||
|
import_code,
|
||||||
|
init_cli,
|
||||||
|
show_validation_error,
|
||||||
|
string_to_list,
|
||||||
|
)
|
||||||
|
|
||||||
ROOT = Path(__file__).parent / "templates"
|
ROOT = Path(__file__).parent / "templates"
|
||||||
TEMPLATE_PATH = ROOT / "quickstart_training.jinja"
|
TEMPLATE_PATH = ROOT / "quickstart_training.jinja"
|
||||||
|
|
|
@ -1,15 +1,23 @@
|
||||||
from typing import Optional
|
|
||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import msg
|
from typing import Optional
|
||||||
import typer
|
|
||||||
import srsly
|
import srsly
|
||||||
|
import typer
|
||||||
|
from wasabi import msg
|
||||||
|
|
||||||
from .. import util
|
from .. import util
|
||||||
from ..training.initialize import init_nlp, convert_vectors
|
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
|
from ..training.initialize import convert_vectors, init_nlp
|
||||||
from ._util import import_code, setup_gpu
|
from ._util import (
|
||||||
|
Arg,
|
||||||
|
Opt,
|
||||||
|
import_code,
|
||||||
|
init_cli,
|
||||||
|
parse_config_overrides,
|
||||||
|
setup_gpu,
|
||||||
|
show_validation_error,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@init_cli.command("vectors")
|
@init_cli.command("vectors")
|
||||||
|
@ -24,6 +32,7 @@ def init_vectors_cli(
|
||||||
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
|
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
|
||||||
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||||
jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
|
jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
|
||||||
|
attr: str = Opt("ORTH", "--attr", "-a", help="Optional token attribute to use for vectors, e.g. LOWER or NORM"),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""Convert word vectors for use with spaCy. Will export an nlp object that
|
"""Convert word vectors for use with spaCy. Will export an nlp object that
|
||||||
|
@ -42,6 +51,7 @@ def init_vectors_cli(
|
||||||
prune=prune,
|
prune=prune,
|
||||||
name=name,
|
name=name,
|
||||||
mode=mode,
|
mode=mode,
|
||||||
|
attr=attr,
|
||||||
)
|
)
|
||||||
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
|
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")
|
||||||
nlp.to_disk(output_dir)
|
nlp.to_disk(output_dir)
|
||||||
|
|
|
@ -1,18 +1,18 @@
|
||||||
from typing import Optional, Union, Any, Dict, List, Tuple, cast
|
|
||||||
import shutil
|
|
||||||
from pathlib import Path
|
|
||||||
from wasabi import Printer, MarkdownRenderer, get_raw_input
|
|
||||||
from thinc.api import Config
|
|
||||||
from collections import defaultdict
|
|
||||||
from catalogue import RegistryError
|
|
||||||
import srsly
|
|
||||||
import sys
|
|
||||||
import re
|
import re
|
||||||
|
import shutil
|
||||||
|
import sys
|
||||||
|
from collections import defaultdict
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, List, Optional, Tuple, Union, cast
|
||||||
|
|
||||||
from ._util import app, Arg, Opt, string_to_list, WHEEL_SUFFIX, SDIST_SUFFIX
|
import srsly
|
||||||
from ..schemas import validate, ModelMetaSchema
|
from catalogue import RegistryError
|
||||||
from .. import util
|
from thinc.api import Config
|
||||||
from .. import about
|
from wasabi import MarkdownRenderer, Printer, get_raw_input
|
||||||
|
|
||||||
|
from .. import about, util
|
||||||
|
from ..schemas import ModelMetaSchema, validate
|
||||||
|
from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app, string_to_list
|
||||||
|
|
||||||
|
|
||||||
@app.command("package")
|
@app.command("package")
|
||||||
|
|
|
@ -1,13 +1,21 @@
|
||||||
from typing import Optional
|
|
||||||
from pathlib import Path
|
|
||||||
from wasabi import msg
|
|
||||||
import typer
|
|
||||||
import re
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import typer
|
||||||
|
from wasabi import msg
|
||||||
|
|
||||||
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
|
|
||||||
from ._util import import_code, setup_gpu
|
|
||||||
from ..training.pretrain import pretrain
|
from ..training.pretrain import pretrain
|
||||||
from ..util import load_config
|
from ..util import load_config
|
||||||
|
from ._util import (
|
||||||
|
Arg,
|
||||||
|
Opt,
|
||||||
|
app,
|
||||||
|
import_code,
|
||||||
|
parse_config_overrides,
|
||||||
|
setup_gpu,
|
||||||
|
show_validation_error,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@app.command(
|
@app.command(
|
||||||
|
|
|
@ -1,17 +1,18 @@
|
||||||
from typing import Optional, Sequence, Union, Iterator
|
|
||||||
import tqdm
|
|
||||||
from pathlib import Path
|
|
||||||
import srsly
|
|
||||||
import cProfile
|
import cProfile
|
||||||
|
import itertools
|
||||||
import pstats
|
import pstats
|
||||||
import sys
|
import sys
|
||||||
import itertools
|
from pathlib import Path
|
||||||
from wasabi import msg, Printer
|
from typing import Iterator, Optional, Sequence, Union
|
||||||
import typer
|
|
||||||
|
import srsly
|
||||||
|
import tqdm
|
||||||
|
import typer
|
||||||
|
from wasabi import Printer, msg
|
||||||
|
|
||||||
from ._util import app, debug_cli, Arg, Opt, NAME
|
|
||||||
from ..language import Language
|
from ..language import Language
|
||||||
from ..util import load_model
|
from ..util import load_model
|
||||||
|
from ._util import NAME, Arg, Opt, app, debug_cli
|
||||||
|
|
||||||
|
|
||||||
@debug_cli.command("profile")
|
@debug_cli.command("profile")
|
||||||
|
|
|
@ -1,206 +0,0 @@
|
||||||
from typing import Any, Dict, Optional
|
|
||||||
from pathlib import Path
|
|
||||||
from wasabi import msg
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import shutil
|
|
||||||
import requests
|
|
||||||
import typer
|
|
||||||
|
|
||||||
from ...util import ensure_path, working_dir
|
|
||||||
from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
|
|
||||||
from .._util import get_checksum, download_file, git_checkout, get_git_version
|
|
||||||
from .._util import SimpleFrozenDict, parse_config_overrides
|
|
||||||
|
|
||||||
# Whether assets are extra if `extra` is not set.
|
|
||||||
EXTRA_DEFAULT = False
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command(
    "assets",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def project_assets_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
    # NOTE(review): the help text used to say "Git v22.2+", which is not an
    # existing Git release; v2.22 is presumably what was meant - confirm
    # against the version check performed by git_checkout.
    sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse checkout for assets provided via Git, to only check out and clone the files needed. Requires Git v2.22+."),
    extra: bool = Opt(False, "--extra", "-e", help="Download all assets, including those marked as 'extra'.")
    # fmt: on
):
    """Fetch project assets like datasets and pretrained weights. Assets are
    defined in the "assets" section of the project.yml. If a checksum is
    provided in the project.yml, the file is only downloaded if no local file
    with the same checksum exists.

    DOCS: https://spacy.io/api/cli#project-assets
    """
    # Extra command-line arguments are parsed as config overrides and handed
    # on to project_assets unchanged.
    overrides = parse_config_overrides(ctx.args)
    project_assets(
        project_dir,
        overrides=overrides,
        sparse_checkout=sparse_checkout,
        extra=extra,
    )
|
|
||||||
|
|
||||||
|
|
||||||
def project_assets(
    project_dir: Path,
    *,
    overrides: Dict[str, Any] = SimpleFrozenDict(),
    sparse_checkout: bool = False,
    extra: bool = False,
) -> None:
    """Fetch the assets listed in the "assets" section of a project config.

    project_dir (Path): Path to project directory.
    overrides (Dict[str, Any]): Config overrides passed on to
        load_project_config.
    sparse_checkout (bool): Use sparse checkout for assets provided via Git, to only check out and clone the files
        needed.
    extra (bool): Whether to download all assets, including those marked as 'extra'.
    """
    project_path = ensure_path(project_dir)
    config = load_project_config(project_path, overrides=overrides)
    assets = [
        asset
        for asset in config.get("assets", [])
        if extra or not asset.get("extra", EXTRA_DEFAULT)
    ]
    if not assets:
        msg.warn(
            f"No assets specified in {PROJECT_FILE} (if assets are marked as extra, download them with --extra)",
            exits=0,
        )
    msg.info(f"Fetching {len(assets)} asset(s)")

    for asset in assets:
        # Build the destination from the normalized project path (the value
        # that is also handed to fetch_asset) rather than the raw argument.
        dest = (project_path / asset["dest"]).resolve()
        checksum = asset.get("checksum")
        if "git" not in asset:
            url = asset.get("url")
            if not url:
                # project.yml defines asset without URL that the user has to place
                check_private_asset(dest, checksum)
                continue
            fetch_asset(project_path, url, dest, checksum)
            continue

        git_err = (
            "Cloning spaCy project templates requires Git and the 'git' command. "
            "Make sure it's installed and that the executable is available."
        )
        get_git_version(error=git_err)
        # If there's already a file with the expected checksum, keep it
        if dest.exists() and checksum and checksum == get_checksum(dest):
            msg.good(f"Skipping download with matching checksum: {asset['dest']}")
            continue
        # Validate the git settings *before* removing anything, so that a
        # misconfigured asset doesn't cost the user the existing local copy.
        git_info = asset["git"]
        if "repo" not in git_info or git_info["repo"] is None:
            msg.fail(
                "A git asset must include 'repo', the repository address.", exits=1
            )
        if "path" not in git_info or git_info["path"] is None:
            msg.fail(
                "A git asset must include 'path' - use \"\" to get the entire repository.",
                exits=1,
            )
        # A stale or mismatching copy is removed so the checkout starts clean
        if dest.exists():
            if dest.is_dir():
                shutil.rmtree(dest)
            else:
                dest.unlink()
        git_checkout(
            git_info["repo"],
            git_info["path"],
            dest,
            branch=git_info.get("branch"),
            sparse=sparse_checkout,
        )
        msg.good(f"Downloaded asset {dest}")
|
|
||||||
|
|
||||||
|
|
||||||
def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
    """Report on an asset that has no URL, i.e. one the user is expected to
    put in place manually. Nothing is downloaded or modified here; the
    function only prints a status message.

    dest (Path): Destination path of the asset.
    checksum (Optional[str]): Optional checksum of the expected file.
    """
    if not Path(dest).exists():
        # Nothing on disk yet: tell the user which file is missing
        msg.warn(
            f"No URL provided for asset. You need to add this file yourself: {dest}"
        )
        return
    if not checksum:
        # No checksum to compare against, so the file's presence is enough
        msg.good(f"Asset already exists: {dest}")
        return
    if checksum == get_checksum(dest):
        msg.good(f"Asset exists with matching checksum: {dest}")
    else:
        msg.fail(f"Asset available but with incorrect checksum: {dest}")
|
|
||||||
|
|
||||||
|
|
||||||
def fetch_asset(
    project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
) -> None:
    """Fetch an asset from a given URL or path. If a checksum is provided and a
    local file exists, it's only re-downloaded if the checksum doesn't match.

    project_path (Path): Path to project directory.
    url (str): URL or path to asset.
    dest (Path): Destination of the asset, resolved against project_path.
    checksum (Optional[str]): Optional expected checksum of local file.
    """
    dest_path = (project_path / dest).resolve()
    if dest_path.exists():
        # If there's already a file, check for checksum
        if checksum:
            if checksum == get_checksum(dest_path):
                msg.good(f"Skipping download with matching checksum: {dest}")
                return
        else:
            # If there's not a checksum, make sure the file is a possibly valid size
            if os.path.getsize(dest_path) == 0:
                msg.warn(f"Asset exists but with size of 0 bytes, deleting: {dest}")
                os.remove(dest_path)
    # We might as well support the user here and create parent directories in
    # case the asset dir isn't listed as a dir to create in the project.yml.
    # exist_ok avoids failing if the directory appears between a separate
    # existence check and the mkdir call.
    dest_path.parent.mkdir(parents=True, exist_ok=True)
    # Relative local paths given as "url" are resolved from the project dir
    with working_dir(project_path):
        url = convert_asset_url(url)
        try:
            download_file(url, dest_path)
            msg.good(f"Downloaded asset {dest}")
        except requests.exceptions.RequestException as e:
            if Path(url).is_file():
                # If it's a local file, copy to destination
                shutil.copy(url, str(dest_path))
                msg.good(f"Copied local asset {dest}")
            else:
                msg.fail(f"Download failed: {dest}", e)
    if checksum and checksum != get_checksum(dest_path):
        msg.fail(f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
|
|
||||||
|
|
||||||
|
|
||||||
def convert_asset_url(url: str) -> str:
    """Check and convert the asset URL if needed.

    A regular github.com page URL (anything that is neither a release
    download nor a /raw/ link) is rewritten to the matching
    raw.githubusercontent.com URL, and a warning is printed. All other URLs
    are returned unchanged.

    url (str): The asset URL.
    RETURNS (str): The converted URL.
    """
    # The dot is escaped and the host must end right after "github.com", so
    # look-alike hosts (e.g. "github.community") are left untouched.
    is_regular_github_url = (
        re.match(r"https?://github\.com(?:/|$)", url) is not None
        and "releases/download" not in url
        and "/raw/" not in url
    )
    if not is_regular_github_url:
        return url
    # If the asset URL is a regular GitHub URL it's likely a mistake.
    # Only the first occurrence is rewritten in each step: the host, and the
    # "/tree/" or "/blob/" marker; later path segments that happen to contain
    # the same text must be preserved.
    converted = url.replace("github.com", "raw.githubusercontent.com", 1)
    converted = re.sub(r"/(tree|blob)/", "/", converted, count=1)
    msg.warn(
        "Downloading from a regular GitHub URL. This will only download "
        "the source of the page, not the actual file. Converting the URL "
        "to a raw URL.",
        converted,
    )
    return converted
|
|
|
@ -1,115 +0,0 @@
|
||||||
from typing import Optional
|
|
||||||
from pathlib import Path
|
|
||||||
from wasabi import msg
|
|
||||||
import subprocess
|
|
||||||
import re
|
|
||||||
|
|
||||||
from ... import about
|
|
||||||
from ...util import ensure_path
|
|
||||||
from .._util import project_cli, Arg, Opt, COMMAND, PROJECT_FILE
|
|
||||||
from .._util import git_checkout, get_git_version, git_repo_branch_exists
|
|
||||||
|
|
||||||
DEFAULT_REPO = about.__projects__
|
|
||||||
DEFAULT_PROJECTS_BRANCH = about.__projects_branch__
|
|
||||||
DEFAULT_BRANCHES = ["main", "master"]
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("clone")
def project_clone_cli(
    # fmt: off
    name: str = Arg(..., help="The name of the template to clone"),
    dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
    repo: str = Opt(DEFAULT_REPO, "--repo", "-r", help="The repository to clone from"),
    branch: Optional[str] = Opt(None, "--branch", "-b", help=f"The branch to clone from. If not provided, will attempt {', '.join(DEFAULT_BRANCHES)}"),
    # NOTE(review): the help text used to say "Git v22.2+", which is not an
    # existing Git release; v2.22 is presumably what was meant - confirm
    # against the version check performed by git_checkout.
    sparse_checkout: bool = Opt(False, "--sparse", "-S", help="Use sparse Git checkout to only check out and clone the files needed. Requires Git v2.22+.")
    # fmt: on
):
    """Clone a project template from a repository. Calls into "git" and will
    only download the files from the given subdirectory. The GitHub repo
    defaults to the official spaCy template repo, but can be customized
    (including using a private repo).

    DOCS: https://spacy.io/api/cli#project-clone
    """
    if dest is None:
        # Clone into a directory named after the last part of the template name
        dest = Path.cwd() / Path(name).parts[-1]
    if repo == DEFAULT_REPO and branch is None:
        branch = DEFAULT_PROJECTS_BRANCH

    if branch is None:
        # No branch given for a custom repo: take the first default that exists.
        # The generator is lazy, so probing stops at the first hit.
        branch = next(
            (
                candidate
                for candidate in DEFAULT_BRANCHES
                if git_repo_branch_exists(repo, candidate)
            ),
            None,
        )
        if branch is None:
            default_branches_msg = ", ".join(f"'{b}'" for b in DEFAULT_BRANCHES)
            msg.fail(
                "No branch provided and attempted default "
                f"branches {default_branches_msg} do not exist.",
                exits=1,
            )
    elif not git_repo_branch_exists(repo, branch):
        msg.fail(f"repo: {repo} (branch: {branch}) does not exist.", exits=1)
    assert isinstance(branch, str)
    project_clone(name, dest, repo=repo, branch=branch, sparse_checkout=sparse_checkout)
|
|
||||||
|
|
||||||
|
|
||||||
def project_clone(
    name: str,
    dest: Path,
    *,
    repo: str = about.__projects__,
    branch: str = about.__projects_branch__,
    sparse_checkout: bool = False,
) -> None:
    """Clone a project template from a repository.

    name (str): Name of subdirectory to clone.
    dest (Path): Destination path of cloned project.
    repo (str): URL of Git repo containing project templates.
    branch (str): The branch to clone from
    sparse_checkout (bool): Passed to git_checkout as its "sparse" option.
    """
    dest = ensure_path(dest)
    check_clone(name, dest, repo)
    project_dir = dest.resolve()
    # Shorten GitHub URLs to "owner/repo" for the messages below. The dot is
    # escaped so that only a literal "github.com" host is stripped.
    repo_name = re.sub(r"https?://github\.com/", "", repo)
    try:
        git_checkout(repo, name, dest, branch=branch, sparse=sparse_checkout)
    except subprocess.CalledProcessError:
        err = f"Could not clone '{name}' from repo '{repo_name}' (branch '{branch}')"
        msg.fail(err, exits=1)
    msg.good(f"Cloned '{name}' from '{repo_name}' (branch '{branch}')", project_dir)
    if not (project_dir / PROJECT_FILE).exists():
        msg.warn(f"No {PROJECT_FILE} found in directory")
    else:
        msg.good("Your project is now ready!")
        print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
|
|
||||||
|
|
||||||
|
|
||||||
def check_clone(name: str, dest: Path, repo: str) -> None:
    """Verify the preconditions for cloning a project into `dest`.

    Checks, in order: that a Git version can be determined, that `dest` is
    truthy, that `dest` does not exist yet, and that its parent directory
    does exist. Each failed check is reported via msg.fail with exits=1.

    name (str): Name of the directory to clone from the repo.
    dest (Path): Local destination of the cloned directory.
    repo (str): URL of the repo to clone from.
    """
    get_git_version(
        error=(
            f"Cloning spaCy project templates requires Git and the 'git' command. "
            f"To clone a project without Git, copy the files from the '{name}' "
            f"directory in the {repo} to {dest} manually."
        )
    )
    if not dest:
        msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
    # The clone itself has to create the directory, so it must not be there yet
    already_there = dest.exists()
    if already_there:
        msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
    # Parents are not created automatically, so the parent dir has to exist
    parent_dir = dest.parent
    if not parent_dir.exists():
        msg.fail(
            f"Can't clone project, parent directory doesn't exist: {parent_dir}. "
            f"Create the necessary folder(s) first before continuing.",
            exits=1,
        )
|
|
|
@ -1,115 +0,0 @@
|
||||||
from pathlib import Path
|
|
||||||
from wasabi import msg, MarkdownRenderer
|
|
||||||
|
|
||||||
from ...util import working_dir
|
|
||||||
from .._util import project_cli, Arg, Opt, PROJECT_FILE, load_project_config
|
|
||||||
|
|
||||||
|
|
||||||
# Base URL that the documentation links in the intro texts below are built on.
DOCS_URL = "https://spacy.io"
# Intro text for the project file section. Interpolates PROJECT_FILE both as
# the link label and as the link target.
INTRO_PROJECT = f"""The [`{PROJECT_FILE}`]({PROJECT_FILE}) defines the data assets required by the
project, as well as the available commands and workflows. For details, see the
[spaCy projects documentation]({DOCS_URL}/usage/projects)."""
# Intro text placed above the table of commands.
INTRO_COMMANDS = f"""The following commands are defined by the project. They
can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run).
Commands are only re-run if their inputs have changed."""
# Intro text placed above the table of workflows.
INTRO_WORKFLOWS = f"""The following workflows are defined by the project. They
can be executed using [`spacy project run [name]`]({DOCS_URL}/api/cli#project-run)
and will run the specified commands in order. Commands are only re-run if their
inputs have changed."""
# Intro text placed above the table of assets.
INTRO_ASSETS = f"""The following assets are defined by the project. They can
be fetched by running [`spacy project assets`]({DOCS_URL}/api/cli#project-assets)
in the project directory."""
# These markers are added to the Markdown and can be used to update the file in
# place if it already exists. Only the auto-generated part will be replaced.
MARKER_START = "<!-- SPACY PROJECT: AUTO-GENERATED DOCS START (do not remove) -->"
MARKER_END = "<!-- SPACY PROJECT: AUTO-GENERATED DOCS END (do not remove) -->"
# If this marker is used in an existing README, it's ignored and not replaced
MARKER_IGNORE = "<!-- SPACY PROJECT: IGNORE -->"
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("document")
def project_document_cli(
    # fmt: off
    project_dir: Path = Arg(Path.cwd(), help="Path to cloned project. Defaults to current working directory.", exists=True, file_okay=False),
    output_file: Path = Opt("-", "--output", "-o", help="Path to output Markdown file for output. Defaults to - for standard output"),
    no_emoji: bool = Opt(False, "--no-emoji", "-NE", help="Don't use emoji")
    # fmt: on
):
    """
    Auto-generate a README.md for a project. If the content is saved to a file,
    hidden markers are added so you can add custom content before or after the
    auto-generated section and only the auto-generated docs will be replaced
    when you re-run the command.

    DOCS: https://spacy.io/api/cli#project-document
    """
    # NOTE(review): the docstring above is presumably shown as the command's
    # CLI help text, so treat it as user-facing output -- confirm before editing.
    # Thin CLI wrapper: all of the work happens in project_document().
    project_document(project_dir, output_file, no_emoji=no_emoji)
|
|
||||||
|
|
||||||
|
|
||||||
def project_document(
    project_dir: Path, output_file: Path, *, no_emoji: bool = False
) -> None:
    """Render Markdown documentation for a project and print or save it.

    The generated text is wrapped in MARKER_START / MARKER_END. When writing
    to a file that already exists:
    - if it contains MARKER_IGNORE, a warning is shown and nothing is written;
    - if it contains both markers, the text before the start marker and the
      text after the first end marker are preserved around the new content;
    - otherwise the whole file is replaced.

    project_dir (Path): The project directory to load the project config from.
    output_file (Path): The Markdown file to write, or "-" to print the
        result to standard output instead.
    no_emoji (bool): Forwarded to the MarkdownRenderer.
    """
    is_stdout = str(output_file) == "-"
    config = load_project_config(project_dir)
    md = MarkdownRenderer(no_emoji=no_emoji)
    md.add(MARKER_START)
    title = config.get("title")
    description = config.get("description")
    md.add(md.title(1, f"spaCy Project{f': {title}' if title else ''}", "🪐"))
    if description:
        md.add(description)
    md.add(md.title(2, PROJECT_FILE, "📋"))
    md.add(INTRO_PROJECT)
    # Commands
    cmds = config.get("commands", [])
    data = [(md.code(cmd["name"]), cmd.get("help", "")) for cmd in cmds]
    if data:
        md.add(md.title(3, "Commands", "⏯"))
        md.add(INTRO_COMMANDS)
        md.add(md.table(data, ["Command", "Description"]))
    # Workflows
    wfs = config.get("workflows", {}).items()
    data = [(md.code(n), " → ".join(md.code(w) for w in stp)) for n, stp in wfs]
    if data:
        md.add(md.title(3, "Workflows", "⏭"))
        md.add(INTRO_WORKFLOWS)
        md.add(md.table(data, ["Workflow", "Steps"]))
    # Assets
    assets = config.get("assets", [])
    data = []
    for a in assets:
        source = "Git" if a.get("git") else "URL" if a.get("url") else "Local"
        dest_path = a["dest"]
        dest = md.code(dest_path)
        if source == "Local":
            # Only link assets if they're in the repo
            with working_dir(project_dir) as p:
                if (p / dest_path).exists():
                    dest = md.link(dest, dest_path)
        data.append((dest, source, a.get("description", "")))
    if data:
        md.add(md.title(3, "Assets", "🗂"))
        md.add(INTRO_ASSETS)
        md.add(md.table(data, ["File", "Source", "Description"]))
    md.add(MARKER_END)
    # Output result
    if is_stdout:
        print(md.text)
    else:
        content = md.text
        if output_file.exists():
            with output_file.open("r", encoding="utf8") as f:
                existing = f.read()
            if MARKER_IGNORE in existing:
                msg.warn("Found ignore marker in existing file: skipping", output_file)
                return
            if MARKER_START in existing and MARKER_END in existing:
                msg.info("Found existing file: only replacing auto-generated docs")
                before = existing.split(MARKER_START, 1)[0]
                # Split only once: without maxsplit, any custom content that
                # follows a second occurrence of the end marker would be
                # silently dropped from the rewritten file.
                after = existing.split(MARKER_END, 1)[1]
                content = f"{before}{content}{after}"
            else:
                msg.warn("Replacing existing file")
        with output_file.open("w", encoding="utf8") as f:
            f.write(content)
        msg.good("Saved project documentation", output_file)
|
|
|
@ -1,207 +0,0 @@
|
||||||
"""This module contains helpers and subcommands for integrating spaCy projects
|
|
||||||
with Data Version Control (DVC). https://dvc.org"""
|
|
||||||
from typing import Dict, Any, List, Optional, Iterable
|
|
||||||
import subprocess
|
|
||||||
from pathlib import Path
|
|
||||||
from wasabi import msg
|
|
||||||
|
|
||||||
from .._util import PROJECT_FILE, load_project_config, get_hash, project_cli
|
|
||||||
from .._util import Arg, Opt, NAME, COMMAND
|
|
||||||
from ...util import working_dir, split_command, join_command, run_command
|
|
||||||
from ...util import SimpleFrozenList
|
|
||||||
|
|
||||||
|
|
||||||
# File name of the DVC pipeline config inside the project directory.
DVC_CONFIG = "dvc.yaml"
# Name of DVC's own metadata directory.
DVC_DIR = ".dvc"
# Name of the CLI subcommand registered below; also interpolated into the
# regeneration hint in DVC_CONFIG_COMMENT.
UPDATE_COMMAND = "dvc"
# Comment header written into the generated DVC config, right after the
# config hash line.
DVC_CONFIG_COMMENT = f"""# This file is auto-generated by spaCy based on your {PROJECT_FILE}. If you've
# edited your {PROJECT_FILE}, you can regenerate this file by running:
# {COMMAND} project {UPDATE_COMMAND}"""
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command(UPDATE_COMMAND)
def project_update_dvc_cli(
    # fmt: off
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    workflow: Optional[str] = Arg(None, help=f"Name of workflow defined in {PROJECT_FILE}. Defaults to first workflow if not set."),
    verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
    quiet: bool = Opt(False, "--quiet", "-q", help="Print less info"),
    force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
    # fmt: on
):
    """Auto-generate Data Version Control (DVC) config. A DVC
    project can only define one pipeline, so you need to specify one workflow
    defined in the project.yml. If no workflow is specified, the first defined
    workflow is used. The DVC config will only be updated if the project.yml
    changed.

    DOCS: https://spacy.io/api/cli#project-dvc
    """
    # NOTE(review): the docstring above is presumably shown as the command's
    # CLI help text, so treat it as user-facing output -- confirm before editing.
    # Thin CLI wrapper: all of the work happens in project_update_dvc().
    project_update_dvc(project_dir, workflow, verbose=verbose, quiet=quiet, force=force)
|
|
||||||
|
|
||||||
|
|
||||||
def project_update_dvc(
    project_dir: Path,
    workflow: Optional[str] = None,
    *,
    verbose: bool = False,
    quiet: bool = False,
    force: bool = False,
) -> None:
    """Regenerate the DVC config for a project and report the outcome.

    Loads the project config, hands it to update_dvc_config and prints either
    a success message or a "no update needed" message, depending on whether
    the DVC config was rewritten.

    project_dir (Path): The project directory.
    workflow (Optional[str]): Optional name of workflow defined in project.yml.
        If not set, the first workflow will be used.
    verbose (bool): Print more info.
    quiet (bool): Print less info.
    force (bool): Force update DVC config.
    """
    was_updated = update_dvc_config(
        project_dir,
        load_project_config(project_dir),
        workflow,
        verbose=verbose,
        quiet=quiet,
        force=force,
    )
    hint = "To execute the workflow with DVC, run: dvc repro"
    if not was_updated:
        msg.info(f"No changes found in {PROJECT_FILE}, no update needed", hint)
        return
    msg.good(f"Updated DVC config from {PROJECT_FILE}", hint)
|
|
||||||
|
|
||||||
|
|
||||||
def update_dvc_config(
    path: Path,
    config: Dict[str, Any],
    workflow: Optional[str] = None,
    verbose: bool = False,
    quiet: bool = False,
    force: bool = False,
) -> bool:
    """Generate the DVC config for one workflow by running `dvc run --no-exec`
    for each of its commands, then prepend the config hash and an explanatory
    comment to the resulting file. The first line of an existing file is
    compared against the hash of the current config to decide whether the
    file needs to be regenerated at all.

    path (Path): The path to the project directory.
    config (Dict[str, Any]): The loaded project.yml.
    workflow (Optional[str]): Name of the workflow to convert. If not set,
        the first workflow in the config is used.
    verbose (bool): Whether to print additional info (via DVC).
    quiet (bool): Don't output anything (via DVC).
    force (bool): Force update, even if hashes match.
    RETURNS (bool): Whether the DVC config file was updated.
    """
    ensure_dvc(path)
    workflows = config.get("workflows", {})
    workflow_names = list(workflows.keys())
    check_workflows(workflow_names, workflow)
    workflow = workflow or workflow_names[0]
    config_hash = get_hash(config)
    path = path.resolve()
    dvc_config_path = path / DVC_CONFIG
    if dvc_config_path.exists():
        # The first line of a generated file is "# <hash>": compare it to the
        # hash of the current config to see whether anything changed
        with dvc_config_path.open("r", encoding="utf8") as f:
            first_line = f.readline()
        previous_hash = first_line.strip().replace("# ", "")
        if not force and previous_hash == config_hash:
            # Nothing has changed in project.yml, don't need to update
            return False
        dvc_config_path.unlink()
    commands_by_name = {cmd["name"]: cmd for cmd in config.get("commands", [])}

    # Flags that apply to every generated command
    flags = [
        flag for flag, enabled in (("--verbose", verbose), ("--quiet", quiet)) if enabled
    ]

    dvc_commands = []
    for name in workflows[workflow]:
        command = commands_by_name[name]
        deps = command.get("deps", [])
        outputs = command.get("outputs", [])
        outputs_no_cache = command.get("outputs_no_cache", [])
        if not (deps or outputs or outputs_no_cache):
            continue
        dvc_cmd = ["run", *flags, "-n", name, "-w", str(path), "--no-exec"]
        if command.get("no_skip"):
            dvc_cmd.append("--always-changed")
        # Each path is preceded by its own option flag
        deps_cmd = [part for p in deps for part in ("-d", p)]
        outputs_cmd = [part for p in outputs for part in ("-o", p)]
        outputs_nc_cmd = [part for p in outputs_no_cache for part in ("-O", p)]
        # Default to the working dir as the project path since dvc.yaml is
        # auto-generated and we don't want arbitrary paths in there
        project_cmd = ["python", "-m", NAME, "project", "run", name]
        full_cmd = dvc_cmd + deps_cmd + outputs_cmd + outputs_nc_cmd + project_cmd
        dvc_commands.append(join_command(full_cmd))

    if not dvc_commands:
        # If we don't check for this, then there will be an error when reading
        # the config, since DVC wouldn't create it.
        msg.fail(
            "No usable commands for DVC found. This can happen if none of your "
            "commands have dependencies or outputs.",
            exits=1,
        )

    with working_dir(path):
        for dvc_args in dvc_commands:
            run_command("dvc " + dvc_args)
    # Put the hash line and the explanatory comment in front of what DVC wrote
    with dvc_config_path.open("r+", encoding="utf8") as f:
        generated = f.read()
        f.seek(0, 0)
        f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{generated}")
    return True
|
|
||||||
|
|
||||||
|
|
||||||
def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
    """Check that the requested workflow can be turned into a DVC config.

    Fails (msg.fail with exits=1) if no workflows are defined at all, or if a
    workflow name was given that is not among the available ones. If no
    workflow name was given, warns that the first one will be used.

    workflows (List[str]): Names of the available workflows.
    workflow (Optional[str]): The name of the workflow to convert.
    """
    if not workflows:
        nothing_defined = (
            f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, "
            f"define at least one list of commands."
        )
        msg.fail(nothing_defined, exits=1)
    is_unknown = workflow is not None and workflow not in workflows
    if is_unknown:
        available = ", ".join(workflows)
        msg.fail(
            f"Workflow '{workflow}' not defined in {PROJECT_FILE}. "
            f"Available workflows: {available}",
            exits=1,
        )
    if not workflow:
        first = workflows[0]
        msg.warn(
            f"No workflow specified for DVC pipeline. Using the first workflow "
            f"defined in {PROJECT_FILE}: '{first}'"
        )
|
|
||||||
|
|
||||||
|
|
||||||
def ensure_dvc(project_dir: Path) -> None:
    """Check that `dvc --version` can be launched and that the project
    directory contains a ".dvc" entry. Either failed check is reported via
    msg.fail with exits=1.

    project_dir (Path): The project directory to check.
    """
    version_cmd = ["dvc", "--version"]
    try:
        subprocess.run(version_cmd, stdout=subprocess.DEVNULL)
    except Exception:
        not_installed = (
            "To use spaCy projects with DVC (Data Version Control), DVC needs "
            "to be installed and the 'dvc' command needs to be available"
        )
        install_hint = (
            "You can install the Python package from pip (pip install dvc) or "
            "conda (conda install -c conda-forge dvc). For more details, see the "
            "documentation: https://dvc.org/doc/install"
        )
        msg.fail(not_installed, install_hint, exits=1)
    dvc_dir = project_dir / ".dvc"
    if dvc_dir.exists():
        return
    init_hint = (
        "To initialize a DVC project, you can run 'dvc init' in the project "
        "directory. For more details, see the documentation: "
        "https://dvc.org/doc/command-reference/init"
    )
    msg.fail("Project not initialized as a DVC project", init_hint, exits=1)
|
|
|
@ -1,67 +0,0 @@
|
||||||
from pathlib import Path
|
|
||||||
from wasabi import msg
|
|
||||||
from .remote_storage import RemoteStorage
|
|
||||||
from .remote_storage import get_command_hash
|
|
||||||
from .._util import project_cli, Arg, logger
|
|
||||||
from .._util import load_project_config
|
|
||||||
from .run import update_lockfile
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("pull")
def project_pull_cli(
    # fmt: off
    remote: str = Arg("default", help="Name or path of remote storage"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    # fmt: on
):
    """Retrieve available precomputed outputs from a remote storage.
    You can alias remotes in your project.yml by mapping them to storage paths.
    A storage can be anything that the smart-open library can upload to, e.g.
    AWS, Google Cloud Storage, SSH, local directories etc.

    DOCS: https://spacy.io/api/cli#project-pull
    """
    pulled = project_pull(project_dir, remote)
    for url, output_path in pulled:
        # A missing URL means nothing was fetched for this output: stay silent
        if url is None:
            continue
        msg.good(f"Pulled {output_path} from {url}")
|
|
||||||
|
|
||||||
|
|
||||||
def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
    """Pull the outputs of the project's commands from a remote storage.

    This is a generator: for every output of every command whose dependencies
    exist locally, it yields the result of storage.pull() together with the
    output path.

    project_dir (Path): The project directory.
    remote (str): Key in the config's "remotes" mapping, or the storage
        location itself if no such key exists.
    verbose (bool): Not used in this function body.
    YIELDS (tuple): (url, output_path) pairs, where url is the value returned
        by storage.pull() for that output.
    """
    # TODO: We don't have tests for this :(. It would take a bit of mockery to
    # set up. I guess see if it breaks first?
    config = load_project_config(project_dir)
    # Resolve a remote alias from the config; otherwise use the value as given
    if remote in config.get("remotes", {}):
        remote = config["remotes"][remote]
    storage = RemoteStorage(project_dir, remote)
    commands = list(config.get("commands", []))
    # We use a while loop here because we don't know how the commands
    # will be ordered. A command might need dependencies from one that's later
    # in the list.
    while commands:
        # Iterate over a copy, since a command is popped from the list below
        for i, cmd in enumerate(list(commands)):
            logger.debug("CMD: %s.", cmd["name"])
            deps = [project_dir / dep for dep in cmd.get("deps", [])]
            if all(dep.exists() for dep in deps):
                cmd_hash = get_command_hash("", "", deps, cmd["script"])
                for output_path in cmd.get("outputs", []):
                    url = storage.pull(output_path, command_hash=cmd_hash)
                    logger.debug(
                        "URL: %s for %s with command hash %s",
                        url,
                        output_path,
                        cmd_hash,
                    )
                    yield url, output_path

                # Only record the command in the lockfile if every one of its
                # outputs is now present locally
                out_locs = [project_dir / out for out in cmd.get("outputs", [])]
                if all(loc.exists() for loc in out_locs):
                    update_lockfile(project_dir, cmd)
                # We remove the command from the list here, and break, so that
                # we iterate over the loop again.
                commands.pop(i)
                break
            else:
                logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"])
        else:
            # If we didn't break the for loop, break the while loop.
            break
|
|
|
@ -1,69 +0,0 @@
|
||||||
from pathlib import Path
|
|
||||||
from wasabi import msg
|
|
||||||
from .remote_storage import RemoteStorage
|
|
||||||
from .remote_storage import get_content_hash, get_command_hash
|
|
||||||
from .._util import load_project_config
|
|
||||||
from .._util import project_cli, Arg, logger
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command("push")
def project_push_cli(
    # fmt: off
    remote: str = Arg("default", help="Name or path of remote storage"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    # fmt: on
):
    """Persist outputs to a remote storage. You can alias remotes in your
    project.yml by mapping them to storage paths. A storage can be anything that
    the smart-open library can upload to, e.g. AWS, Google Cloud Storage, SSH,
    local directories etc.

    DOCS: https://spacy.io/api/cli#project-push
    """
    pushed = project_push(project_dir, remote)
    for output_path, url in pushed:
        if url is not None:
            msg.good(f"Pushed {output_path} to {url}")
            continue
        # No URL came back for this output, so report it as skipped
        msg.info(f"Skipping {output_path}")
|
|
||||||
|
|
||||||
|
|
||||||
def project_push(project_dir: Path, remote: str):
    """Upload the existing, non-empty outputs of the project's commands to a
    remote storage. Commands with a missing dependency are skipped entirely.

    project_dir (Path): The project directory.
    remote (str): Key in the config's "remotes" mapping, or the storage
        location itself if no such key exists.
    YIELDS (tuple): (output_path, url) pairs, where url is the value returned
        by storage.push() for that output.
    """
    config = load_project_config(project_dir)
    remotes = config.get("remotes", {})
    if remote in remotes:
        remote = config["remotes"][remote]
    storage = RemoteStorage(project_dir, remote)
    for cmd in config.get("commands", []):
        logger.debug("CMD: %s", cmd["name"])
        deps = [project_dir / dep for dep in cmd.get("deps", [])]
        if not all(dep.exists() for dep in deps):
            logger.debug("Dependency missing. Skipping %s outputs", cmd["name"])
            continue
        # The dependency paths were already built for the check above
        cmd_hash = get_command_hash("", "", deps, cmd["script"])
        logger.debug("CMD_HASH: %s", cmd_hash)
        for output_path in cmd.get("outputs", []):
            output_loc = project_dir / output_path
            if not output_loc.exists() or not _is_not_empty_dir(output_loc):
                continue
            content_hash = get_content_hash(output_loc)
            url = storage.push(
                output_path, command_hash=cmd_hash, content_hash=content_hash
            )
            logger.debug(
                "URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash
            )
            yield output_path, url
|
|
||||||
|
|
||||||
|
|
||||||
def _is_not_empty_dir(loc: Path):
    """Return False only if `loc` is a directory that, recursively, contains
    nothing but (empty) directories. Anything that is not a directory counts
    as non-empty.

    loc (Path): The path to inspect.
    RETURNS (bool): Whether the path is something other than an empty
        directory tree.
    """
    if loc.is_dir():
        # A directory is non-empty as soon as one child is
        return any(_is_not_empty_dir(entry) for entry in loc.iterdir())
    return True
|
|
|
@ -1,205 +0,0 @@
|
||||||
from typing import Optional, List, Dict, TYPE_CHECKING
|
|
||||||
import os
|
|
||||||
import site
|
|
||||||
import hashlib
|
|
||||||
import urllib.parse
|
|
||||||
import tarfile
|
|
||||||
from pathlib import Path
|
|
||||||
from wasabi import msg
|
|
||||||
|
|
||||||
from .._util import get_hash, get_checksum, upload_file, download_file
|
|
||||||
from .._util import ensure_pathy, make_tempdir
|
|
||||||
from ...util import get_minor_version, ENV_VARS, check_bool_env_var
|
|
||||||
from ...git_info import GIT_VERSION
|
|
||||||
from ... import about
|
|
||||||
from ...errors import Errors
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
from pathy import FluidPath # noqa: F401
|
|
||||||
|
|
||||||
|
|
||||||
class RemoteStorage:
    """Push and pull outputs to and from a remote file storage.

    Remotes can be anything that `smart-open` can support: AWS, GCS, file system,
    ssh, etc.
    """

    def __init__(self, project_root: Path, url: str, *, compression="gz"):
        """Set up the storage.

        project_root (Path): Local project root that pushed and pulled paths
            are relative to.
        url (str): Location of the remote storage, converted with ensure_pathy.
        compression (str): Suffix used in the tarfile mode string ("w:gz",
            "r:gz"). A falsy value means plain "w" / "r".
        """
        self.root = project_root
        self.url = ensure_pathy(url)
        self.compression = compression

    def push(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
        """Compress a file or directory within a project and upload it to a remote
        storage. If an object exists at the full URL, nothing is done.

        Within the remote storage, files are addressed by their project path
        (url encoded) and two user-supplied hashes, representing their creation
        context and their file contents. If the URL already exists, the data is
        not uploaded. Paths are archived and compressed prior to upload.

        path (Path): Path of the file or directory, relative to the project root.
        command_hash (str): Hash representing the creation context.
        content_hash (str): Hash representing the file contents.
        RETURNS (FluidPath): The URL the data is stored under.
        RAISES (IOError): If the local path does not exist.
        """
        loc = self.root / path
        if not loc.exists():
            raise IOError(f"Cannot push {loc}: does not exist.")
        url = self.make_url(path, command_hash, content_hash)
        if url.exists():
            return url
        tmp: Path
        with make_tempdir() as tmp:
            tar_loc = tmp / self.encode_name(str(path))
            mode_string = f"w:{self.compression}" if self.compression else "w"
            with tarfile.open(tar_loc, mode=mode_string) as tar_file:
                # Store the member under its project-relative path, so that
                # pull() can extract it relative to the project root
                tar_file.add(str(loc), arcname=str(path))
            upload_file(tar_loc, url)
        return url

    def pull(
        self,
        path: Path,
        *,
        command_hash: Optional[str] = None,
        content_hash: Optional[str] = None,
    ) -> Optional["FluidPath"]:
        """Retrieve a file from the remote cache. If the file already exists,
        nothing is done.

        If the command_hash and/or content_hash are specified, only matching
        results are considered.

        path (Path): Path of the file or directory, relative to the project root.
        command_hash (Optional[str]): Hash representing the creation context.
        content_hash (Optional[str]): Hash representing the file contents.
        RETURNS (Optional[FluidPath]): The URL that was downloaded, or None if
            the destination already exists or no matching file was found.
        RAISES (ValueError): If the archive contains a member that would be
            extracted outside of the project root.
        """
        dest = self.root / path
        if dest.exists():
            return None
        url = self.find(path, command_hash=command_hash, content_hash=content_hash)
        if url is None:
            return url
        else:
            # Make sure the destination exists
            if not dest.parent.exists():
                dest.parent.mkdir(parents=True)
            tmp: Path
            with make_tempdir() as tmp:
                tar_loc = tmp / url.parts[-1]
                download_file(url, tar_loc)
                mode_string = f"r:{self.compression}" if self.compression else "r"
                with tarfile.open(tar_loc, mode=mode_string) as tar_file:
                    # This requires that the path is added correctly, relative
                    # to root. This is how we set things up in push()
                    self._safe_extract(tar_file, self.root)
        return url

    @staticmethod
    def _is_within_directory(directory, target) -> bool:
        """Check whether `target` is `directory` itself or located inside it.

        The comparison is done on whole path components. A plain string prefix
        check would wrongly accept a sibling such as "/proj-evil" as being
        inside "/proj".
        """
        abs_directory = os.path.abspath(directory)
        abs_target = os.path.abspath(target)
        try:
            common = os.path.commonpath([abs_directory, abs_target])
        except ValueError:
            # Raised if the paths have nothing in common to compare, e.g.
            # different drives on Windows: definitely not within the directory
            return False
        return common == abs_directory

    @staticmethod
    def _safe_extract(tar, path) -> None:
        """Extract all members of `tar` into `path`, refusing archives with
        members that resolve outside of `path` (CVE-2007-4559, directory
        traversal vulnerability). All members are checked before anything is
        extracted.
        """
        for member in tar.getmembers():
            member_path = os.path.join(path, member.name)
            if not RemoteStorage._is_within_directory(path, member_path):
                raise ValueError(Errors.E852)
        tar.extractall(path)

    def find(
        self,
        path: Path,
        *,
        command_hash: Optional[str] = None,
        content_hash: Optional[str] = None,
    ) -> Optional["FluidPath"]:
        """Find the best matching version of a file within the storage,
        or `None` if no match can be found. If both the creation and content hash
        are specified, only exact matches will be returned. Otherwise, the most
        recent matching file is preferred.

        path (Path): Path of the file or directory, relative to the project root.
        command_hash (Optional[str]): Hash representing the creation context.
        content_hash (Optional[str]): Hash representing the file contents.
        RETURNS (Optional[FluidPath]): The URL of the match, or None.
        """
        name = self.encode_name(str(path))
        urls = []
        if command_hash is not None and content_hash is not None:
            url = self.url / name / command_hash / content_hash
            urls = [url] if url.exists() else []
        elif command_hash is not None:
            if (self.url / name / command_hash).exists():
                urls = list((self.url / name / command_hash).iterdir())
        else:
            # No command hash: collect the entries of every command hash dir
            if (self.url / name).exists():
                for sub_dir in (self.url / name).iterdir():
                    urls.extend(sub_dir.iterdir())
            if content_hash is not None:
                urls = [url for url in urls if url.parts[-1] == content_hash]
        if len(urls) >= 2:
            try:
                urls.sort(key=lambda x: x.stat().last_modified)  # type: ignore
            except Exception:
                # Best effort: fall back to the unsorted order, but tell the user
                msg.warn(
                    "Unable to sort remote files by last modified. The file(s) "
                    "pulled from the cache may not be the most recent."
                )
        return urls[-1] if urls else None

    def make_url(self, path: Path, command_hash: str, content_hash: str) -> "FluidPath":
        """Construct a URL from a subpath, a creation hash and a content hash."""
        return self.url / self.encode_name(str(path)) / command_hash / content_hash

    def encode_name(self, name: str) -> str:
        """Encode a subpath into a URL-safe name."""
        return urllib.parse.quote_plus(name)
|
|
||||||
|
|
||||||
|
|
||||||
def get_content_hash(loc: Path) -> str:
    """Return the content hash of a path by delegating to get_checksum.

    loc (Path): The path to hash.
    RETURNS (str): The checksum returned by get_checksum.
    """
    checksum = get_checksum(loc)
    return checksum
|
|
||||||
|
|
||||||
|
|
||||||
def get_command_hash(
    site_hash: str, env_hash: str, deps: List[Path], cmd: List[str]
) -> str:
    """Create a hash representing the execution of a command.

    The hash is the MD5 hex digest of the concatenation of: the spaCy version
    (the Git version if the PROJECT_USE_GIT_VERSION env var is set, otherwise
    the minor version), the given site and env hashes, the checksums of the
    dependencies in sorted order, and the command strings.

    site_hash (str): Hash of the installed packages.
    env_hash (str): Hash of the relevant environment variables.
    deps (List[Path]): Paths of the command's dependencies.
    cmd (List[str]): The command's script lines.
    RETURNS (str): The hex digest.
    """
    use_git_version = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
    if use_git_version:
        version_part = GIT_VERSION
    else:
        version_part = str(get_minor_version(about.__version__) or "")
    parts = [version_part, site_hash, env_hash]
    # Sort the dependencies so the hash doesn't depend on their listed order
    parts += [get_checksum(dep) for dep in sorted(deps)]
    parts += cmd
    digest = hashlib.md5("".join(parts).encode("utf8"))
    return digest.hexdigest()
|
|
||||||
|
|
||||||
|
|
||||||
def get_site_hash() -> str:
    """Hash the current Python environment's site-packages contents, including
    the name and version of the libraries. The list we're hashing is built
    from the names of the "*.dist-info" entries found in the site directories.

    RETURNS (str): MD5 hex digest of the sorted, concatenated package names.
    """
    site_dirs = list(site.getsitepackages())
    if site.ENABLE_USER_SITE:
        # getusersitepackages() returns a single path string, so it has to be
        # appended: extend() would add every character as its own "directory"
        site_dirs.append(site.getusersitepackages())
    packages = set()
    for site_dir in site_dirs:
        site_path = Path(site_dir)
        # A configured site directory (typically the user site) doesn't have
        # to exist on disk, and iterdir() would raise for a missing one
        if not site_path.is_dir():
            continue
        for subpath in site_path.iterdir():
            entry_name = subpath.parts[-1]
            if entry_name.endswith("dist-info"):
                packages.add(entry_name.replace(".dist-info", ""))
    package_bytes = "".join(sorted(packages)).encode("utf8")
    # hashlib has no md5sum(): the constructor is called md5()
    return hashlib.md5(package_bytes).hexdigest()
|
|
||||||
|
|
||||||
|
|
||||||
def get_env_hash(env: Dict[str, str]) -> str:
    """Hash the environment variables that will be passed into the commands.

    A value starting with "$" is treated as a reference to the current
    os.environ: $ENV_VAR is replaced by os.environ[ENV_VAR], or by an empty
    string if that variable is not set. All other values are used as they are.

    env (Dict[str, str]): Variable names mapped to values or $-references.
    RETURNS (str): The hash of the resolved mapping, computed by get_hash.
    """
    resolved = {
        key: os.environ.get(value[1:], "") if value.startswith("$") else value
        for key, value in env.items()
    }
    return get_hash(resolved)
|
|
|
@ -1,360 +0,0 @@
|
||||||
from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple
|
|
||||||
import os.path
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from wasabi import msg
|
|
||||||
from wasabi.util import locale_escape
|
|
||||||
import sys
|
|
||||||
import srsly
|
|
||||||
import typer
|
|
||||||
|
|
||||||
from ... import about
|
|
||||||
from ...git_info import GIT_VERSION
|
|
||||||
from ...util import working_dir, run_command, split_command, is_cwd, join_command
|
|
||||||
from ...util import SimpleFrozenList, is_minor_version_match, ENV_VARS
|
|
||||||
from ...util import check_bool_env_var, SimpleFrozenDict
|
|
||||||
from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
|
|
||||||
from .._util import get_checksum, project_cli, Arg, Opt, COMMAND, parse_config_overrides
|
|
||||||
|
|
||||||
|
|
||||||
@project_cli.command(
    "run",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def project_run_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    subcommand: str = Arg(None, help=f"Name of command defined in the {PROJECT_FILE}"),
    project_dir: Path = Arg(Path.cwd(), help="Location of project directory. Defaults to current working directory.", exists=True, file_okay=False),
    force: bool = Opt(False, "--force", "-F", help="Force re-running steps, even if nothing changed"),
    dry: bool = Opt(False, "--dry", "-D", help="Perform a dry run and don't execute scripts"),
    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
    # fmt: on
):
    """Run a named command or workflow defined in the project.yml. If a workflow
    name is specified, all commands in the workflow are run, in order. If
    commands define dependencies and/or outputs, they will only be re-run if
    state has changed.

    DOCS: https://spacy.io/api/cli#project-run
    """
    # Without a subcommand (or with an explicit --help) nothing is executed;
    # the help generated from the project config is printed instead.
    if show_help or not subcommand:
        print_run_help(project_dir, subcommand)
        return
    # Any extra CLI arguments are interpreted as config overrides.
    cli_overrides = parse_config_overrides(ctx.args)
    project_run(
        project_dir,
        subcommand,
        overrides=cli_overrides,
        force=force,
        dry=dry,
    )
|
|
||||||
|
|
||||||
|
|
||||||
def project_run(
    project_dir: Path,
    subcommand: str,
    *,
    overrides: Dict[str, Any] = SimpleFrozenDict(),
    force: bool = False,
    dry: bool = False,
    capture: bool = False,
    skip_requirements_check: bool = False,
) -> None:
    """Run a named command or workflow defined in the project config. A
    workflow is executed by running each of its commands in order through
    this same function. A single command is only executed if check_rerun
    reports a change or if force is set; after a real (non-dry) run the
    lockfile is updated.

    project_dir (Path): Path to project directory.
    subcommand (str): Name of command or workflow to run.
    overrides (Dict[str, Any]): Optional config overrides.
    force (bool): Force re-running, even if nothing changed.
    dry (bool): Perform a dry run and don't execute commands.
    capture (bool): Whether to capture the output and errors of individual
        commands. If False, the stdout and stderr will not be redirected, and
        if there's an error, sys.exit will be called with the return code.
        Use capture=False to turn over execution to the command, and
        capture=True to run the command more like a function.
    skip_requirements_check (bool): Whether to skip the requirements check.
    """
    config = load_project_config(project_dir, overrides=overrides)
    commands = {}
    for cmd_config in config.get("commands", []):
        commands[cmd_config["name"]] = cmd_config
    workflows = config.get("workflows", {})
    validate_subcommand(list(commands), list(workflows.keys()), subcommand)

    # The requirements check can be disabled by the caller or by the project
    # config, and is skipped when the project has no requirements.txt.
    req_path = project_dir / "requirements.txt"
    requirements_enabled = not skip_requirements_check and config.get(
        "check_requirements", True
    )
    if requirements_enabled and os.path.exists(req_path):
        with req_path.open() as requirements_file:
            _check_requirements([line.strip() for line in requirements_file])

    if subcommand in workflows:
        msg.info(f"Running workflow '{subcommand}'")
        for step in workflows[subcommand]:
            # Requirements were already handled above, so the individual
            # steps don't check them again.
            project_run(
                project_dir,
                step,
                overrides=overrides,
                force=force,
                dry=dry,
                capture=capture,
                skip_requirements_check=True,
            )
        return

    cmd = commands[subcommand]
    # In a dry run a missing dependency is reported but doesn't exit.
    missing_dep_exits = None if dry else 1
    for dep in cmd.get("deps", []):
        if (project_dir / dep).exists():
            continue
        err = f"Missing dependency specified by command '{subcommand}': {dep}"
        err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
        msg.fail(err, err_help, exits=missing_dep_exits)
    check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
    with working_dir(project_dir) as current_dir:
        msg.divider(subcommand)
        rerun = check_rerun(current_dir, cmd, check_spacy_commit=check_spacy_commit)
        if rerun or force:
            run_commands(cmd["script"], dry=dry, capture=capture)
            if not dry:
                update_lockfile(current_dir, cmd)
        else:
            msg.info(f"Skipping '{cmd['name']}': nothing changed")
|
|
||||||
|
|
||||||
|
|
||||||
def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
    """Print a CLI-style help text built from the project config.

    project_dir (Path): The project directory.
    subcommand (Optional[str]): The subcommand or None. With a subcommand,
        the help for that command or workflow is printed. Without one, the
        project title and the available commands and workflows are listed.
    """
    config = load_project_config(project_dir)
    config_commands = config.get("commands", [])
    commands = {cmd["name"]: cmd for cmd in config_commands}
    workflows = config.get("workflows", {})
    # The project location is only shown in usage lines if it isn't the cwd.
    project_loc = "" if is_cwd(project_dir) else project_dir

    if not subcommand:
        # Top-level help: title plus tables of commands and workflows.
        print("")
        title = config.get("title")
        if title:
            print(f"{locale_escape(title)}\n")
        if config_commands:
            print(f"Available commands in {PROJECT_FILE}")
            print(f"Usage: {COMMAND} project run [COMMAND] {project_loc}")
            command_rows = [
                (cmd["name"], cmd.get("help", "")) for cmd in config_commands
            ]
            msg.table(command_rows)
        if workflows:
            print(f"Available workflows in {PROJECT_FILE}")
            print(f"Usage: {COMMAND} project run [WORKFLOW] {project_loc}")
            workflow_rows = [
                (name, " -> ".join(steps)) for name, steps in workflows.items()
            ]
            msg.table(workflow_rows)
        return

    validate_subcommand(list(commands.keys()), list(workflows.keys()), subcommand)
    print(f"Usage: {COMMAND} project run {subcommand} {project_loc}")
    if subcommand in commands:
        help_text = commands[subcommand].get("help")
        if help_text:
            print(f"\n{help_text}\n")
    elif subcommand in workflows:
        steps = workflows[subcommand]
        print(f"\nWorkflow consisting of {len(steps)} commands:")
        steps_data = []
        for number, step in enumerate(steps, start=1):
            steps_data.append((f"{number}. {step}", commands[step].get("help", "")))
        msg.table(steps_data)
        help_cmd = f"{COMMAND} project run [COMMAND] {project_loc} --help"
        print(f"For command details, run: {help_cmd}")
|
|
||||||
|
|
||||||
|
|
||||||
def run_commands(
    commands: Iterable[str] = SimpleFrozenList(),
    silent: bool = False,
    dry: bool = False,
    capture: bool = False,
) -> None:
    """Run a sequence of commands in a subprocess, in order.

    commands (List[str]): The string commands.
    silent (bool): Don't print the commands.
    dry (bool): Perform a dry run and don't execute anything.
    capture (bool): Whether to capture the output and errors of individual
        commands. If False, the stdout and stderr will not be redirected, and
        if there's an error, sys.exit will be called with the return code.
        Use capture=False to turn over execution to the command, and
        capture=True to run the command more like a function.
    """
    python_aliases = ("python", "python3")
    pip_aliases = ("pip", "pip3")
    for raw_command in commands:
        command = split_command(raw_command)
        program = command[0] if len(command) else None
        # Commands referring to "python"/"pip" are pointed at the interpreter
        # spaCy itself is running with, so they use the same environment
        # rather than some other Python/pip. This also keeps configs portable
        # between users whose systems use "python" vs. "python3".
        if program in python_aliases:
            command[0] = sys.executable
        elif program in pip_aliases:
            command = [sys.executable, "-m", "pip", *command[1:]]
        if not silent:
            print(f"Running command: {join_command(command)}")
        if not dry:
            run_command(command, capture=capture)
|
|
||||||
|
|
||||||
|
|
||||||
def validate_subcommand(
    commands: Sequence[str], workflows: Sequence[str], subcommand: str
) -> None:
    """Check that a subcommand is defined as a command or a workflow. If it
    isn't, the problem is reported via msg.fail with exits=1.

    commands (Sequence[str]): The available commands.
    workflows (Sequence[str]): The available workflows.
    subcommand (str): The subcommand.
    """
    if not commands and not workflows:
        msg.fail(f"No commands or workflows defined in {PROJECT_FILE}", exits=1)
    if subcommand in commands or subcommand in workflows:
        return
    # Unknown name: collect hints about what the user could run instead.
    hints = []
    if subcommand in ["assets", "asset"]:
        hints.append("Did you mean to run: python -m spacy project assets?")
    if commands:
        hints.append(f"Available commands: {', '.join(commands)}")
    if workflows:
        hints.append(f"Available workflows: {', '.join(workflows)}")
    msg.fail(
        f"Can't find command or workflow '{subcommand}' in {PROJECT_FILE}",
        ". ".join(hints),
        exits=1,
    )
|
|
||||||
|
|
||||||
|
|
||||||
def check_rerun(
    project_dir: Path,
    command: Dict[str, Any],
    *,
    check_spacy_version: bool = True,
    check_spacy_commit: bool = False,
) -> bool:
    """Decide whether a command has to be run again, based on its settings
    and on what the lockfile recorded for it.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    check_spacy_version (bool): Re-run if the spaCy version stored in the
        lockfile is not a minor version match for the current version.
    check_spacy_commit (bool): Re-run if the spaCy git version stored in the
        lockfile differs from the current one.
    RETURNS (bool): Whether to re-run the command.
    """
    # Commands flagged with no_skip are never skipped.
    if command.get("no_skip", False):
        return True
    lock_path = project_dir / PROJECT_LOCK
    # No lockfile means nothing is known about previous runs.
    if not lock_path.exists():
        return True
    data = srsly.read_yaml(lock_path)
    name = command["name"]
    # The lockfile has no record of this particular command.
    if name not in data:
        return True
    entry = data[name]
    # A command without recorded outputs is always run, because otherwise it
    # would always be skipped.
    if not entry.get("outs", []):
        return True
    locked_version = entry.get("spacy_version")
    locked_commit = entry.get("spacy_git_version")
    if check_spacy_version and not is_minor_version_match(
        locked_version, about.__version__
    ):
        info = f"({locked_version} in {PROJECT_LOCK}, {about.__version__} current)"
        msg.info(f"Re-running '{name}': spaCy minor version changed {info}")
        return True
    if check_spacy_commit and locked_commit != GIT_VERSION:
        info = f"({locked_commit} in {PROJECT_LOCK}, {GIT_VERSION} current)"
        msg.info(f"Re-running '{name}': spaCy commit changed {info}")
        return True
    # Compare the stored entry with the entry the command would produce now.
    # The version fields were handled above and are left out of the hashes,
    # so equal hashes mean deps, outputs and script are all unchanged.
    exclude = ["spacy_version", "spacy_git_version"]
    current_hash = get_hash(get_lock_entry(project_dir, command), exclude=exclude)
    locked_hash = get_hash(entry, exclude=exclude)
    return current_hash != locked_hash
|
|
||||||
|
|
||||||
|
|
||||||
def update_lockfile(project_dir: Path, command: Dict[str, Any]) -> None:
    """Record a command in the lockfile after it has been run. A missing
    lockfile is created first; the entry stored under the command's name
    is the one produced by get_lock_entry.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    """
    lock_path = project_dir / PROJECT_LOCK
    if lock_path.exists():
        data = srsly.read_yaml(lock_path)
    else:
        # Create the (empty) lockfile before the entry is computed.
        srsly.write_yaml(lock_path, {})
        data = {}
    entry = get_lock_entry(project_dir, command)
    data[command["name"]] = entry
    srsly.write_yaml(lock_path, data)
|
|
||||||
|
|
||||||
|
|
||||||
def get_lock_entry(project_dir: Path, command: Dict[str, Any]) -> Dict[str, Any]:
    """Build the lockfile entry for a command. The entry holds the command
    line, the script (command steps), the dependencies and outputs with their
    paths and checksums, and the current spaCy version and git version. The
    format is based on the dvc.lock files, to keep things consistent.

    project_dir (Path): The current project directory.
    command (Dict[str, Any]): The command, as defined in the project.yml.
    RETURNS (Dict[str, Any]): The lockfile entry.
    """
    deps = get_fileinfo(project_dir, command.get("deps", []))
    cached_outs = get_fileinfo(project_dir, command.get("outputs", []))
    uncached_outs = get_fileinfo(project_dir, command.get("outputs_no_cache", []))
    entry: Dict[str, Any] = {}
    entry["cmd"] = f"{COMMAND} run {command['name']}"
    entry["script"] = command["script"]
    entry["deps"] = deps
    # Cached and non-cached outputs are recorded in one combined list.
    entry["outs"] = cached_outs + uncached_outs
    entry["spacy_version"] = about.__version__
    entry["spacy_git_version"] = GIT_VERSION
    return entry
|
|
||||||
|
|
||||||
|
|
||||||
def get_fileinfo(project_dir: Path, paths: List[str]) -> List[Dict[str, Optional[str]]]:
    """Collect path and checksum information for a list of paths
    (dependencies or outputs), relative to the project directory.

    project_dir (Path): The current project directory.
    paths (List[str]): The file paths.
    RETURNS (List[Dict[str, Optional[str]]]): One {"path", "md5"} dict per
        path, in input order. "md5" is None if the path doesn't exist.
    """
    entries = []
    for rel_path in paths:
        target = project_dir / rel_path
        checksum = None
        if target.exists():
            checksum = get_checksum(target)
        entries.append({"path": rel_path, "md5": checksum})
    return entries
|
|
||||||
|
|
||||||
|
|
||||||
def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
    """Check whether requirements are installed and free of version
    conflicts, and print a warning listing every problem that was found.

    requirements (List[str]): List of requirements.
    RETURNS (Tuple[bool, bool]): Whether (1) any required packages were not
        found, (2) any packages with version conflicts exist.
    """
    import pkg_resources

    missing_reports: List[str] = []
    conflict_reports: List[str] = []

    for requirement in requirements:
        try:
            pkg_resources.require(requirement)
        except pkg_resources.DistributionNotFound as not_found:
            missing_reports.append(not_found.report())
        except pkg_resources.VersionConflict as conflict:
            conflict_reports.append(conflict.report())
        except Exception:
            # Anything else is only warned about; the remaining requirements
            # are still checked.
            msg.warn(
                f"Unable to check requirement: {requirement} "
                "Checks are currently limited to requirement specifiers "
                "(PEP 508)"
            )

    if missing_reports or conflict_reports:
        msg.warn(
            title="Missing requirements or requirement conflicts detected. Make sure your Python environment is set up "
            "correctly and you installed all requirements specified in your project's requirements.txt: "
        )
        for report in [*missing_reports, *conflict_reports]:
            msg.text(report)

    return bool(missing_reports), bool(conflict_reports)
|
|
|
@ -3,7 +3,7 @@ the docs and the init config command. It encodes various best practices and
|
||||||
can help generate the best possible configuration, given a user's requirements. #}
|
can help generate the best possible configuration, given a user's requirements. #}
|
||||||
{%- set use_transformer = hardware != "cpu" and transformer_data -%}
|
{%- set use_transformer = hardware != "cpu" and transformer_data -%}
|
||||||
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
|
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
|
||||||
{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
|
{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "span_finder", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
|
||||||
[paths]
|
[paths]
|
||||||
train = null
|
train = null
|
||||||
dev = null
|
dev = null
|
||||||
|
@ -28,7 +28,7 @@ lang = "{{ lang }}"
|
||||||
tok2vec/transformer. #}
|
tok2vec/transformer. #}
|
||||||
{%- set with_accuracy_or_transformer = (use_transformer or with_accuracy) -%}
|
{%- set with_accuracy_or_transformer = (use_transformer or with_accuracy) -%}
|
||||||
{%- set textcat_needs_features = has_textcat and with_accuracy_or_transformer -%}
|
{%- set textcat_needs_features = has_textcat and with_accuracy_or_transformer -%}
|
||||||
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
|
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "span_finder" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
|
||||||
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%}
|
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%}
|
||||||
{%- else -%}
|
{%- else -%}
|
||||||
{%- set full_pipeline = components -%}
|
{%- set full_pipeline = components -%}
|
||||||
|
@ -127,6 +127,30 @@ grad_factor = 1.0
|
||||||
@layers = "reduce_mean.v1"
|
@layers = "reduce_mean.v1"
|
||||||
{% endif -%}
|
{% endif -%}
|
||||||
|
|
||||||
|
{% if "span_finder" in components -%}
|
||||||
|
[components.span_finder]
|
||||||
|
factory = "span_finder"
|
||||||
|
max_length = 25
|
||||||
|
min_length = null
|
||||||
|
scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
|
||||||
|
spans_key = "sc"
|
||||||
|
threshold = 0.5
|
||||||
|
|
||||||
|
[components.span_finder.model]
|
||||||
|
@architectures = "spacy.SpanFinder.v1"
|
||||||
|
|
||||||
|
[components.span_finder.model.scorer]
|
||||||
|
@layers = "spacy.LinearLogistic.v1"
|
||||||
|
nO = 2
|
||||||
|
|
||||||
|
[components.span_finder.model.tok2vec]
|
||||||
|
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||||
|
grad_factor = 1.0
|
||||||
|
|
||||||
|
[components.span_finder.model.tok2vec.pooling]
|
||||||
|
@layers = "reduce_mean.v1"
|
||||||
|
{% endif -%}
|
||||||
|
|
||||||
{% if "spancat" in components -%}
|
{% if "spancat" in components -%}
|
||||||
[components.spancat]
|
[components.spancat]
|
||||||
factory = "spancat"
|
factory = "spancat"
|
||||||
|
@ -392,6 +416,27 @@ nO = null
|
||||||
width = ${components.tok2vec.model.encode.width}
|
width = ${components.tok2vec.model.encode.width}
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
|
{% if "span_finder" in components %}
|
||||||
|
[components.span_finder]
|
||||||
|
factory = "span_finder"
|
||||||
|
max_length = 25
|
||||||
|
min_length = null
|
||||||
|
scorer = {"@scorers":"spacy.span_finder_scorer.v1"}
|
||||||
|
spans_key = "sc"
|
||||||
|
threshold = 0.5
|
||||||
|
|
||||||
|
[components.span_finder.model]
|
||||||
|
@architectures = "spacy.SpanFinder.v1"
|
||||||
|
|
||||||
|
[components.span_finder.model.scorer]
|
||||||
|
@layers = "spacy.LinearLogistic.v1"
|
||||||
|
nO = 2
|
||||||
|
|
||||||
|
[components.span_finder.model.tok2vec]
|
||||||
|
@architectures = "spacy.Tok2VecListener.v1"
|
||||||
|
width = ${components.tok2vec.model.encode.width}
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
{% if "spancat" in components %}
|
{% if "spancat" in components %}
|
||||||
[components.spancat]
|
[components.spancat]
|
||||||
factory = "spancat"
|
factory = "spancat"
|
||||||
|
|
|
@ -1,15 +1,23 @@
|
||||||
from typing import Optional, Dict, Any, Union
|
|
||||||
from pathlib import Path
|
|
||||||
from wasabi import msg
|
|
||||||
import typer
|
|
||||||
import logging
|
import logging
|
||||||
import sys
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, Optional, Union
|
||||||
|
|
||||||
|
import typer
|
||||||
|
from wasabi import msg
|
||||||
|
|
||||||
from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
|
|
||||||
from ._util import import_code, setup_gpu
|
|
||||||
from ..training.loop import train as train_nlp
|
|
||||||
from ..training.initialize import init_nlp
|
|
||||||
from .. import util
|
from .. import util
|
||||||
|
from ..training.initialize import init_nlp
|
||||||
|
from ..training.loop import train as train_nlp
|
||||||
|
from ._util import (
|
||||||
|
Arg,
|
||||||
|
Opt,
|
||||||
|
app,
|
||||||
|
import_code,
|
||||||
|
parse_config_overrides,
|
||||||
|
setup_gpu,
|
||||||
|
show_validation_error,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@app.command(
|
@app.command(
|
||||||
|
|
|
@ -1,14 +1,21 @@
|
||||||
from typing import Tuple
|
|
||||||
from pathlib import Path
|
|
||||||
import sys
|
import sys
|
||||||
import requests
|
|
||||||
from wasabi import msg, Printer
|
|
||||||
import warnings
|
import warnings
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from wasabi import Printer, msg
|
||||||
|
|
||||||
from ._util import app
|
|
||||||
from .. import about
|
from .. import about
|
||||||
from ..util import get_package_version, get_installed_models, get_minor_version
|
from ..util import (
|
||||||
from ..util import get_package_path, get_model_meta, is_compatible_version
|
get_installed_models,
|
||||||
|
get_minor_version,
|
||||||
|
get_model_meta,
|
||||||
|
get_package_path,
|
||||||
|
get_package_version,
|
||||||
|
is_compatible_version,
|
||||||
|
)
|
||||||
|
from ._util import app
|
||||||
|
|
||||||
|
|
||||||
@app.command("validate")
|
@app.command("validate")
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
"""Helpers for Python and platform compatibility."""
|
"""Helpers for Python and platform compatibility."""
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from thinc.util import copy_array
|
from thinc.util import copy_array
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
|
@ -26,6 +26,9 @@ batch_size = 1000
|
||||||
[nlp.tokenizer]
|
[nlp.tokenizer]
|
||||||
@tokenizers = "spacy.Tokenizer.v1"
|
@tokenizers = "spacy.Tokenizer.v1"
|
||||||
|
|
||||||
|
[nlp.vectors]
|
||||||
|
@vectors = "spacy.Vectors.v1"
|
||||||
|
|
||||||
# The pipeline components and their models
|
# The pipeline components and their models
|
||||||
[components]
|
[components]
|
||||||
|
|
||||||
|
|
|
@ -4,15 +4,13 @@ spaCy's built in visualization suite for dependencies and named entities.
|
||||||
DOCS: https://spacy.io/api/top-level#displacy
|
DOCS: https://spacy.io/api/top-level#displacy
|
||||||
USAGE: https://spacy.io/usage/visualizers
|
USAGE: https://spacy.io/usage/visualizers
|
||||||
"""
|
"""
|
||||||
from typing import Union, Iterable, Optional, Dict, Any, Callable
|
|
||||||
import warnings
|
import warnings
|
||||||
|
from typing import Any, Callable, Dict, Iterable, Optional, Union
|
||||||
|
|
||||||
from .render import DependencyRenderer, EntityRenderer, SpanRenderer
|
|
||||||
from ..tokens import Doc, Span
|
|
||||||
from ..errors import Errors, Warnings
|
from ..errors import Errors, Warnings
|
||||||
from ..util import is_in_jupyter
|
from ..tokens import Doc, Span
|
||||||
from ..util import find_available_port
|
from ..util import find_available_port, is_in_jupyter
|
||||||
|
from .render import DependencyRenderer, EntityRenderer, SpanRenderer
|
||||||
|
|
||||||
_html = {}
|
_html = {}
|
||||||
RENDER_WRAPPER = None
|
RENDER_WRAPPER = None
|
||||||
|
@ -68,7 +66,7 @@ def render(
|
||||||
if jupyter or (jupyter is None and is_in_jupyter()):
|
if jupyter or (jupyter is None and is_in_jupyter()):
|
||||||
# return HTML rendered by IPython display()
|
# return HTML rendered by IPython display()
|
||||||
# See #4840 for details on span wrapper to disable mathjax
|
# See #4840 for details on span wrapper to disable mathjax
|
||||||
from IPython.core.display import display, HTML
|
from IPython.core.display import HTML, display
|
||||||
|
|
||||||
return display(HTML('<span class="tex2jax_ignore">{}</span>'.format(html)))
|
return display(HTML('<span class="tex2jax_ignore">{}</span>'.format(html)))
|
||||||
return html
|
return html
|
||||||
|
|
|
@ -1,15 +1,28 @@
|
||||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
||||||
import uuid
|
import uuid
|
||||||
import itertools
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
from ..util import escape_html, minify_html, registry
|
from ..util import escape_html, minify_html, registry
|
||||||
from .templates import TPL_DEP_ARCS, TPL_DEP_SVG, TPL_DEP_WORDS
|
from .templates import (
|
||||||
from .templates import TPL_DEP_WORDS_LEMMA, TPL_ENT, TPL_ENT_RTL, TPL_ENTS
|
TPL_DEP_ARCS,
|
||||||
from .templates import TPL_FIGURE, TPL_KB_LINK, TPL_PAGE, TPL_SPAN
|
TPL_DEP_SVG,
|
||||||
from .templates import TPL_SPAN_RTL, TPL_SPAN_SLICE, TPL_SPAN_SLICE_RTL
|
TPL_DEP_WORDS,
|
||||||
from .templates import TPL_SPAN_START, TPL_SPAN_START_RTL, TPL_SPANS
|
TPL_DEP_WORDS_LEMMA,
|
||||||
from .templates import TPL_TITLE
|
TPL_ENT,
|
||||||
|
TPL_ENT_RTL,
|
||||||
|
TPL_ENTS,
|
||||||
|
TPL_FIGURE,
|
||||||
|
TPL_KB_LINK,
|
||||||
|
TPL_PAGE,
|
||||||
|
TPL_SPAN,
|
||||||
|
TPL_SPAN_RTL,
|
||||||
|
TPL_SPAN_SLICE,
|
||||||
|
TPL_SPAN_SLICE_RTL,
|
||||||
|
TPL_SPAN_START,
|
||||||
|
TPL_SPAN_START_RTL,
|
||||||
|
TPL_SPANS,
|
||||||
|
TPL_TITLE,
|
||||||
|
)
|
||||||
|
|
||||||
DEFAULT_LANG = "en"
|
DEFAULT_LANG = "en"
|
||||||
DEFAULT_DIR = "ltr"
|
DEFAULT_DIR = "ltr"
|
||||||
|
@ -204,7 +217,7 @@ class SpanRenderer:
|
||||||
+ (self.offset_step * (len(entities) - 1))
|
+ (self.offset_step * (len(entities) - 1))
|
||||||
)
|
)
|
||||||
markup += self.span_template.format(
|
markup += self.span_template.format(
|
||||||
text=token["text"],
|
text=escape_html(token["text"]),
|
||||||
span_slices=slices,
|
span_slices=slices,
|
||||||
span_starts=starts,
|
span_starts=starts,
|
||||||
total_height=total_height,
|
total_height=total_height,
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
from .compat import Literal
|
from .compat import Literal
|
||||||
|
|
||||||
|
|
||||||
|
@ -215,6 +216,9 @@ class Warnings(metaclass=ErrorsWithCodes):
|
||||||
W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
|
W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
|
||||||
"`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
|
"`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
|
||||||
W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
|
W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
|
||||||
|
W125 = ("The StaticVectors key_attr is no longer used. To set a custom "
|
||||||
|
"key attribute for vectors, configure it through Vectors(attr=) or "
|
||||||
|
"'spacy init vectors --attr'")
|
||||||
|
|
||||||
|
|
||||||
class Errors(metaclass=ErrorsWithCodes):
|
class Errors(metaclass=ErrorsWithCodes):
|
||||||
|
@ -549,12 +553,12 @@ class Errors(metaclass=ErrorsWithCodes):
|
||||||
"during training, make sure to include it in 'annotating components'")
|
"during training, make sure to include it in 'annotating components'")
|
||||||
|
|
||||||
# New errors added in v3.x
|
# New errors added in v3.x
|
||||||
|
E849 = ("The vocab only supports {method} for vectors of type "
|
||||||
|
"spacy.vectors.Vectors, not {vectors_type}.")
|
||||||
E850 = ("The PretrainVectors objective currently only supports default or "
|
E850 = ("The PretrainVectors objective currently only supports default or "
|
||||||
"floret vectors, not {mode} vectors.")
|
"floret vectors, not {mode} vectors.")
|
||||||
E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
|
E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
|
||||||
"but found value of '{val}'.")
|
"but found value of '{val}'.")
|
||||||
E852 = ("The tar file pulled from the remote attempted an unsafe path "
|
|
||||||
"traversal.")
|
|
||||||
E853 = ("Unsupported component factory name '{name}'. The character '.' is "
|
E853 = ("Unsupported component factory name '{name}'. The character '.' is "
|
||||||
"not permitted in factory names.")
|
"not permitted in factory names.")
|
||||||
E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not "
|
E854 = ("Unable to set doc.ents. Check that the 'ents_filter' does not "
|
||||||
|
@ -738,8 +742,8 @@ class Errors(metaclass=ErrorsWithCodes):
|
||||||
"model from a shortcut, which is obsolete as of spaCy v3.0. To "
|
"model from a shortcut, which is obsolete as of spaCy v3.0. To "
|
||||||
"load the model, use its full name instead:\n\n"
|
"load the model, use its full name instead:\n\n"
|
||||||
"nlp = spacy.load(\"{full}\")\n\nFor more details on the available "
|
"nlp = spacy.load(\"{full}\")\n\nFor more details on the available "
|
||||||
"models, see the models directory: https://spacy.io/models. If you "
|
"models, see the models directory: https://spacy.io/models and if "
|
||||||
"want to create a blank model, use spacy.blank: "
|
"you want to create a blank model, use spacy.blank: "
|
||||||
"nlp = spacy.blank(\"{name}\")")
|
"nlp = spacy.blank(\"{name}\")")
|
||||||
E942 = ("Executing `after_{name}` callback failed. Expected the function to "
|
E942 = ("Executing `after_{name}` callback failed. Expected the function to "
|
||||||
"return an initialized nlp object but got: {value}. Maybe "
|
"return an initialized nlp object but got: {value}. Maybe "
|
||||||
|
@ -970,6 +974,15 @@ class Errors(metaclass=ErrorsWithCodes):
|
||||||
E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
|
E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
|
||||||
"or use `auto_select_port=True` to pick an available port automatically.")
|
"or use `auto_select_port=True` to pick an available port automatically.")
|
||||||
E1051 = ("'allow_overlap' can only be False when max_positive is 1, but found 'max_positive': {max_positive}.")
|
E1051 = ("'allow_overlap' can only be False when max_positive is 1, but found 'max_positive': {max_positive}.")
|
||||||
|
E1052 = ("Unable to copy spans: the character offsets for the span at "
|
||||||
|
"index {i} in the span group do not align with the tokenization "
|
||||||
|
"in the target doc.")
|
||||||
|
E1053 = ("Both 'min_length' and 'max_length' should be larger than 0, but found"
|
||||||
|
" 'min_length': {min_length}, 'max_length': {max_length}")
|
||||||
|
E1054 = ("The text, including whitespace, must match between reference and "
|
||||||
|
"predicted docs when training {component}.")
|
||||||
|
E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
|
||||||
|
"but only callbacks with one or three parameters are supported")
|
||||||
|
|
||||||
|
|
||||||
# Deprecated model shortcuts, only used in errors and warnings
|
# Deprecated model shortcuts, only used in errors and warnings
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
from .errors import Warnings
|
from .errors import Warnings
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,3 @@
|
||||||
|
from .candidate import Candidate, get_candidates, get_candidates_batch
|
||||||
from .kb import KnowledgeBase
|
from .kb import KnowledgeBase
|
||||||
from .kb_in_memory import InMemoryLookupKB
|
from .kb_in_memory import InMemoryLookupKB
|
||||||
from .candidate import Candidate, get_candidates, get_candidates_batch
|
|
||||||
|
|
|
@ -1,8 +1,11 @@
|
||||||
from .kb cimport KnowledgeBase
|
|
||||||
from libcpp.vector cimport vector
|
from libcpp.vector cimport vector
|
||||||
from ..typedefs cimport hash_t
|
|
||||||
|
|
||||||
# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
|
from ..typedefs cimport hash_t
|
||||||
|
from .kb cimport KnowledgeBase
|
||||||
|
|
||||||
|
|
||||||
|
# Object used by the Entity Linker that summarizes one entity-alias candidate
|
||||||
|
# combination.
|
||||||
cdef class Candidate:
|
cdef class Candidate:
|
||||||
cdef readonly KnowledgeBase kb
|
cdef readonly KnowledgeBase kb
|
||||||
cdef hash_t entity_hash
|
cdef hash_t entity_hash
|
||||||
|
|
|
@ -1,19 +1,31 @@
|
||||||
# cython: infer_types=True, profile=True
|
# cython: infer_types=True, profile=True
|
||||||
|
|
||||||
from typing import Iterable
|
from typing import Iterable
|
||||||
|
|
||||||
from .kb cimport KnowledgeBase
|
from .kb cimport KnowledgeBase
|
||||||
|
|
||||||
from ..tokens import Span
|
from ..tokens import Span
|
||||||
|
|
||||||
|
|
||||||
cdef class Candidate:
|
cdef class Candidate:
|
||||||
"""A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved
|
"""A `Candidate` object refers to a textual mention (`alias`) that may or
|
||||||
to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking
|
may not be resolved to a specific `entity` from a Knowledge Base. This
|
||||||
algorithm which will disambiguate the various candidates to the correct one.
|
will be used as input for the entity linking algorithm which will
|
||||||
|
disambiguate the various candidates to the correct one.
|
||||||
Each candidate (alias, entity) pair is assigned a certain prior probability.
|
Each candidate (alias, entity) pair is assigned a certain prior probability.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/kb/#candidate-init
|
DOCS: https://spacy.io/api/kb/#candidate-init
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):
|
def __init__(
|
||||||
|
self,
|
||||||
|
KnowledgeBase kb,
|
||||||
|
entity_hash,
|
||||||
|
entity_freq,
|
||||||
|
entity_vector,
|
||||||
|
alias_hash,
|
||||||
|
prior_prob
|
||||||
|
):
|
||||||
self.kb = kb
|
self.kb = kb
|
||||||
self.entity_hash = entity_hash
|
self.entity_hash = entity_hash
|
||||||
self.entity_freq = entity_freq
|
self.entity_freq = entity_freq
|
||||||
|
@ -56,7 +68,8 @@ cdef class Candidate:
|
||||||
|
|
||||||
def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
|
def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
|
||||||
"""
|
"""
|
||||||
Return candidate entities for a given mention and fetching appropriate entries from the index.
|
Return candidate entities for a given mention and fetching appropriate
|
||||||
|
entries from the index.
|
||||||
kb (KnowledgeBase): Knowledge base to query.
|
kb (KnowledgeBase): Knowledge base to query.
|
||||||
mention (Span): Entity mention for which to identify candidates.
|
mention (Span): Entity mention for which to identify candidates.
|
||||||
RETURNS (Iterable[Candidate]): Identified candidates.
|
RETURNS (Iterable[Candidate]): Identified candidates.
|
||||||
|
@ -64,9 +77,12 @@ def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
|
||||||
return kb.get_candidates(mention)
|
return kb.get_candidates(mention)
|
||||||
|
|
||||||
|
|
||||||
def get_candidates_batch(kb: KnowledgeBase, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
|
def get_candidates_batch(
|
||||||
|
kb: KnowledgeBase, mentions: Iterable[Span]
|
||||||
|
) -> Iterable[Iterable[Candidate]]:
|
||||||
"""
|
"""
|
||||||
Return candidate entities for the given mentions and fetching appropriate entries from the index.
|
Return candidate entities for the given mentions and fetching appropriate entries
|
||||||
|
from the index.
|
||||||
kb (KnowledgeBase): Knowledge base to query.
|
kb (KnowledgeBase): Knowledge base to query.
|
||||||
mentions (Iterable[Span]): Entity mentions for which to identify candidates.
|
mentions (Iterable[Span]): Entity mentions for which to identify candidates.
|
||||||
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
|
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
|
||||||
|
|
|
@ -2,8 +2,10 @@
|
||||||
|
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from libc.stdint cimport int64_t
|
from libc.stdint cimport int64_t
|
||||||
|
|
||||||
from ..vocab cimport Vocab
|
from ..vocab cimport Vocab
|
||||||
|
|
||||||
|
|
||||||
cdef class KnowledgeBase:
|
cdef class KnowledgeBase:
|
||||||
cdef Pool mem
|
cdef Pool mem
|
||||||
cdef readonly Vocab vocab
|
cdef readonly Vocab vocab
|
||||||
|
|
|
@ -2,17 +2,19 @@
|
||||||
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, Tuple, Union
|
from typing import Iterable, Tuple, Union
|
||||||
|
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
|
||||||
from .candidate import Candidate
|
from ..errors import Errors
|
||||||
from ..tokens import Span
|
from ..tokens import Span
|
||||||
from ..util import SimpleFrozenList
|
from ..util import SimpleFrozenList
|
||||||
from ..errors import Errors
|
from .candidate import Candidate
|
||||||
|
|
||||||
|
|
||||||
cdef class KnowledgeBase:
|
cdef class KnowledgeBase:
|
||||||
"""A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases,
|
"""A `KnowledgeBase` instance stores unique identifiers for entities and
|
||||||
to support entity linking of named entities to real-world concepts.
|
their textual aliases, to support entity linking of named entities to
|
||||||
|
real-world concepts.
|
||||||
This is an abstract class and requires its operations to be implemented.
|
This is an abstract class and requires its operations to be implemented.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/kb
|
DOCS: https://spacy.io/api/kb
|
||||||
|
@ -30,10 +32,13 @@ cdef class KnowledgeBase:
|
||||||
self.entity_vector_length = entity_vector_length
|
self.entity_vector_length = entity_vector_length
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
|
|
||||||
def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
|
def get_candidates_batch(
|
||||||
|
self, mentions: Iterable[Span]
|
||||||
|
) -> Iterable[Iterable[Candidate]]:
|
||||||
"""
|
"""
|
||||||
Return candidate entities for specified texts. Each candidate defines the entity, the original alias,
|
Return candidate entities for specified texts. Each candidate defines
|
||||||
and the prior probability of that alias resolving to that entity.
|
the entity, the original alias, and the prior probability of that
|
||||||
|
alias resolving to that entity.
|
||||||
If no candidate is found for a given text, an empty list is returned.
|
If no candidate is found for a given text, an empty list is returned.
|
||||||
mentions (Iterable[Span]): Mentions for which to get candidates.
|
mentions (Iterable[Span]): Mentions for which to get candidates.
|
||||||
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
|
RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
|
||||||
|
@ -42,14 +47,17 @@ cdef class KnowledgeBase:
|
||||||
|
|
||||||
def get_candidates(self, mention: Span) -> Iterable[Candidate]:
|
def get_candidates(self, mention: Span) -> Iterable[Candidate]:
|
||||||
"""
|
"""
|
||||||
Return candidate entities for specified text. Each candidate defines the entity, the original alias,
|
Return candidate entities for specified text. Each candidate defines
|
||||||
|
the entity, the original alias,
|
||||||
and the prior probability of that alias resolving to that entity.
|
and the prior probability of that alias resolving to that entity.
|
||||||
If no candidate is found for a given text, an empty list is returned.
|
If no candidate is found for a given text, an empty list is returned.
|
||||||
mention (Span): Mention for which to get candidates.
|
mention (Span): Mention for which to get candidates.
|
||||||
RETURNS (Iterable[Candidate]): Identified candidates.
|
RETURNS (Iterable[Candidate]): Identified candidates.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__)
|
Errors.E1045.format(
|
||||||
|
parent="KnowledgeBase", method="get_candidates", name=self.__name__
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]:
|
def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]:
|
||||||
|
@ -67,7 +75,9 @@ cdef class KnowledgeBase:
|
||||||
RETURNS (Iterable[float]): Vector for specified entity.
|
RETURNS (Iterable[float]): Vector for specified entity.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
Errors.E1045.format(parent="KnowledgeBase", method="get_vector", name=self.__name__)
|
Errors.E1045.format(
|
||||||
|
parent="KnowledgeBase", method="get_vector", name=self.__name__
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
def to_bytes(self, **kwargs) -> bytes:
|
def to_bytes(self, **kwargs) -> bytes:
|
||||||
|
@ -75,7 +85,9 @@ cdef class KnowledgeBase:
|
||||||
RETURNS (bytes): Current state as binary string.
|
RETURNS (bytes): Current state as binary string.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
Errors.E1045.format(parent="KnowledgeBase", method="to_bytes", name=self.__name__)
|
Errors.E1045.format(
|
||||||
|
parent="KnowledgeBase", method="to_bytes", name=self.__name__
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
def from_bytes(self, bytes_data: bytes, *, exclude: Tuple[str] = tuple()):
|
def from_bytes(self, bytes_data: bytes, *, exclude: Tuple[str] = tuple()):
|
||||||
|
@ -84,25 +96,35 @@ cdef class KnowledgeBase:
|
||||||
exclude (Tuple[str]): Properties to exclude when restoring KB.
|
exclude (Tuple[str]): Properties to exclude when restoring KB.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
Errors.E1045.format(parent="KnowledgeBase", method="from_bytes", name=self.__name__)
|
Errors.E1045.format(
|
||||||
|
parent="KnowledgeBase", method="from_bytes", name=self.__name__
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
def to_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
|
def to_disk(
|
||||||
|
self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()
|
||||||
|
) -> None:
|
||||||
"""
|
"""
|
||||||
Write KnowledgeBase content to disk.
|
Write KnowledgeBase content to disk.
|
||||||
path (Union[str, Path]): Target file path.
|
path (Union[str, Path]): Target file path.
|
||||||
exclude (Iterable[str]): List of components to exclude.
|
exclude (Iterable[str]): List of components to exclude.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
Errors.E1045.format(parent="KnowledgeBase", method="to_disk", name=self.__name__)
|
Errors.E1045.format(
|
||||||
|
parent="KnowledgeBase", method="to_disk", name=self.__name__
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
def from_disk(self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()) -> None:
|
def from_disk(
|
||||||
|
self, path: Union[str, Path], exclude: Iterable[str] = SimpleFrozenList()
|
||||||
|
) -> None:
|
||||||
"""
|
"""
|
||||||
Load KnowledgeBase content from disk.
|
Load KnowledgeBase content from disk.
|
||||||
path (Union[str, Path]): Target file path.
|
path (Union[str, Path]): Target file path.
|
||||||
exclude (Iterable[str]): List of components to exclude.
|
exclude (Iterable[str]): List of components to exclude.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
Errors.E1045.format(parent="KnowledgeBase", method="from_disk", name=self.__name__)
|
Errors.E1045.format(
|
||||||
|
parent="KnowledgeBase", method="from_disk", name=self.__name__
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
"""Knowledge-base for entity or concept linking."""
|
"""Knowledge-base for entity or concept linking."""
|
||||||
from preshed.maps cimport PreshMap
|
|
||||||
from libcpp.vector cimport vector
|
|
||||||
from libc.stdint cimport int32_t, int64_t
|
from libc.stdint cimport int32_t, int64_t
|
||||||
from libc.stdio cimport FILE
|
from libc.stdio cimport FILE
|
||||||
|
from libcpp.vector cimport vector
|
||||||
|
from preshed.maps cimport PreshMap
|
||||||
|
|
||||||
|
from ..structs cimport AliasC, KBEntryC
|
||||||
from ..typedefs cimport hash_t
|
from ..typedefs cimport hash_t
|
||||||
from ..structs cimport KBEntryC, AliasC
|
|
||||||
from .kb cimport KnowledgeBase
|
from .kb cimport KnowledgeBase
|
||||||
|
|
||||||
ctypedef vector[KBEntryC] entry_vec
|
ctypedef vector[KBEntryC] entry_vec
|
||||||
|
@ -55,23 +55,28 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
# optional data, we can let users configure a DB as the backend for this.
|
# optional data, we can let users configure a DB as the backend for this.
|
||||||
cdef object _features_table
|
cdef object _features_table
|
||||||
|
|
||||||
|
|
||||||
cdef inline int64_t c_add_vector(self, vector[float] entity_vector) nogil:
|
cdef inline int64_t c_add_vector(self, vector[float] entity_vector) nogil:
|
||||||
"""Add an entity vector to the vectors table."""
|
"""Add an entity vector to the vectors table."""
|
||||||
cdef int64_t new_index = self._vectors_table.size()
|
cdef int64_t new_index = self._vectors_table.size()
|
||||||
self._vectors_table.push_back(entity_vector)
|
self._vectors_table.push_back(entity_vector)
|
||||||
return new_index
|
return new_index
|
||||||
|
|
||||||
|
cdef inline int64_t c_add_entity(
|
||||||
cdef inline int64_t c_add_entity(self, hash_t entity_hash, float freq,
|
self,
|
||||||
int32_t vector_index, int feats_row) nogil:
|
hash_t entity_hash,
|
||||||
|
float freq,
|
||||||
|
int32_t vector_index,
|
||||||
|
int feats_row
|
||||||
|
) nogil:
|
||||||
"""Add an entry to the vector of entries.
|
"""Add an entry to the vector of entries.
|
||||||
After calling this method, make sure to update also the _entry_index using the return value"""
|
After calling this method, make sure to update also the _entry_index
|
||||||
|
using the return value"""
|
||||||
# This is what we'll map the entity hash key to. It's where the entry will sit
|
# This is what we'll map the entity hash key to. It's where the entry will sit
|
||||||
# in the vector of entries, so we can get it later.
|
# in the vector of entries, so we can get it later.
|
||||||
cdef int64_t new_index = self._entries.size()
|
cdef int64_t new_index = self._entries.size()
|
||||||
|
|
||||||
# Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642
|
# Avoid struct initializer to enable nogil, cf.
|
||||||
|
# https://github.com/cython/cython/issues/1642
|
||||||
cdef KBEntryC entry
|
cdef KBEntryC entry
|
||||||
entry.entity_hash = entity_hash
|
entry.entity_hash = entity_hash
|
||||||
entry.vector_index = vector_index
|
entry.vector_index = vector_index
|
||||||
|
@ -81,11 +86,17 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
self._entries.push_back(entry)
|
self._entries.push_back(entry)
|
||||||
return new_index
|
return new_index
|
||||||
|
|
||||||
cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs) nogil:
|
cdef inline int64_t c_add_aliases(
|
||||||
"""Connect a mention to a list of potential entities with their prior probabilities .
|
self,
|
||||||
After calling this method, make sure to update also the _alias_index using the return value"""
|
hash_t alias_hash,
|
||||||
# This is what we'll map the alias hash key to. It's where the alias will be defined
|
vector[int64_t] entry_indices,
|
||||||
# in the vector of aliases.
|
vector[float] probs
|
||||||
|
) nogil:
|
||||||
|
"""Connect a mention to a list of potential entities with their prior
|
||||||
|
probabilities. After calling this method, make sure to update also the
|
||||||
|
_alias_index using the return value"""
|
||||||
|
# This is what we'll map the alias hash key to. It's where the alias will be
|
||||||
|
# defined in the vector of aliases.
|
||||||
cdef int64_t new_index = self._aliases_table.size()
|
cdef int64_t new_index = self._aliases_table.size()
|
||||||
|
|
||||||
# Avoid struct initializer to enable nogil
|
# Avoid struct initializer to enable nogil
|
||||||
|
@ -98,8 +109,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
|
|
||||||
cdef inline void _create_empty_vectors(self, hash_t dummy_hash) nogil:
|
cdef inline void _create_empty_vectors(self, hash_t dummy_hash) nogil:
|
||||||
"""
|
"""
|
||||||
Initializing the vectors and making sure the first element of each vector is a dummy,
|
Initializing the vectors and making sure the first element of each vector is a
|
||||||
because the PreshMap maps pointing to indices in these vectors can not contain 0 as value
|
dummy, because the PreshMap maps pointing to indices in these vectors can not
|
||||||
|
contain 0 as value.
|
||||||
cf. https://github.com/explosion/preshed/issues/17
|
cf. https://github.com/explosion/preshed/issues/17
|
||||||
"""
|
"""
|
||||||
cdef int32_t dummy_value = 0
|
cdef int32_t dummy_value = 0
|
||||||
|
@ -130,12 +142,18 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
cdef class Writer:
|
cdef class Writer:
|
||||||
cdef FILE* _fp
|
cdef FILE* _fp
|
||||||
|
|
||||||
cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1
|
cdef int write_header(
|
||||||
|
self, int64_t nr_entries, int64_t entity_vector_length
|
||||||
|
) except -1
|
||||||
cdef int write_vector_element(self, float element) except -1
|
cdef int write_vector_element(self, float element) except -1
|
||||||
cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1
|
cdef int write_entry(
|
||||||
|
self, hash_t entry_hash, float entry_freq, int32_t vector_index
|
||||||
|
) except -1
|
||||||
|
|
||||||
cdef int write_alias_length(self, int64_t alias_length) except -1
|
cdef int write_alias_length(self, int64_t alias_length) except -1
|
||||||
cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1
|
cdef int write_alias_header(
|
||||||
|
self, hash_t alias_hash, int64_t candidate_length
|
||||||
|
) except -1
|
||||||
cdef int write_alias(self, int64_t entry_index, float prob) except -1
|
cdef int write_alias(self, int64_t entry_index, float prob) except -1
|
||||||
|
|
||||||
cdef int _write(self, void* value, size_t size) except -1
|
cdef int _write(self, void* value, size_t size) except -1
|
||||||
|
@ -143,12 +161,18 @@ cdef class Writer:
|
||||||
cdef class Reader:
|
cdef class Reader:
|
||||||
cdef FILE* _fp
|
cdef FILE* _fp
|
||||||
|
|
||||||
cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1
|
cdef int read_header(
|
||||||
|
self, int64_t* nr_entries, int64_t* entity_vector_length
|
||||||
|
) except -1
|
||||||
cdef int read_vector_element(self, float* element) except -1
|
cdef int read_vector_element(self, float* element) except -1
|
||||||
cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1
|
cdef int read_entry(
|
||||||
|
self, hash_t* entity_hash, float* freq, int32_t* vector_index
|
||||||
|
) except -1
|
||||||
|
|
||||||
cdef int read_alias_length(self, int64_t* alias_length) except -1
|
cdef int read_alias_length(self, int64_t* alias_length) except -1
|
||||||
cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1
|
cdef int read_alias_header(
|
||||||
|
self, hash_t* alias_hash, int64_t* candidate_length
|
||||||
|
) except -1
|
||||||
cdef int read_alias(self, int64_t* entry_index, float* prob) except -1
|
cdef int read_alias(self, int64_t* entry_index, float* prob) except -1
|
||||||
|
|
||||||
cdef int _read(self, void* value, size_t size) except -1
|
cdef int _read(self, void* value, size_t size) except -1
|
||||||
|
|
|
@ -1,29 +1,35 @@
|
||||||
# cython: infer_types=True, profile=True
|
# cython: infer_types=True, profile=True
|
||||||
from typing import Iterable, Callable, Dict, Any, Union
|
from typing import Any, Callable, Dict, Iterable
|
||||||
|
|
||||||
import srsly
|
import srsly
|
||||||
from preshed.maps cimport PreshMap
|
|
||||||
from cpython.exc cimport PyErr_SetFromErrno
|
|
||||||
from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek
|
|
||||||
from libc.stdint cimport int32_t, int64_t
|
|
||||||
from libcpp.vector cimport vector
|
|
||||||
|
|
||||||
from pathlib import Path
|
from cpython.exc cimport PyErr_SetFromErrno
|
||||||
|
from libc.stdint cimport int32_t, int64_t
|
||||||
|
from libc.stdio cimport fclose, feof, fopen, fread, fseek, fwrite
|
||||||
|
from libcpp.vector cimport vector
|
||||||
|
from preshed.maps cimport PreshMap
|
||||||
|
|
||||||
import warnings
|
import warnings
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
from ..tokens import Span
|
from ..tokens import Span
|
||||||
|
|
||||||
from ..typedefs cimport hash_t
|
from ..typedefs cimport hash_t
|
||||||
from ..errors import Errors, Warnings
|
|
||||||
from .. import util
|
from .. import util
|
||||||
|
from ..errors import Errors, Warnings
|
||||||
from ..util import SimpleFrozenList, ensure_path
|
from ..util import SimpleFrozenList, ensure_path
|
||||||
|
|
||||||
from ..vocab cimport Vocab
|
from ..vocab cimport Vocab
|
||||||
from .kb cimport KnowledgeBase
|
from .kb cimport KnowledgeBase
|
||||||
|
|
||||||
from .candidate import Candidate as Candidate
|
from .candidate import Candidate as Candidate
|
||||||
|
|
||||||
|
|
||||||
cdef class InMemoryLookupKB(KnowledgeBase):
|
cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
"""An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases,
|
"""An `InMemoryLookupKB` instance stores unique identifiers for entities
|
||||||
to support entity linking of named entities to real-world concepts.
|
and their textual aliases, to support entity linking of named entities to
|
||||||
|
real-world concepts.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/inmemorylookupkb
|
DOCS: https://spacy.io/api/inmemorylookupkb
|
||||||
"""
|
"""
|
||||||
|
@ -66,7 +72,8 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
|
|
||||||
def add_entity(self, str entity, float freq, vector[float] entity_vector):
|
def add_entity(self, str entity, float freq, vector[float] entity_vector):
|
||||||
"""
|
"""
|
||||||
Add an entity to the KB, optionally specifying its log probability based on corpus frequency
|
Add an entity to the KB, optionally specifying its log probability
|
||||||
|
based on corpus frequency.
|
||||||
Return the hash of the entity ID/name at the end.
|
Return the hash of the entity ID/name at the end.
|
||||||
"""
|
"""
|
||||||
cdef hash_t entity_hash = self.vocab.strings.add(entity)
|
cdef hash_t entity_hash = self.vocab.strings.add(entity)
|
||||||
|
@ -78,14 +85,20 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
|
|
||||||
# Raise an error if the provided entity vector is not of the correct length
|
# Raise an error if the provided entity vector is not of the correct length
|
||||||
if len(entity_vector) != self.entity_vector_length:
|
if len(entity_vector) != self.entity_vector_length:
|
||||||
raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
|
raise ValueError(
|
||||||
|
Errors.E141.format(
|
||||||
|
found=len(entity_vector), required=self.entity_vector_length
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
vector_index = self.c_add_vector(entity_vector=entity_vector)
|
vector_index = self.c_add_vector(entity_vector=entity_vector)
|
||||||
|
|
||||||
new_index = self.c_add_entity(entity_hash=entity_hash,
|
new_index = self.c_add_entity(
|
||||||
|
entity_hash=entity_hash,
|
||||||
freq=freq,
|
freq=freq,
|
||||||
vector_index=vector_index,
|
vector_index=vector_index,
|
||||||
feats_row=-1) # Features table currently not implemented
|
feats_row=-1
|
||||||
|
) # Features table currently not implemented
|
||||||
self._entry_index[entity_hash] = new_index
|
self._entry_index[entity_hash] = new_index
|
||||||
|
|
||||||
return entity_hash
|
return entity_hash
|
||||||
|
@ -110,7 +123,12 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
else:
|
else:
|
||||||
entity_vector = vector_list[i]
|
entity_vector = vector_list[i]
|
||||||
if len(entity_vector) != self.entity_vector_length:
|
if len(entity_vector) != self.entity_vector_length:
|
||||||
raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
|
raise ValueError(
|
||||||
|
Errors.E141.format(
|
||||||
|
found=len(entity_vector),
|
||||||
|
required=self.entity_vector_length
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
entry.entity_hash = entity_hash
|
entry.entity_hash = entity_hash
|
||||||
entry.freq = freq_list[i]
|
entry.freq = freq_list[i]
|
||||||
|
@ -144,11 +162,15 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
previous_alias_nr = self.get_size_aliases()
|
previous_alias_nr = self.get_size_aliases()
|
||||||
# Throw an error if the lengths of entities and probabilities are not the same
|
# Throw an error if the lengths of entities and probabilities are not the same
|
||||||
if not len(entities) == len(probabilities):
|
if not len(entities) == len(probabilities):
|
||||||
raise ValueError(Errors.E132.format(alias=alias,
|
raise ValueError(
|
||||||
|
Errors.E132.format(
|
||||||
|
alias=alias,
|
||||||
entities_length=len(entities),
|
entities_length=len(entities),
|
||||||
probabilities_length=len(probabilities)))
|
probabilities_length=len(probabilities))
|
||||||
|
)
|
||||||
|
|
||||||
# Throw an error if the probabilities sum up to more than 1 (allow for some rounding errors)
|
# Throw an error if the probabilities sum up to more than 1 (allow for
|
||||||
|
# some rounding errors)
|
||||||
prob_sum = sum(probabilities)
|
prob_sum = sum(probabilities)
|
||||||
if prob_sum > 1.00001:
|
if prob_sum > 1.00001:
|
||||||
raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum))
|
raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum))
|
||||||
|
@ -165,40 +187,47 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
|
|
||||||
for entity, prob in zip(entities, probabilities):
|
for entity, prob in zip(entities, probabilities):
|
||||||
entity_hash = self.vocab.strings[entity]
|
entity_hash = self.vocab.strings[entity]
|
||||||
if not entity_hash in self._entry_index:
|
if entity_hash not in self._entry_index:
|
||||||
raise ValueError(Errors.E134.format(entity=entity))
|
raise ValueError(Errors.E134.format(entity=entity))
|
||||||
|
|
||||||
entry_index = <int64_t>self._entry_index.get(entity_hash)
|
entry_index = <int64_t>self._entry_index.get(entity_hash)
|
||||||
entry_indices.push_back(int(entry_index))
|
entry_indices.push_back(int(entry_index))
|
||||||
probs.push_back(float(prob))
|
probs.push_back(float(prob))
|
||||||
|
|
||||||
new_index = self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs)
|
new_index = self.c_add_aliases(
|
||||||
|
alias_hash=alias_hash, entry_indices=entry_indices, probs=probs
|
||||||
|
)
|
||||||
self._alias_index[alias_hash] = new_index
|
self._alias_index[alias_hash] = new_index
|
||||||
|
|
||||||
if previous_alias_nr + 1 != self.get_size_aliases():
|
if previous_alias_nr + 1 != self.get_size_aliases():
|
||||||
raise RuntimeError(Errors.E891.format(alias=alias))
|
raise RuntimeError(Errors.E891.format(alias=alias))
|
||||||
return alias_hash
|
return alias_hash
|
||||||
|
|
||||||
def append_alias(self, str alias, str entity, float prior_prob, ignore_warnings=False):
|
def append_alias(
|
||||||
|
self, str alias, str entity, float prior_prob, ignore_warnings=False
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
For an alias already existing in the KB, extend its potential entities with one more.
|
For an alias already existing in the KB, extend its potential entities
|
||||||
|
with one more.
|
||||||
Throw a warning if either the alias or the entity is unknown,
|
Throw a warning if either the alias or the entity is unknown,
|
||||||
or when the combination is already previously recorded.
|
or when the combination is already previously recorded.
|
||||||
Throw an error if this entity+prior prob would exceed the sum of 1.
|
Throw an error if this entity+prior prob would exceed the sum of 1.
|
||||||
For efficiency, it's best to use the method `add_alias` as much as possible instead of this one.
|
For efficiency, it's best to use the method `add_alias` as much as
|
||||||
|
possible instead of this one.
|
||||||
"""
|
"""
|
||||||
# Check if the alias exists in the KB
|
# Check if the alias exists in the KB
|
||||||
cdef hash_t alias_hash = self.vocab.strings[alias]
|
cdef hash_t alias_hash = self.vocab.strings[alias]
|
||||||
if not alias_hash in self._alias_index:
|
if alias_hash not in self._alias_index:
|
||||||
raise ValueError(Errors.E176.format(alias=alias))
|
raise ValueError(Errors.E176.format(alias=alias))
|
||||||
|
|
||||||
# Check if the entity exists in the KB
|
# Check if the entity exists in the KB
|
||||||
cdef hash_t entity_hash = self.vocab.strings[entity]
|
cdef hash_t entity_hash = self.vocab.strings[entity]
|
||||||
if not entity_hash in self._entry_index:
|
if entity_hash not in self._entry_index:
|
||||||
raise ValueError(Errors.E134.format(entity=entity))
|
raise ValueError(Errors.E134.format(entity=entity))
|
||||||
entry_index = <int64_t>self._entry_index.get(entity_hash)
|
entry_index = <int64_t>self._entry_index.get(entity_hash)
|
||||||
|
|
||||||
# Throw an error if the prior probabilities (including the new one) sum up to more than 1
|
# Throw an error if the prior probabilities (including the new one)
|
||||||
|
# sum up to more than 1
|
||||||
alias_index = <int64_t>self._alias_index.get(alias_hash)
|
alias_index = <int64_t>self._alias_index.get(alias_hash)
|
||||||
alias_entry = self._aliases_table[alias_index]
|
alias_entry = self._aliases_table[alias_index]
|
||||||
current_sum = sum([p for p in alias_entry.probs])
|
current_sum = sum([p for p in alias_entry.probs])
|
||||||
|
@ -231,12 +260,13 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
|
|
||||||
def get_alias_candidates(self, str alias) -> Iterable[Candidate]:
|
def get_alias_candidates(self, str alias) -> Iterable[Candidate]:
|
||||||
"""
|
"""
|
||||||
Return candidate entities for an alias. Each candidate defines the entity, the original alias,
|
Return candidate entities for an alias. Each candidate defines the
|
||||||
and the prior probability of that alias resolving to that entity.
|
entity, the original alias, and the prior probability of that alias
|
||||||
|
resolving to that entity.
|
||||||
If the alias is not known in the KB, an empty list is returned.
|
If the alias is not known in the KB, an empty list is returned.
|
||||||
"""
|
"""
|
||||||
cdef hash_t alias_hash = self.vocab.strings[alias]
|
cdef hash_t alias_hash = self.vocab.strings[alias]
|
||||||
if not alias_hash in self._alias_index:
|
if alias_hash not in self._alias_index:
|
||||||
return []
|
return []
|
||||||
alias_index = <int64_t>self._alias_index.get(alias_hash)
|
alias_index = <int64_t>self._alias_index.get(alias_hash)
|
||||||
alias_entry = self._aliases_table[alias_index]
|
alias_entry = self._aliases_table[alias_index]
|
||||||
|
@ -244,10 +274,14 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
return [Candidate(kb=self,
|
return [Candidate(kb=self,
|
||||||
entity_hash=self._entries[entry_index].entity_hash,
|
entity_hash=self._entries[entry_index].entity_hash,
|
||||||
entity_freq=self._entries[entry_index].freq,
|
entity_freq=self._entries[entry_index].freq,
|
||||||
entity_vector=self._vectors_table[self._entries[entry_index].vector_index],
|
entity_vector=self._vectors_table[
|
||||||
|
self._entries[entry_index].vector_index
|
||||||
|
],
|
||||||
alias_hash=alias_hash,
|
alias_hash=alias_hash,
|
||||||
prior_prob=prior_prob)
|
prior_prob=prior_prob)
|
||||||
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs)
|
for (entry_index, prior_prob) in zip(
|
||||||
|
alias_entry.entry_indices, alias_entry.probs
|
||||||
|
)
|
||||||
if entry_index != 0]
|
if entry_index != 0]
|
||||||
|
|
||||||
def get_vector(self, str entity):
|
def get_vector(self, str entity):
|
||||||
|
@ -261,8 +295,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
return self._vectors_table[self._entries[entry_index].vector_index]
|
return self._vectors_table[self._entries[entry_index].vector_index]
|
||||||
|
|
||||||
def get_prior_prob(self, str entity, str alias):
|
def get_prior_prob(self, str entity, str alias):
|
||||||
""" Return the prior probability of a given alias being linked to a given entity,
|
""" Return the prior probability of a given alias being linked to a
|
||||||
or return 0.0 when this combination is not known in the knowledge base"""
|
given entity, or return 0.0 when this combination is not known in the
|
||||||
|
knowledge base."""
|
||||||
cdef hash_t alias_hash = self.vocab.strings[alias]
|
cdef hash_t alias_hash = self.vocab.strings[alias]
|
||||||
cdef hash_t entity_hash = self.vocab.strings[entity]
|
cdef hash_t entity_hash = self.vocab.strings[entity]
|
||||||
|
|
||||||
|
@ -273,7 +308,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
entry_index = self._entry_index[entity_hash]
|
entry_index = self._entry_index[entity_hash]
|
||||||
|
|
||||||
alias_entry = self._aliases_table[alias_index]
|
alias_entry = self._aliases_table[alias_index]
|
||||||
for (entry_index, prior_prob) in zip(alias_entry.entry_indices, alias_entry.probs):
|
for (entry_index, prior_prob) in zip(
|
||||||
|
alias_entry.entry_indices, alias_entry.probs
|
||||||
|
):
|
||||||
if self._entries[entry_index].entity_hash == entity_hash:
|
if self._entries[entry_index].entity_hash == entity_hash:
|
||||||
return prior_prob
|
return prior_prob
|
||||||
|
|
||||||
|
@ -283,13 +320,19 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
"""Serialize the current state to a binary string.
|
"""Serialize the current state to a binary string.
|
||||||
"""
|
"""
|
||||||
def serialize_header():
|
def serialize_header():
|
||||||
header = (self.get_size_entities(), self.get_size_aliases(), self.entity_vector_length)
|
header = (
|
||||||
|
self.get_size_entities(),
|
||||||
|
self.get_size_aliases(),
|
||||||
|
self.entity_vector_length
|
||||||
|
)
|
||||||
return srsly.json_dumps(header)
|
return srsly.json_dumps(header)
|
||||||
|
|
||||||
def serialize_entries():
|
def serialize_entries():
|
||||||
i = 1
|
i = 1
|
||||||
tuples = []
|
tuples = []
|
||||||
for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
|
for entry_hash, entry_index in sorted(
|
||||||
|
self._entry_index.items(), key=lambda x: x[1]
|
||||||
|
):
|
||||||
entry = self._entries[entry_index]
|
entry = self._entries[entry_index]
|
||||||
assert entry.entity_hash == entry_hash
|
assert entry.entity_hash == entry_hash
|
||||||
assert entry_index == i
|
assert entry_index == i
|
||||||
|
@ -302,7 +345,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
headers = []
|
headers = []
|
||||||
indices_lists = []
|
indices_lists = []
|
||||||
probs_lists = []
|
probs_lists = []
|
||||||
for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
|
for alias_hash, alias_index in sorted(
|
||||||
|
self._alias_index.items(), key=lambda x: x[1]
|
||||||
|
):
|
||||||
alias = self._aliases_table[alias_index]
|
alias = self._aliases_table[alias_index]
|
||||||
assert alias_index == i
|
assert alias_index == i
|
||||||
candidate_length = len(alias.entry_indices)
|
candidate_length = len(alias.entry_indices)
|
||||||
|
@ -360,7 +405,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
indices = srsly.json_loads(all_data[1])
|
indices = srsly.json_loads(all_data[1])
|
||||||
probs = srsly.json_loads(all_data[2])
|
probs = srsly.json_loads(all_data[2])
|
||||||
for header, indices, probs in zip(headers, indices, probs):
|
for header, indices, probs in zip(headers, indices, probs):
|
||||||
alias_hash, candidate_length = header
|
alias_hash, _candidate_length = header
|
||||||
alias.entry_indices = indices
|
alias.entry_indices = indices
|
||||||
alias.probs = probs
|
alias.probs = probs
|
||||||
self._aliases_table[i] = alias
|
self._aliases_table[i] = alias
|
||||||
|
@ -409,10 +454,14 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
writer.write_vector_element(element)
|
writer.write_vector_element(element)
|
||||||
i = i+1
|
i = i+1
|
||||||
|
|
||||||
# dumping the entry records in the order in which they are in the _entries vector.
|
# dumping the entry records in the order in which they are in the
|
||||||
# index 0 is a dummy object not stored in the _entry_index and can be ignored.
|
# _entries vector.
|
||||||
|
# index 0 is a dummy object not stored in the _entry_index and can
|
||||||
|
# be ignored.
|
||||||
i = 1
|
i = 1
|
||||||
for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]):
|
for entry_hash, entry_index in sorted(
|
||||||
|
self._entry_index.items(), key=lambda x: x[1]
|
||||||
|
):
|
||||||
entry = self._entries[entry_index]
|
entry = self._entries[entry_index]
|
||||||
assert entry.entity_hash == entry_hash
|
assert entry.entity_hash == entry_hash
|
||||||
assert entry_index == i
|
assert entry_index == i
|
||||||
|
@ -424,7 +473,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
||||||
# dumping the aliases in the order in which they are in the _alias_index vector.
|
# dumping the aliases in the order in which they are in the _alias_index vector.
|
||||||
# index 0 is a dummy object not stored in the _aliases_table and can be ignored.
|
# index 0 is a dummy object not stored in the _aliases_table and can be ignored.
|
||||||
i = 1
|
i = 1
|
||||||
for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]):
|
for alias_hash, alias_index in sorted(
|
||||||
|
self._alias_index.items(), key=lambda x: x[1]
|
||||||
|
):
|
||||||
alias = self._aliases_table[alias_index]
|
alias = self._aliases_table[alias_index]
|
||||||
assert alias_index == i
|
assert alias_index == i
|
||||||
|
|
||||||
|
@ -530,7 +581,8 @@ cdef class Writer:
|
||||||
def __init__(self, path):
|
def __init__(self, path):
|
||||||
assert isinstance(path, Path)
|
assert isinstance(path, Path)
|
||||||
content = bytes(path)
|
content = bytes(path)
|
||||||
cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
|
cdef bytes bytes_loc = content.encode('utf8') \
|
||||||
|
if type(content) == str else content
|
||||||
self._fp = fopen(<char*>bytes_loc, 'wb')
|
self._fp = fopen(<char*>bytes_loc, 'wb')
|
||||||
if not self._fp:
|
if not self._fp:
|
||||||
raise IOError(Errors.E146.format(path=path))
|
raise IOError(Errors.E146.format(path=path))
|
||||||
|
@ -540,14 +592,18 @@ cdef class Writer:
|
||||||
cdef size_t status = fclose(self._fp)
|
cdef size_t status = fclose(self._fp)
|
||||||
assert status == 0
|
assert status == 0
|
||||||
|
|
||||||
cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1:
|
cdef int write_header(
|
||||||
|
self, int64_t nr_entries, int64_t entity_vector_length
|
||||||
|
) except -1:
|
||||||
self._write(&nr_entries, sizeof(nr_entries))
|
self._write(&nr_entries, sizeof(nr_entries))
|
||||||
self._write(&entity_vector_length, sizeof(entity_vector_length))
|
self._write(&entity_vector_length, sizeof(entity_vector_length))
|
||||||
|
|
||||||
cdef int write_vector_element(self, float element) except -1:
|
cdef int write_vector_element(self, float element) except -1:
|
||||||
self._write(&element, sizeof(element))
|
self._write(&element, sizeof(element))
|
||||||
|
|
||||||
cdef int write_entry(self, hash_t entry_hash, float entry_freq, int32_t vector_index) except -1:
|
cdef int write_entry(
|
||||||
|
self, hash_t entry_hash, float entry_freq, int32_t vector_index
|
||||||
|
) except -1:
|
||||||
self._write(&entry_hash, sizeof(entry_hash))
|
self._write(&entry_hash, sizeof(entry_hash))
|
||||||
self._write(&entry_freq, sizeof(entry_freq))
|
self._write(&entry_freq, sizeof(entry_freq))
|
||||||
self._write(&vector_index, sizeof(vector_index))
|
self._write(&vector_index, sizeof(vector_index))
|
||||||
|
@ -556,7 +612,9 @@ cdef class Writer:
|
||||||
cdef int write_alias_length(self, int64_t alias_length) except -1:
|
cdef int write_alias_length(self, int64_t alias_length) except -1:
|
||||||
self._write(&alias_length, sizeof(alias_length))
|
self._write(&alias_length, sizeof(alias_length))
|
||||||
|
|
||||||
cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1:
|
cdef int write_alias_header(
|
||||||
|
self, hash_t alias_hash, int64_t candidate_length
|
||||||
|
) except -1:
|
||||||
self._write(&alias_hash, sizeof(alias_hash))
|
self._write(&alias_hash, sizeof(alias_hash))
|
||||||
self._write(&candidate_length, sizeof(candidate_length))
|
self._write(&candidate_length, sizeof(candidate_length))
|
||||||
|
|
||||||
|
@ -572,16 +630,19 @@ cdef class Writer:
|
||||||
cdef class Reader:
|
cdef class Reader:
|
||||||
def __init__(self, path):
|
def __init__(self, path):
|
||||||
content = bytes(path)
|
content = bytes(path)
|
||||||
cdef bytes bytes_loc = content.encode('utf8') if type(content) == str else content
|
cdef bytes bytes_loc = content.encode('utf8') \
|
||||||
|
if type(content) == str else content
|
||||||
self._fp = fopen(<char*>bytes_loc, 'rb')
|
self._fp = fopen(<char*>bytes_loc, 'rb')
|
||||||
if not self._fp:
|
if not self._fp:
|
||||||
PyErr_SetFromErrno(IOError)
|
PyErr_SetFromErrno(IOError)
|
||||||
status = fseek(self._fp, 0, 0) # this can be 0 if there is no header
|
fseek(self._fp, 0, 0) # this can be 0 if there is no header
|
||||||
|
|
||||||
def __dealloc__(self):
|
def __dealloc__(self):
|
||||||
fclose(self._fp)
|
fclose(self._fp)
|
||||||
|
|
||||||
cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1:
|
cdef int read_header(
|
||||||
|
self, int64_t* nr_entries, int64_t* entity_vector_length
|
||||||
|
) except -1:
|
||||||
status = self._read(nr_entries, sizeof(int64_t))
|
status = self._read(nr_entries, sizeof(int64_t))
|
||||||
if status < 1:
|
if status < 1:
|
||||||
if feof(self._fp):
|
if feof(self._fp):
|
||||||
|
@ -601,7 +662,9 @@ cdef class Reader:
|
||||||
return 0 # end of file
|
return 0 # end of file
|
||||||
raise IOError(Errors.E145.format(param="vector element"))
|
raise IOError(Errors.E145.format(param="vector element"))
|
||||||
|
|
||||||
cdef int read_entry(self, hash_t* entity_hash, float* freq, int32_t* vector_index) except -1:
|
cdef int read_entry(
|
||||||
|
self, hash_t* entity_hash, float* freq, int32_t* vector_index
|
||||||
|
) except -1:
|
||||||
status = self._read(entity_hash, sizeof(hash_t))
|
status = self._read(entity_hash, sizeof(hash_t))
|
||||||
if status < 1:
|
if status < 1:
|
||||||
if feof(self._fp):
|
if feof(self._fp):
|
||||||
|
@ -632,7 +695,9 @@ cdef class Reader:
|
||||||
return 0 # end of file
|
return 0 # end of file
|
||||||
raise IOError(Errors.E145.format(param="alias length"))
|
raise IOError(Errors.E145.format(param="alias length"))
|
||||||
|
|
||||||
cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1:
|
cdef int read_alias_header(
|
||||||
|
self, hash_t* alias_hash, int64_t* candidate_length
|
||||||
|
) except -1:
|
||||||
status = self._read(alias_hash, sizeof(hash_t))
|
status = self._read(alias_hash, sizeof(hash_t))
|
||||||
if status < 1:
|
if status < 1:
|
||||||
if feof(self._fp):
|
if feof(self._fp):
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
|
from ...language import BaseDefaults, Language
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from ...language import Language, BaseDefaults
|
|
||||||
|
|
||||||
|
|
||||||
class AfrikaansDefaults(BaseDefaults):
|
class AfrikaansDefaults(BaseDefaults):
|
||||||
|
|
|
@ -1,12 +1,11 @@
|
||||||
from .stop_words import STOP_WORDS
|
from ...attrs import LANG
|
||||||
|
from ...language import BaseDefaults, Language
|
||||||
|
from ...util import update_exc
|
||||||
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .punctuation import TOKENIZER_SUFFIXES
|
from .punctuation import TOKENIZER_SUFFIXES
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
|
||||||
from ...language import Language, BaseDefaults
|
|
||||||
from ...attrs import LANG
|
|
||||||
from ...util import update_exc
|
|
||||||
|
|
||||||
|
|
||||||
class AmharicDefaults(BaseDefaults):
|
class AmharicDefaults(BaseDefaults):
|
||||||
|
|
|
@ -1,5 +1,11 @@
|
||||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
|
from ..char_classes import (
|
||||||
from ..char_classes import UNITS, ALPHA_UPPER
|
ALPHA_UPPER,
|
||||||
|
CURRENCY,
|
||||||
|
LIST_ELLIPSES,
|
||||||
|
LIST_PUNCT,
|
||||||
|
LIST_QUOTES,
|
||||||
|
UNITS,
|
||||||
|
)
|
||||||
|
|
||||||
_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()
|
_list_punct = LIST_PUNCT + "፡ ። ፣ ፤ ፥ ፦ ፧ ፠ ፨".strip().split()
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
from ...symbols import ORTH, NORM
|
from ...symbols import NORM, ORTH
|
||||||
|
|
||||||
|
|
||||||
_exc = {}
|
_exc = {}
|
||||||
|
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
from .stop_words import STOP_WORDS
|
from ...language import BaseDefaults, Language
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .punctuation import TOKENIZER_SUFFIXES
|
from .punctuation import TOKENIZER_SUFFIXES
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from ...language import Language, BaseDefaults
|
|
||||||
|
|
||||||
|
|
||||||
class ArabicDefaults(BaseDefaults):
|
class ArabicDefaults(BaseDefaults):
|
||||||
|
|
|
@ -1,5 +1,11 @@
|
||||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY
|
from ..char_classes import (
|
||||||
from ..char_classes import UNITS, ALPHA_UPPER
|
ALPHA_UPPER,
|
||||||
|
CURRENCY,
|
||||||
|
LIST_ELLIPSES,
|
||||||
|
LIST_PUNCT,
|
||||||
|
LIST_QUOTES,
|
||||||
|
UNITS,
|
||||||
|
)
|
||||||
|
|
||||||
_suffixes = (
|
_suffixes = (
|
||||||
LIST_PUNCT
|
LIST_PUNCT
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ...symbols import NORM, ORTH
|
||||||
from ...symbols import ORTH, NORM
|
|
||||||
from ...util import update_exc
|
from ...util import update_exc
|
||||||
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
|
||||||
_exc = {}
|
_exc = {}
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
from .stop_words import STOP_WORDS
|
from ...language import BaseDefaults, Language
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from ...language import Language, BaseDefaults
|
from .stop_words import STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
class AzerbaijaniDefaults(BaseDefaults):
|
class AzerbaijaniDefaults(BaseDefaults):
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
from ...attrs import LIKE_NUM
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
|
||||||
# Eleven, twelve etc. are written separate: on bir, on iki
|
# Eleven, twelve etc. are written separate: on bir, on iki
|
||||||
|
|
||||||
_num_words = [
|
_num_words = [
|
||||||
|
|
|
@ -1,12 +1,14 @@
|
||||||
|
from ...attrs import LANG
|
||||||
|
from ...language import BaseDefaults, Language
|
||||||
|
from ...util import update_exc
|
||||||
|
from ..punctuation import (
|
||||||
|
COMBINING_DIACRITICS_TOKENIZER_INFIXES,
|
||||||
|
COMBINING_DIACRITICS_TOKENIZER_SUFFIXES,
|
||||||
|
)
|
||||||
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
from .lex_attrs import LEX_ATTRS
|
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
|
||||||
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_INFIXES
|
|
||||||
from ..punctuation import COMBINING_DIACRITICS_TOKENIZER_SUFFIXES
|
|
||||||
from ...language import Language, BaseDefaults
|
|
||||||
from ...attrs import LANG
|
|
||||||
from ...util import update_exc
|
|
||||||
|
|
||||||
|
|
||||||
class BulgarianDefaults(BaseDefaults):
|
class BulgarianDefaults(BaseDefaults):
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
from ...attrs import LIKE_NUM
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
|
||||||
_num_words = [
|
_num_words = [
|
||||||
"нула",
|
"нула",
|
||||||
"едно",
|
"едно",
|
||||||
|
|
|
@ -4,8 +4,7 @@ References:
|
||||||
(countries, occupations, fields of studies and more).
|
(countries, occupations, fields of studies and more).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from ...symbols import ORTH, NORM
|
from ...symbols import NORM, ORTH
|
||||||
|
|
||||||
|
|
||||||
_exc = {}
|
_exc = {}
|
||||||
|
|
||||||
|
|
|
@ -1,10 +1,12 @@
|
||||||
from typing import Optional, Callable
|
from typing import Callable, Optional
|
||||||
|
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|
||||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
from ...language import BaseDefaults, Language
|
||||||
from .stop_words import STOP_WORDS
|
|
||||||
from ...language import Language, BaseDefaults
|
|
||||||
from ...pipeline import Lemmatizer
|
from ...pipeline import Lemmatizer
|
||||||
|
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
|
||||||
|
|
||||||
class BengaliDefaults(BaseDefaults):
|
class BengaliDefaults(BaseDefaults):
|
||||||
|
|
|
@ -1,6 +1,14 @@
|
||||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
|
from ..char_classes import (
|
||||||
from ..char_classes import ALPHA_LOWER, ALPHA, HYPHENS, CONCAT_QUOTES, UNITS
|
ALPHA,
|
||||||
|
ALPHA_LOWER,
|
||||||
|
CONCAT_QUOTES,
|
||||||
|
HYPHENS,
|
||||||
|
LIST_ELLIPSES,
|
||||||
|
LIST_ICONS,
|
||||||
|
LIST_PUNCT,
|
||||||
|
LIST_QUOTES,
|
||||||
|
UNITS,
|
||||||
|
)
|
||||||
|
|
||||||
_currency = r"\$¢£€¥฿৳"
|
_currency = r"\$¢£€¥฿৳"
|
||||||
_quotes = CONCAT_QUOTES.replace("'", "")
|
_quotes = CONCAT_QUOTES.replace("'", "")
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ...symbols import NORM, ORTH
|
||||||
from ...symbols import ORTH, NORM
|
|
||||||
from ...util import update_exc
|
from ...util import update_exc
|
||||||
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
|
||||||
_exc = {}
|
_exc = {}
|
||||||
|
|
||||||
|
|
|
@ -1,14 +1,14 @@
|
||||||
from typing import Optional, Callable
|
from typing import Callable, Optional
|
||||||
|
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from ...language import BaseDefaults, Language
|
||||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES
|
|
||||||
from .stop_words import STOP_WORDS
|
|
||||||
from .lex_attrs import LEX_ATTRS
|
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
|
||||||
from ...language import Language, BaseDefaults
|
|
||||||
from .lemmatizer import CatalanLemmatizer
|
from .lemmatizer import CatalanLemmatizer
|
||||||
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
|
||||||
|
|
||||||
class CatalanDefaults(BaseDefaults):
|
class CatalanDefaults(BaseDefaults):
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
from ...attrs import LIKE_NUM
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
|
||||||
_num_words = [
|
_num_words = [
|
||||||
"zero",
|
"zero",
|
||||||
"un",
|
"un",
|
||||||
|
|
|
@ -1,9 +1,18 @@
|
||||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
|
from ..char_classes import (
|
||||||
from ..char_classes import LIST_CURRENCY
|
ALPHA,
|
||||||
from ..char_classes import CURRENCY
|
ALPHA_LOWER,
|
||||||
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
|
ALPHA_UPPER,
|
||||||
from ..char_classes import merge_chars, _units
|
CONCAT_QUOTES,
|
||||||
|
CURRENCY,
|
||||||
|
LIST_CURRENCY,
|
||||||
|
LIST_ELLIPSES,
|
||||||
|
LIST_ICONS,
|
||||||
|
LIST_PUNCT,
|
||||||
|
LIST_QUOTES,
|
||||||
|
PUNCT,
|
||||||
|
_units,
|
||||||
|
merge_chars,
|
||||||
|
)
|
||||||
|
|
||||||
ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
|
ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,8 @@
|
||||||
from typing import Union, Iterator, Tuple
|
from typing import Iterator, Tuple, Union
|
||||||
from ...tokens import Doc, Span
|
|
||||||
from ...symbols import NOUN, PROPN
|
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
|
from ...symbols import NOUN, PROPN
|
||||||
|
from ...tokens import Doc, Span
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ...symbols import NORM, ORTH
|
||||||
from ...symbols import ORTH, NORM
|
|
||||||
from ...util import update_exc
|
from ...util import update_exc
|
||||||
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
|
||||||
_exc = {}
|
_exc = {}
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
from .stop_words import STOP_WORDS
|
from ...language import BaseDefaults, Language
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from ...language import Language, BaseDefaults
|
from .stop_words import STOP_WORDS
|
||||||
|
|
||||||
|
|
||||||
class CzechDefaults(BaseDefaults):
|
class CzechDefaults(BaseDefaults):
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from ...language import BaseDefaults, Language
|
||||||
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from ...language import Language, BaseDefaults
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
|
||||||
|
|
||||||
class DanishDefaults(BaseDefaults):
|
class DanishDefaults(BaseDefaults):
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
from ...attrs import LIKE_NUM
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
|
||||||
# Source http://fjern-uv.dk/tal.php
|
# Source http://fjern-uv.dk/tal.php
|
||||||
_num_words = """nul
|
_num_words = """nul
|
||||||
en et to tre fire fem seks syv otte ni ti
|
en et to tre fire fem seks syv otte ni ti
|
||||||
|
|
|
@ -1,8 +1,13 @@
|
||||||
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
|
from ..char_classes import (
|
||||||
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
ALPHA,
|
||||||
|
ALPHA_LOWER,
|
||||||
|
ALPHA_UPPER,
|
||||||
|
CONCAT_QUOTES,
|
||||||
|
LIST_ELLIPSES,
|
||||||
|
LIST_ICONS,
|
||||||
|
)
|
||||||
from ..punctuation import TOKENIZER_SUFFIXES
|
from ..punctuation import TOKENIZER_SUFFIXES
|
||||||
|
|
||||||
|
|
||||||
_quotes = CONCAT_QUOTES.replace("'", "")
|
_quotes = CONCAT_QUOTES.replace("'", "")
|
||||||
|
|
||||||
_infixes = (
|
_infixes = (
|
||||||
|
|
|
@ -1,7 +1,8 @@
|
||||||
from typing import Union, Iterator, Tuple
|
from typing import Iterator, Tuple, Union
|
||||||
from ...tokens import Doc, Span
|
|
||||||
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
|
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
|
from ...symbols import AUX, NOUN, PRON, PROPN, VERB
|
||||||
|
from ...tokens import Doc, Span
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||||
|
|
|
@ -2,10 +2,9 @@
|
||||||
Tokenizer Exceptions.
|
Tokenizer Exceptions.
|
||||||
Source: https://forkortelse.dk/ and various others.
|
Source: https://forkortelse.dk/ and various others.
|
||||||
"""
|
"""
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ...symbols import NORM, ORTH
|
||||||
from ...symbols import ORTH, NORM
|
|
||||||
from ...util import update_exc
|
from ...util import update_exc
|
||||||
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
|
||||||
_exc = {}
|
_exc = {}
|
||||||
|
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from ...language import BaseDefaults, Language
|
||||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
from ...language import Language, BaseDefaults
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
|
||||||
|
|
||||||
class GermanDefaults(BaseDefaults):
|
class GermanDefaults(BaseDefaults):
|
||||||
|
|
|
@ -1,9 +1,18 @@
|
||||||
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
|
from ..char_classes import (
|
||||||
from ..char_classes import CURRENCY, UNITS, PUNCT
|
ALPHA,
|
||||||
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
ALPHA_LOWER,
|
||||||
|
ALPHA_UPPER,
|
||||||
|
CONCAT_QUOTES,
|
||||||
|
CURRENCY,
|
||||||
|
LIST_ELLIPSES,
|
||||||
|
LIST_ICONS,
|
||||||
|
LIST_PUNCT,
|
||||||
|
LIST_QUOTES,
|
||||||
|
PUNCT,
|
||||||
|
UNITS,
|
||||||
|
)
|
||||||
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
|
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
|
||||||
|
|
||||||
|
|
||||||
_prefixes = ["``"] + BASE_TOKENIZER_PREFIXES
|
_prefixes = ["``"] + BASE_TOKENIZER_PREFIXES
|
||||||
|
|
||||||
_suffixes = (
|
_suffixes = (
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
from typing import Union, Iterator, Tuple
|
from typing import Iterator, Tuple, Union
|
||||||
|
|
||||||
from ...symbols import NOUN, PROPN, PRON
|
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
|
from ...symbols import NOUN, PRON, PROPN
|
||||||
from ...tokens import Doc, Span
|
from ...tokens import Doc, Span
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ...symbols import NORM, ORTH
|
||||||
from ...symbols import ORTH, NORM
|
|
||||||
from ...util import update_exc
|
from ...util import update_exc
|
||||||
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
|
||||||
_exc = {
|
_exc = {
|
||||||
"auf'm": [{ORTH: "auf"}, {ORTH: "'m", NORM: "dem"}],
|
"auf'm": [{ORTH: "auf"}, {ORTH: "'m", NORM: "dem"}],
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
|
from ...language import BaseDefaults, Language
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from ...language import Language, BaseDefaults
|
|
||||||
|
|
||||||
|
|
||||||
class LowerSorbianDefaults(BaseDefaults):
|
class LowerSorbianDefaults(BaseDefaults):
|
||||||
|
|
|
@ -1,13 +1,14 @@
|
||||||
from typing import Optional, Callable
|
from typing import Callable, Optional
|
||||||
|
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from ...language import BaseDefaults, Language
|
||||||
from .stop_words import STOP_WORDS
|
|
||||||
from .lex_attrs import LEX_ATTRS
|
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
|
||||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
|
||||||
from .lemmatizer import GreekLemmatizer
|
from .lemmatizer import GreekLemmatizer
|
||||||
from ...language import Language, BaseDefaults
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
|
||||||
|
|
||||||
class GreekDefaults(BaseDefaults):
|
class GreekDefaults(BaseDefaults):
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
def get_pos_from_wiktionary():
|
def get_pos_from_wiktionary():
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from gensim.corpora.wikicorpus import extract_pages
|
from gensim.corpora.wikicorpus import extract_pages
|
||||||
|
|
||||||
regex = re.compile(r"==={{(\w+)\|el}}===")
|
regex = re.compile(r"==={{(\w+)\|el}}===")
|
||||||
|
|
|
@ -1,6 +1,16 @@
|
||||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
|
from ..char_classes import (
|
||||||
from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS
|
ALPHA,
|
||||||
from ..char_classes import CONCAT_QUOTES, CURRENCY
|
ALPHA_LOWER,
|
||||||
|
ALPHA_UPPER,
|
||||||
|
CONCAT_QUOTES,
|
||||||
|
CURRENCY,
|
||||||
|
HYPHENS,
|
||||||
|
LIST_CURRENCY,
|
||||||
|
LIST_ELLIPSES,
|
||||||
|
LIST_ICONS,
|
||||||
|
LIST_PUNCT,
|
||||||
|
LIST_QUOTES,
|
||||||
|
)
|
||||||
|
|
||||||
_units = (
|
_units = (
|
||||||
"km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft "
|
"km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft "
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
from typing import Union, Iterator, Tuple
|
from typing import Iterator, Tuple, Union
|
||||||
|
|
||||||
from ...symbols import NOUN, PROPN, PRON
|
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
|
from ...symbols import NOUN, PRON, PROPN
|
||||||
from ...tokens import Doc, Span
|
from ...tokens import Doc, Span
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ...symbols import NORM, ORTH
|
||||||
from ...symbols import ORTH, NORM
|
|
||||||
from ...util import update_exc
|
from ...util import update_exc
|
||||||
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
|
||||||
_exc = {}
|
_exc = {}
|
||||||
|
|
||||||
|
|
|
@ -1,13 +1,14 @@
|
||||||
from typing import Optional, Callable
|
from typing import Callable, Optional
|
||||||
|
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
from ...language import BaseDefaults, Language
|
||||||
from .stop_words import STOP_WORDS
|
|
||||||
from .lex_attrs import LEX_ATTRS
|
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
|
||||||
from .punctuation import TOKENIZER_INFIXES
|
|
||||||
from .lemmatizer import EnglishLemmatizer
|
from .lemmatizer import EnglishLemmatizer
|
||||||
from ...language import Language, BaseDefaults
|
from .lex_attrs import LEX_ATTRS
|
||||||
|
from .punctuation import TOKENIZER_INFIXES
|
||||||
|
from .stop_words import STOP_WORDS
|
||||||
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
|
||||||
|
|
||||||
class EnglishDefaults(BaseDefaults):
|
class EnglishDefaults(BaseDefaults):
|
||||||
|
|
|
@ -1,5 +1,12 @@
|
||||||
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
|
from ..char_classes import (
|
||||||
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
|
ALPHA,
|
||||||
|
ALPHA_LOWER,
|
||||||
|
ALPHA_UPPER,
|
||||||
|
CONCAT_QUOTES,
|
||||||
|
HYPHENS,
|
||||||
|
LIST_ELLIPSES,
|
||||||
|
LIST_ICONS,
|
||||||
|
)
|
||||||
|
|
||||||
_infixes = (
|
_infixes = (
|
||||||
LIST_ELLIPSES
|
LIST_ELLIPSES
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
from typing import Union, Iterator, Tuple
|
from typing import Iterator, Tuple, Union
|
||||||
|
|
||||||
from ...symbols import NOUN, PROPN, PRON
|
|
||||||
from ...errors import Errors
|
from ...errors import Errors
|
||||||
|
from ...symbols import NOUN, PRON, PROPN
|
||||||
from ...tokens import Doc, Span
|
from ...tokens import Doc, Span
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
from typing import Dict, List
|
from typing import Dict, List
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
|
||||||
from ...symbols import ORTH, NORM
|
|
||||||
from ...util import update_exc
|
|
||||||
|
|
||||||
|
from ...symbols import NORM, ORTH
|
||||||
|
from ...util import update_exc
|
||||||
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
|
|
||||||
_exc: Dict[str, List[Dict]] = {}
|
_exc: Dict[str, List[Dict]] = {}
|
||||||
_exclude = [
|
_exclude = [
|
||||||
|
|
|
@ -1,12 +1,14 @@
|
||||||
from typing import Optional, Callable
|
from typing import Callable, Optional
|
||||||
|
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|
||||||
from .stop_words import STOP_WORDS
|
from ...language import BaseDefaults, Language
|
||||||
from .lex_attrs import LEX_ATTRS
|
|
||||||
from .lemmatizer import SpanishLemmatizer
|
from .lemmatizer import SpanishLemmatizer
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||||
from ...language import Language, BaseDefaults
|
from .stop_words import STOP_WORDS
|
||||||
|
from .syntax_iterators import SYNTAX_ITERATORS
|
||||||
|
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||||
|
|
||||||
|
|
||||||
class SpanishDefaults(BaseDefaults):
|
class SpanishDefaults(BaseDefaults):
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
from typing import List, Optional, Tuple
|
|
||||||
import re
|
import re
|
||||||
|
from typing import List, Optional, Tuple
|
||||||
|
|
||||||
from ...pipeline import Lemmatizer
|
from ...pipeline import Lemmatizer
|
||||||
from ...tokens import Token
|
from ...tokens import Token
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
from ...attrs import LIKE_NUM
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
|
||||||
_num_words = [
|
_num_words = [
|
||||||
"cero",
|
"cero",
|
||||||
"uno",
|
"uno",
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user