mirror of
https://github.com/explosion/spaCy.git
synced 2025-04-21 17:41:59 +03:00
Merge branch 'master' into test-cli-app-init-config
This commit is contained in:
commit
add6de2fa9
119
.github/azure-steps.yml
vendored
119
.github/azure-steps.yml
vendored
|
@ -1,119 +0,0 @@
|
|||
parameters:
|
||||
python_version: ''
|
||||
architecture: 'x64'
|
||||
num_build_jobs: 2
|
||||
|
||||
steps:
|
||||
- task: UsePythonVersion@0
|
||||
inputs:
|
||||
versionSpec: ${{ parameters.python_version }}
|
||||
architecture: ${{ parameters.architecture }}
|
||||
allowUnstable: true
|
||||
|
||||
- bash: |
|
||||
echo "##vso[task.setvariable variable=python_version]${{ parameters.python_version }}"
|
||||
displayName: 'Set variables'
|
||||
|
||||
- script: |
|
||||
python -m pip install -U build pip setuptools
|
||||
python -m pip install -U -r requirements.txt
|
||||
displayName: "Install dependencies"
|
||||
|
||||
- script: |
|
||||
python -m build --sdist
|
||||
displayName: "Build sdist"
|
||||
|
||||
- script: |
|
||||
python -m mypy spacy
|
||||
displayName: 'Run mypy'
|
||||
condition: ne(variables['python_version'], '3.6')
|
||||
|
||||
- task: DeleteFiles@1
|
||||
inputs:
|
||||
contents: "spacy"
|
||||
displayName: "Delete source directory"
|
||||
|
||||
- task: DeleteFiles@1
|
||||
inputs:
|
||||
contents: "*.egg-info"
|
||||
displayName: "Delete egg-info directory"
|
||||
|
||||
- script: |
|
||||
python -m pip freeze > installed.txt
|
||||
python -m pip uninstall -y -r installed.txt
|
||||
displayName: "Uninstall all packages"
|
||||
|
||||
- bash: |
|
||||
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
|
||||
SPACY_NUM_BUILD_JOBS=${{ parameters.num_build_jobs }} python -m pip install dist/$SDIST
|
||||
displayName: "Install from sdist"
|
||||
|
||||
- script: |
|
||||
python -W error -c "import spacy"
|
||||
displayName: "Test import"
|
||||
|
||||
- script: |
|
||||
python -m spacy download ca_core_news_sm
|
||||
python -m spacy download ca_core_news_md
|
||||
python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
|
||||
displayName: 'Test download CLI'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
- script: |
|
||||
python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
|
||||
displayName: 'Test no warnings on load (#11713)'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
- script: |
|
||||
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
|
||||
displayName: 'Test convert CLI'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
- script: |
|
||||
python -m spacy init config -p ner -l ca ner.cfg
|
||||
python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
|
||||
displayName: 'Test debug config CLI'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
- script: |
|
||||
# will have errors due to sparse data, check for summary in output
|
||||
python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
|
||||
displayName: 'Test debug data CLI'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
- script: |
|
||||
python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
|
||||
displayName: 'Test train CLI'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
- script: |
|
||||
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
|
||||
PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
|
||||
displayName: 'Test assemble CLI'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
- script: |
|
||||
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
|
||||
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
|
||||
displayName: 'Test assemble CLI vectors warning'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
|
||||
- script: |
|
||||
python -m pip install -U -r requirements.txt
|
||||
displayName: "Install test requirements"
|
||||
|
||||
- script: |
|
||||
python -m pytest --pyargs spacy -W error
|
||||
displayName: "Run CPU tests"
|
||||
|
||||
- script: |
|
||||
python -m pip install 'spacy[apple]'
|
||||
python -m pytest --pyargs spacy
|
||||
displayName: "Run CPU tests with thinc-apple-ops"
|
||||
condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
|
||||
|
||||
- script: |
|
||||
python .github/validate_universe_json.py website/meta/universe.json
|
||||
displayName: 'Test website/meta/universe.json'
|
||||
condition: eq(variables['python_version'], '3.8')
|
||||
|
45
.github/workflows/autoblack.yml
vendored
45
.github/workflows/autoblack.yml
vendored
|
@ -1,45 +0,0 @@
|
|||
# GitHub Action that uses Black to reformat all Python code and submits a PR
|
||||
# in regular intervals. Inspired by: https://github.com/cclauss/autoblack
|
||||
|
||||
name: autoblack
|
||||
on:
|
||||
workflow_dispatch: # allow manual trigger
|
||||
schedule:
|
||||
- cron: '0 8 * * 5' # every Friday at 8am UTC
|
||||
|
||||
jobs:
|
||||
autoblack:
|
||||
if: github.repository_owner == 'explosion'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
ref: ${{ github.head_ref }}
|
||||
- uses: actions/setup-python@v4
|
||||
- run: pip install black
|
||||
- name: Auto-format code if needed
|
||||
run: black spacy
|
||||
# We can't run black --check here because that returns a non-zero excit
|
||||
# code and makes GitHub think the action failed
|
||||
- name: Check for modified files
|
||||
id: git-check
|
||||
run: echo modified=$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Create Pull Request
|
||||
if: steps.git-check.outputs.modified == 'true'
|
||||
uses: peter-evans/create-pull-request@v4
|
||||
with:
|
||||
title: Auto-format code with black
|
||||
labels: meta
|
||||
commit-message: Auto-format code with black
|
||||
committer: GitHub <noreply@github.com>
|
||||
author: explosion-bot <explosion-bot@users.noreply.github.com>
|
||||
body: _This PR is auto-generated._
|
||||
branch: autoblack
|
||||
delete-branch: true
|
||||
draft: false
|
||||
- name: Check outputs
|
||||
if: steps.git-check.outputs.modified == 'true'
|
||||
run: |
|
||||
echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}"
|
||||
echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}"
|
1
.github/workflows/explosionbot.yml
vendored
1
.github/workflows/explosionbot.yml
vendored
|
@ -8,6 +8,7 @@ on:
|
|||
|
||||
jobs:
|
||||
explosion-bot:
|
||||
if: github.repository_owner == 'explosion'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Dump GitHub context
|
||||
|
|
1
.github/workflows/issue-manager.yml
vendored
1
.github/workflows/issue-manager.yml
vendored
|
@ -13,6 +13,7 @@ on:
|
|||
|
||||
jobs:
|
||||
issue-manager:
|
||||
if: github.repository_owner == 'explosion'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: tiangolo/issue-manager@0.4.0
|
||||
|
|
1
.github/workflows/lock.yml
vendored
1
.github/workflows/lock.yml
vendored
|
@ -13,6 +13,7 @@ concurrency:
|
|||
|
||||
jobs:
|
||||
action:
|
||||
if: github.repository_owner == 'explosion'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: dessant/lock-threads@v4
|
||||
|
|
1
.github/workflows/spacy_universe_alert.yml
vendored
1
.github/workflows/spacy_universe_alert.yml
vendored
|
@ -7,6 +7,7 @@ on:
|
|||
|
||||
jobs:
|
||||
build:
|
||||
if: github.repository_owner == 'explosion'
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
|
|
174
.github/workflows/tests.yml
vendored
Normal file
174
.github/workflows/tests.yml
vendored
Normal file
|
@ -0,0 +1,174 @@
|
|||
name: tests
|
||||
|
||||
on:
|
||||
push:
|
||||
branches-ignore:
|
||||
- "spacy.io"
|
||||
- "nightly.spacy.io"
|
||||
- "v2.spacy.io"
|
||||
paths-ignore:
|
||||
- "*.md"
|
||||
- "*.mdx"
|
||||
- "website/**"
|
||||
- ".github/workflows/**"
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened, edited]
|
||||
paths-ignore:
|
||||
- "*.md"
|
||||
- "*.mdx"
|
||||
- "website/**"
|
||||
|
||||
jobs:
|
||||
validate:
|
||||
name: Validate
|
||||
if: github.repository_owner == 'explosion'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Check out repo
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Configure Python version
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.7"
|
||||
architecture: x64
|
||||
|
||||
- name: black
|
||||
run: |
|
||||
python -m pip install black -c requirements.txt
|
||||
python -m black spacy --check
|
||||
- name: flake8
|
||||
run: |
|
||||
python -m pip install flake8==5.0.4
|
||||
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
|
||||
tests:
|
||||
name: Test
|
||||
needs: Validate
|
||||
strategy:
|
||||
fail-fast: true
|
||||
matrix:
|
||||
os: [ubuntu-latest, windows-latest, macos-latest]
|
||||
python_version: ["3.11"]
|
||||
include:
|
||||
- os: ubuntu-20.04
|
||||
python_version: "3.6"
|
||||
- os: windows-latest
|
||||
python_version: "3.7"
|
||||
- os: macos-latest
|
||||
python_version: "3.8"
|
||||
- os: ubuntu-latest
|
||||
python_version: "3.9"
|
||||
- os: windows-latest
|
||||
python_version: "3.10"
|
||||
|
||||
runs-on: ${{ matrix.os }}
|
||||
|
||||
steps:
|
||||
- name: Check out repo
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Configure Python version
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: ${{ matrix.python_version }}
|
||||
architecture: x64
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install -U build pip setuptools
|
||||
python -m pip install -U -r requirements.txt
|
||||
|
||||
- name: Build sdist
|
||||
run: |
|
||||
python -m build --sdist
|
||||
|
||||
- name: Run mypy
|
||||
run: |
|
||||
python -m mypy spacy
|
||||
if: matrix.python_version != '3.6'
|
||||
|
||||
- name: Delete source directory and .egg-info
|
||||
run: |
|
||||
rm -rf spacy *.egg-info
|
||||
shell: bash
|
||||
|
||||
- name: Uninstall all packages
|
||||
run: |
|
||||
python -m pip freeze
|
||||
python -m pip freeze --exclude pywin32 > installed.txt
|
||||
python -m pip uninstall -y -r installed.txt
|
||||
|
||||
- name: Install from sdist
|
||||
run: |
|
||||
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
|
||||
SPACY_NUM_BUILD_JOBS=2 python -m pip install dist/$SDIST
|
||||
shell: bash
|
||||
|
||||
- name: Test import
|
||||
run: python -W error -c "import spacy"
|
||||
|
||||
# - name: "Test download CLI"
|
||||
# run: |
|
||||
# python -m spacy download ca_core_news_sm
|
||||
# python -m spacy download ca_core_news_md
|
||||
# python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
|
||||
# if: matrix.python_version == '3.9'
|
||||
#
|
||||
# - name: "Test download_url in info CLI"
|
||||
# run: |
|
||||
# python -W error -m spacy info ca_core_news_sm | grep -q download_url
|
||||
# if: matrix.python_version == '3.9'
|
||||
#
|
||||
# - name: "Test no warnings on load (#11713)"
|
||||
# run: |
|
||||
# python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
|
||||
# if: matrix.python_version == '3.9'
|
||||
|
||||
- name: "Test convert CLI"
|
||||
run: |
|
||||
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
|
||||
if: matrix.python_version == '3.9'
|
||||
|
||||
- name: "Test debug config CLI"
|
||||
run: |
|
||||
python -m spacy init config -p ner -l ca ner.cfg
|
||||
python -m spacy debug config ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy
|
||||
if: matrix.python_version == '3.9'
|
||||
|
||||
- name: "Test debug data CLI"
|
||||
run: |
|
||||
# will have errors due to sparse data, check for summary in output
|
||||
python -m spacy debug data ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy | grep -q Summary
|
||||
if: matrix.python_version == '3.9'
|
||||
|
||||
- name: "Test train CLI"
|
||||
run: |
|
||||
python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
|
||||
if: matrix.python_version == '3.9'
|
||||
|
||||
# - name: "Test assemble CLI"
|
||||
# run: |
|
||||
# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
|
||||
# PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
|
||||
# if: matrix.python_version == '3.9'
|
||||
#
|
||||
# - name: "Test assemble CLI vectors warning"
|
||||
# run: |
|
||||
# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
|
||||
# python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
|
||||
# if: matrix.python_version == '3.9'
|
||||
|
||||
- name: "Install test requirements"
|
||||
run: |
|
||||
python -m pip install -U -r requirements.txt
|
||||
|
||||
- name: "Run CPU tests"
|
||||
run: |
|
||||
python -m pytest --pyargs spacy -W error
|
||||
if: "!(startsWith(matrix.os, 'macos') && matrix.python_version == '3.11')"
|
||||
|
||||
- name: "Run CPU tests with thinc-apple-ops"
|
||||
run: |
|
||||
python -m pip install 'spacy[apple]'
|
||||
python -m pytest --pyargs spacy
|
||||
if: startsWith(matrix.os, 'macos') && matrix.python_version == '3.11'
|
33
.github/workflows/universe_validation.yml
vendored
Normal file
33
.github/workflows/universe_validation.yml
vendored
Normal file
|
@ -0,0 +1,33 @@
|
|||
name: universe validation
|
||||
|
||||
on:
|
||||
push:
|
||||
branches-ignore:
|
||||
- "spacy.io"
|
||||
- "nightly.spacy.io"
|
||||
- "v2.spacy.io"
|
||||
paths:
|
||||
- "website/meta/universe.json"
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened, edited]
|
||||
paths:
|
||||
- "website/meta/universe.json"
|
||||
|
||||
jobs:
|
||||
validate:
|
||||
name: Validate
|
||||
if: github.repository_owner == 'explosion'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Check out repo
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Configure Python version
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.7"
|
||||
architecture: x64
|
||||
|
||||
- name: Validate website/meta/universe.json
|
||||
run: |
|
||||
python .github/validate_universe_json.py website/meta/universe.json
|
|
@ -173,6 +173,11 @@ formatting and [`flake8`](http://flake8.pycqa.org/en/latest/) for linting its
|
|||
Python modules. If you've built spaCy from source, you'll already have both
|
||||
tools installed.
|
||||
|
||||
As a general rule of thumb, we use f-strings for any formatting of strings.
|
||||
One exception are calls to Python's `logging` functionality.
|
||||
To avoid unnecessary string conversions in these cases, we use string formatting
|
||||
templates with `%s` and `%d` etc.
|
||||
|
||||
**⚠️ Note that formatting and linting is currently only possible for Python
|
||||
modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.**
|
||||
|
||||
|
|
32
README.md
32
README.md
|
@ -16,6 +16,9 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
|
|||
model packaging, deployment and workflow management. spaCy is commercial
|
||||
open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
|
||||
|
||||
💥 **We'd love to hear more about your experience with spaCy!**
|
||||
[Fill out our survey here.](https://form.typeform.com/to/aMel9q9f)
|
||||
|
||||
💫 **Version 3.5 out now!**
|
||||
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
|
||||
|
||||
|
@ -32,19 +35,20 @@ open-source software, released under the [MIT license](https://github.com/explos
|
|||
|
||||
## 📖 Documentation
|
||||
|
||||
| Documentation | |
|
||||
| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! |
|
||||
| 📚 **[Usage Guides]** | How to use spaCy and its features. |
|
||||
| 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |
|
||||
| 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. |
|
||||
| 🎛 **[API Reference]** | The detailed reference for spaCy's API. |
|
||||
| 📦 **[Models]** | Download trained pipelines for spaCy. |
|
||||
| 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. |
|
||||
| 👩🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
|
||||
| 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
|
||||
| 🛠 **[Changelog]** | Changes and version history. |
|
||||
| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
|
||||
| Documentation | |
|
||||
| ----------------------------- | ---------------------------------------------------------------------- |
|
||||
| ⭐️ **[spaCy 101]** | New to spaCy? Here's everything you need to know! |
|
||||
| 📚 **[Usage Guides]** | How to use spaCy and its features. |
|
||||
| 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |
|
||||
| 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. |
|
||||
| 🎛 **[API Reference]** | The detailed reference for spaCy's API. |
|
||||
| 📦 **[Models]** | Download trained pipelines for spaCy. |
|
||||
| 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. |
|
||||
| ⚙️ **[spaCy VS Code Extension]** | Additional tooling and features for working with spaCy's config files. |
|
||||
| 👩🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
|
||||
| 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
|
||||
| 🛠 **[Changelog]** | Changes and version history. |
|
||||
| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
|
||||
| <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
|
||||
| <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-analysis)** |
|
||||
|
||||
|
@ -54,13 +58,13 @@ open-source software, released under the [MIT license](https://github.com/explos
|
|||
[api reference]: https://spacy.io/api/
|
||||
[models]: https://spacy.io/models
|
||||
[universe]: https://spacy.io/universe
|
||||
[spaCy VS Code Extension]: https://github.com/explosion/spacy-vscode
|
||||
[videos]: https://www.youtube.com/c/ExplosionAI
|
||||
[online course]: https://course.spacy.io
|
||||
[project templates]: https://github.com/explosion/projects
|
||||
[changelog]: https://spacy.io/usage#changelog
|
||||
[contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
|
||||
|
||||
|
||||
## 💬 Where to ask questions
|
||||
|
||||
The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).
|
||||
|
|
|
@ -1,103 +0,0 @@
|
|||
trigger:
|
||||
batch: true
|
||||
branches:
|
||||
include:
|
||||
- "*"
|
||||
exclude:
|
||||
- "spacy.io"
|
||||
- "nightly.spacy.io"
|
||||
- "v2.spacy.io"
|
||||
paths:
|
||||
exclude:
|
||||
- "website/*"
|
||||
- "*.md"
|
||||
- ".github/workflows/*"
|
||||
pr:
|
||||
paths:
|
||||
exclude:
|
||||
- "*.md"
|
||||
- "website/docs/*"
|
||||
- "website/src/*"
|
||||
- ".github/workflows/*"
|
||||
|
||||
jobs:
|
||||
# Perform basic checks for most important errors (syntax etc.) Uses the config
|
||||
# defined in .flake8 and overwrites the selected codes.
|
||||
- job: "Validate"
|
||||
pool:
|
||||
vmImage: "ubuntu-latest"
|
||||
steps:
|
||||
- task: UsePythonVersion@0
|
||||
inputs:
|
||||
versionSpec: "3.7"
|
||||
- script: |
|
||||
pip install flake8==5.0.4
|
||||
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
|
||||
displayName: "flake8"
|
||||
|
||||
- job: "Test"
|
||||
dependsOn: "Validate"
|
||||
strategy:
|
||||
matrix:
|
||||
# We're only running one platform per Python version to speed up builds
|
||||
Python36Linux:
|
||||
imageName: "ubuntu-20.04"
|
||||
python.version: "3.6"
|
||||
# Python36Windows:
|
||||
# imageName: "windows-latest"
|
||||
# python.version: "3.6"
|
||||
# Python36Mac:
|
||||
# imageName: "macos-latest"
|
||||
# python.version: "3.6"
|
||||
# Python37Linux:
|
||||
# imageName: "ubuntu-20.04"
|
||||
# python.version: "3.7"
|
||||
Python37Windows:
|
||||
imageName: "windows-latest"
|
||||
python.version: "3.7"
|
||||
# Python37Mac:
|
||||
# imageName: "macos-latest"
|
||||
# python.version: "3.7"
|
||||
# Python38Linux:
|
||||
# imageName: "ubuntu-latest"
|
||||
# python.version: "3.8"
|
||||
# Python38Windows:
|
||||
# imageName: "windows-latest"
|
||||
# python.version: "3.8"
|
||||
Python38Mac:
|
||||
imageName: "macos-latest"
|
||||
python.version: "3.8"
|
||||
Python39Linux:
|
||||
imageName: "ubuntu-latest"
|
||||
python.version: "3.9"
|
||||
# Python39Windows:
|
||||
# imageName: "windows-latest"
|
||||
# python.version: "3.9"
|
||||
# Python39Mac:
|
||||
# imageName: "macos-latest"
|
||||
# python.version: "3.9"
|
||||
# Python310Linux:
|
||||
# imageName: "ubuntu-latest"
|
||||
# python.version: "3.10"
|
||||
Python310Windows:
|
||||
imageName: "windows-latest"
|
||||
python.version: "3.10"
|
||||
# Python310Mac:
|
||||
# imageName: "macos-latest"
|
||||
# python.version: "3.10"
|
||||
Python311Linux:
|
||||
imageName: 'ubuntu-latest'
|
||||
python.version: '3.11'
|
||||
Python311Windows:
|
||||
imageName: 'windows-latest'
|
||||
python.version: '3.11'
|
||||
Python311Mac:
|
||||
imageName: 'macos-latest'
|
||||
python.version: '3.11'
|
||||
maxParallel: 4
|
||||
pool:
|
||||
vmImage: $(imageName)
|
||||
steps:
|
||||
- template: .github/azure-steps.yml
|
||||
parameters:
|
||||
python_version: '$(python.version)'
|
|
@ -5,7 +5,7 @@ requires = [
|
|||
"cymem>=2.0.2,<2.1.0",
|
||||
"preshed>=3.0.2,<3.1.0",
|
||||
"murmurhash>=0.28.0,<1.1.0",
|
||||
"thinc>=8.1.0,<8.2.0",
|
||||
"thinc>=8.1.8,<8.2.0",
|
||||
"numpy>=1.15.0",
|
||||
]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
|
|
@ -3,13 +3,13 @@ spacy-legacy>=3.0.11,<3.1.0
|
|||
spacy-loggers>=1.0.0,<2.0.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=8.1.0,<8.2.0
|
||||
thinc>=8.1.8,<8.2.0
|
||||
ml_datasets>=0.2.0,<0.3.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
wasabi>=0.9.1,<1.2.0
|
||||
srsly>=2.4.3,<3.0.0
|
||||
catalogue>=2.0.6,<2.1.0
|
||||
typer>=0.3.0,<0.8.0
|
||||
typer>=0.3.0,<0.10.0
|
||||
pathy>=0.10.0
|
||||
smart-open>=5.2.1,<7.0.0
|
||||
# Third party dependencies
|
||||
|
@ -31,10 +31,10 @@ pytest-timeout>=1.3.0,<2.0.0
|
|||
mock>=2.0.0,<3.0.0
|
||||
flake8>=3.8.0,<6.0.0
|
||||
hypothesis>=3.27.0,<7.0.0
|
||||
mypy>=0.990,<0.1000; platform_machine != "aarch64" and python_version >= "3.7"
|
||||
mypy>=0.990,<1.1.0; platform_machine != "aarch64" and python_version >= "3.7"
|
||||
types-dataclasses>=0.1.3; python_version < "3.7"
|
||||
types-mock>=0.1.1
|
||||
types-setuptools>=57.0.0
|
||||
types-requests
|
||||
types-setuptools>=57.0.0
|
||||
black>=22.0,<23.0
|
||||
black==22.3.0
|
||||
|
|
42
setup.cfg
42
setup.cfg
|
@ -39,7 +39,7 @@ setup_requires =
|
|||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
thinc>=8.1.0,<8.2.0
|
||||
thinc>=8.1.8,<8.2.0
|
||||
install_requires =
|
||||
# Our libraries
|
||||
spacy-legacy>=3.0.11,<3.1.0
|
||||
|
@ -47,12 +47,12 @@ install_requires =
|
|||
murmurhash>=0.28.0,<1.1.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=8.1.0,<8.2.0
|
||||
thinc>=8.1.8,<8.2.0
|
||||
wasabi>=0.9.1,<1.2.0
|
||||
srsly>=2.4.3,<3.0.0
|
||||
catalogue>=2.0.6,<2.1.0
|
||||
# Third-party dependencies
|
||||
typer>=0.3.0,<0.8.0
|
||||
typer>=0.3.0,<0.10.0
|
||||
pathy>=0.10.0
|
||||
smart-open>=5.2.1,<7.0.0
|
||||
tqdm>=4.38.0,<5.0.0
|
||||
|
@ -78,41 +78,41 @@ transformers =
|
|||
ray =
|
||||
spacy_ray>=0.1.0,<1.0.0
|
||||
cuda =
|
||||
cupy>=5.0.0b4,<12.0.0
|
||||
cupy>=5.0.0b4,<13.0.0
|
||||
cuda80 =
|
||||
cupy-cuda80>=5.0.0b4,<12.0.0
|
||||
cupy-cuda80>=5.0.0b4,<13.0.0
|
||||
cuda90 =
|
||||
cupy-cuda90>=5.0.0b4,<12.0.0
|
||||
cupy-cuda90>=5.0.0b4,<13.0.0
|
||||
cuda91 =
|
||||
cupy-cuda91>=5.0.0b4,<12.0.0
|
||||
cupy-cuda91>=5.0.0b4,<13.0.0
|
||||
cuda92 =
|
||||
cupy-cuda92>=5.0.0b4,<12.0.0
|
||||
cupy-cuda92>=5.0.0b4,<13.0.0
|
||||
cuda100 =
|
||||
cupy-cuda100>=5.0.0b4,<12.0.0
|
||||
cupy-cuda100>=5.0.0b4,<13.0.0
|
||||
cuda101 =
|
||||
cupy-cuda101>=5.0.0b4,<12.0.0
|
||||
cupy-cuda101>=5.0.0b4,<13.0.0
|
||||
cuda102 =
|
||||
cupy-cuda102>=5.0.0b4,<12.0.0
|
||||
cupy-cuda102>=5.0.0b4,<13.0.0
|
||||
cuda110 =
|
||||
cupy-cuda110>=5.0.0b4,<12.0.0
|
||||
cupy-cuda110>=5.0.0b4,<13.0.0
|
||||
cuda111 =
|
||||
cupy-cuda111>=5.0.0b4,<12.0.0
|
||||
cupy-cuda111>=5.0.0b4,<13.0.0
|
||||
cuda112 =
|
||||
cupy-cuda112>=5.0.0b4,<12.0.0
|
||||
cupy-cuda112>=5.0.0b4,<13.0.0
|
||||
cuda113 =
|
||||
cupy-cuda113>=5.0.0b4,<12.0.0
|
||||
cupy-cuda113>=5.0.0b4,<13.0.0
|
||||
cuda114 =
|
||||
cupy-cuda114>=5.0.0b4,<12.0.0
|
||||
cupy-cuda114>=5.0.0b4,<13.0.0
|
||||
cuda115 =
|
||||
cupy-cuda115>=5.0.0b4,<12.0.0
|
||||
cupy-cuda115>=5.0.0b4,<13.0.0
|
||||
cuda116 =
|
||||
cupy-cuda116>=5.0.0b4,<12.0.0
|
||||
cupy-cuda116>=5.0.0b4,<13.0.0
|
||||
cuda117 =
|
||||
cupy-cuda117>=5.0.0b4,<12.0.0
|
||||
cupy-cuda117>=5.0.0b4,<13.0.0
|
||||
cuda11x =
|
||||
cupy-cuda11x>=11.0.0,<12.0.0
|
||||
cupy-cuda11x>=11.0.0,<13.0.0
|
||||
cuda-autodetect =
|
||||
cupy-wheel>=11.0.0,<12.0.0
|
||||
cupy-wheel>=11.0.0,<13.0.0
|
||||
apple =
|
||||
thinc-apple-ops>=0.1.0.dev0,<1.0.0
|
||||
# Language tokenizers with external dependencies
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# fmt: off
|
||||
__title__ = "spacy"
|
||||
__version__ = "3.5.0"
|
||||
__version__ = "3.6.0.dev0"
|
||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||
__projects__ = "https://github.com/explosion/projects"
|
||||
|
|
|
@ -90,9 +90,9 @@ def parse_config_overrides(
|
|||
cli_overrides = _parse_overrides(args, is_cli=True)
|
||||
if cli_overrides:
|
||||
keys = [k for k in cli_overrides if k not in env_overrides]
|
||||
logger.debug(f"Config overrides from CLI: {keys}")
|
||||
logger.debug("Config overrides from CLI: %s", keys)
|
||||
if env_overrides:
|
||||
logger.debug(f"Config overrides from env variables: {list(env_overrides)}")
|
||||
logger.debug("Config overrides from env variables: %s", list(env_overrides))
|
||||
return {**cli_overrides, **env_overrides}
|
||||
|
||||
|
||||
|
|
|
@ -7,6 +7,7 @@ import srsly
|
|||
from wasabi import Printer, MESSAGES, msg
|
||||
import typer
|
||||
import math
|
||||
import numpy
|
||||
|
||||
from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
|
||||
from ._util import import_code, debug_cli, _format_number
|
||||
|
@ -17,6 +18,7 @@ from ..pipeline import TrainablePipe
|
|||
from ..pipeline._parser_internals import nonproj
|
||||
from ..pipeline._parser_internals.nonproj import DELIMITER
|
||||
from ..pipeline import Morphologizer, SpanCategorizer
|
||||
from ..pipeline._edit_tree_internals.edit_trees import EditTrees
|
||||
from ..morphology import Morphology
|
||||
from ..language import Language
|
||||
from ..util import registry, resolve_dot_names
|
||||
|
@ -335,7 +337,7 @@ def debug_data(
|
|||
show=verbose,
|
||||
)
|
||||
else:
|
||||
msg.good("Examples without ocurrences available for all labels")
|
||||
msg.good("Examples without occurrences available for all labels")
|
||||
|
||||
if "ner" in factory_names:
|
||||
# Get all unique NER labels present in the data
|
||||
|
@ -520,9 +522,13 @@ def debug_data(
|
|||
|
||||
if "tagger" in factory_names:
|
||||
msg.divider("Part-of-speech Tagging")
|
||||
label_list = [label for label in gold_train_data["tags"]]
|
||||
model_labels = _get_labels_from_model(nlp, "tagger")
|
||||
label_list, counts = zip(*gold_train_data["tags"].items())
|
||||
msg.info(f"{len(label_list)} label(s) in train data")
|
||||
p = numpy.array(counts)
|
||||
p = p / p.sum()
|
||||
norm_entropy = (-p * numpy.log2(p)).sum() / numpy.log2(len(label_list))
|
||||
msg.info(f"{norm_entropy} is the normalised label entropy")
|
||||
model_labels = _get_labels_from_model(nlp, "tagger")
|
||||
labels = set(label_list)
|
||||
missing_labels = model_labels - labels
|
||||
if missing_labels:
|
||||
|
@ -671,6 +677,59 @@ def debug_data(
|
|||
f"Found {gold_train_data['n_cycles']} projectivized train sentence(s) with cycles"
|
||||
)
|
||||
|
||||
if "trainable_lemmatizer" in factory_names:
|
||||
msg.divider("Trainable Lemmatizer")
|
||||
trees_train: Set[str] = gold_train_data["lemmatizer_trees"]
|
||||
trees_dev: Set[str] = gold_dev_data["lemmatizer_trees"]
|
||||
# This is necessary context when someone is attempting to interpret whether the
|
||||
# number of trees exclusively in the dev set is meaningful.
|
||||
msg.info(f"{len(trees_train)} lemmatizer trees generated from training data")
|
||||
msg.info(f"{len(trees_dev)} lemmatizer trees generated from dev data")
|
||||
dev_not_train = trees_dev - trees_train
|
||||
|
||||
if len(dev_not_train) != 0:
|
||||
pct = len(dev_not_train) / len(trees_dev)
|
||||
msg.info(
|
||||
f"{len(dev_not_train)} lemmatizer trees ({pct*100:.1f}% of dev trees)"
|
||||
" were found exclusively in the dev data."
|
||||
)
|
||||
else:
|
||||
# Would we ever expect this case? It seems like it would be pretty rare,
|
||||
# and we might actually want a warning?
|
||||
msg.info("All trees in dev data present in training data.")
|
||||
|
||||
if gold_train_data["n_low_cardinality_lemmas"] > 0:
|
||||
n = gold_train_data["n_low_cardinality_lemmas"]
|
||||
msg.warn(f"{n} training docs with 0 or 1 unique lemmas.")
|
||||
|
||||
if gold_dev_data["n_low_cardinality_lemmas"] > 0:
|
||||
n = gold_dev_data["n_low_cardinality_lemmas"]
|
||||
msg.warn(f"{n} dev docs with 0 or 1 unique lemmas.")
|
||||
|
||||
if gold_train_data["no_lemma_annotations"] > 0:
|
||||
n = gold_train_data["no_lemma_annotations"]
|
||||
msg.warn(f"{n} training docs with no lemma annotations.")
|
||||
else:
|
||||
msg.good("All training docs have lemma annotations.")
|
||||
|
||||
if gold_dev_data["no_lemma_annotations"] > 0:
|
||||
n = gold_dev_data["no_lemma_annotations"]
|
||||
msg.warn(f"{n} dev docs with no lemma annotations.")
|
||||
else:
|
||||
msg.good("All dev docs have lemma annotations.")
|
||||
|
||||
if gold_train_data["partial_lemma_annotations"] > 0:
|
||||
n = gold_train_data["partial_lemma_annotations"]
|
||||
msg.info(f"{n} training docs with partial lemma annotations.")
|
||||
else:
|
||||
msg.good("All training docs have complete lemma annotations.")
|
||||
|
||||
if gold_dev_data["partial_lemma_annotations"] > 0:
|
||||
n = gold_dev_data["partial_lemma_annotations"]
|
||||
msg.info(f"{n} dev docs with partial lemma annotations.")
|
||||
else:
|
||||
msg.good("All dev docs have complete lemma annotations.")
|
||||
|
||||
msg.divider("Summary")
|
||||
good_counts = msg.counts[MESSAGES.GOOD]
|
||||
warn_counts = msg.counts[MESSAGES.WARN]
|
||||
|
@ -732,7 +791,13 @@ def _compile_gold(
|
|||
"n_cats_multilabel": 0,
|
||||
"n_cats_bad_values": 0,
|
||||
"texts": set(),
|
||||
"lemmatizer_trees": set(),
|
||||
"no_lemma_annotations": 0,
|
||||
"partial_lemma_annotations": 0,
|
||||
"n_low_cardinality_lemmas": 0,
|
||||
}
|
||||
if "trainable_lemmatizer" in factory_names:
|
||||
trees = EditTrees(nlp.vocab.strings)
|
||||
for eg in examples:
|
||||
gold = eg.reference
|
||||
doc = eg.predicted
|
||||
|
@ -862,6 +927,25 @@ def _compile_gold(
|
|||
data["n_nonproj"] += 1
|
||||
if nonproj.contains_cycle(aligned_heads):
|
||||
data["n_cycles"] += 1
|
||||
if "trainable_lemmatizer" in factory_names:
|
||||
# from EditTreeLemmatizer._labels_from_data
|
||||
if all(token.lemma == 0 for token in gold):
|
||||
data["no_lemma_annotations"] += 1
|
||||
continue
|
||||
if any(token.lemma == 0 for token in gold):
|
||||
data["partial_lemma_annotations"] += 1
|
||||
lemma_set = set()
|
||||
for token in gold:
|
||||
if token.lemma != 0:
|
||||
lemma_set.add(token.lemma)
|
||||
tree_id = trees.add(token.text, token.lemma_)
|
||||
tree_str = trees.tree_to_str(tree_id)
|
||||
data["lemmatizer_trees"].add(tree_str)
|
||||
# We want to identify cases where lemmas aren't assigned
|
||||
# or are all assigned the same value, as this would indicate
|
||||
# an issue since we're expecting a large set of lemmas
|
||||
if len(lemma_set) < 2 and len(gold) > 1:
|
||||
data["n_low_cardinality_lemmas"] += 1
|
||||
return data
|
||||
|
||||
|
||||
|
|
|
@ -81,11 +81,8 @@ def download(
|
|||
|
||||
def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
|
||||
dl_tpl = "{m}-{v}/{m}-{v}{s}"
|
||||
egg_tpl = "#egg={m}=={v}"
|
||||
suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
|
||||
filename = dl_tpl.format(m=model_name, v=version, s=suffix)
|
||||
if sdist:
|
||||
filename += egg_tpl.format(m=model_name, v=version)
|
||||
return filename
|
||||
|
||||
|
||||
|
|
|
@ -27,6 +27,7 @@ def evaluate_cli(
|
|||
gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
|
||||
displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
|
||||
displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
|
||||
per_component: bool = Opt(False, "--per-component", "-P", help="Return scores per component, only applicable when an output JSON file is specified."),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
|
@ -50,6 +51,7 @@ def evaluate_cli(
|
|||
gold_preproc=gold_preproc,
|
||||
displacy_path=displacy_path,
|
||||
displacy_limit=displacy_limit,
|
||||
per_component=per_component,
|
||||
silent=False,
|
||||
)
|
||||
|
||||
|
@ -64,6 +66,7 @@ def evaluate(
|
|||
displacy_limit: int = 25,
|
||||
silent: bool = True,
|
||||
spans_key: str = "sc",
|
||||
per_component: bool = False,
|
||||
) -> Dict[str, Any]:
|
||||
msg = Printer(no_print=silent, pretty=not silent)
|
||||
fix_random_seed()
|
||||
|
@ -78,50 +81,61 @@ def evaluate(
|
|||
corpus = Corpus(data_path, gold_preproc=gold_preproc)
|
||||
nlp = util.load_model(model)
|
||||
dev_dataset = list(corpus(nlp))
|
||||
scores = nlp.evaluate(dev_dataset)
|
||||
metrics = {
|
||||
"TOK": "token_acc",
|
||||
"TAG": "tag_acc",
|
||||
"POS": "pos_acc",
|
||||
"MORPH": "morph_acc",
|
||||
"LEMMA": "lemma_acc",
|
||||
"UAS": "dep_uas",
|
||||
"LAS": "dep_las",
|
||||
"NER P": "ents_p",
|
||||
"NER R": "ents_r",
|
||||
"NER F": "ents_f",
|
||||
"TEXTCAT": "cats_score",
|
||||
"SENT P": "sents_p",
|
||||
"SENT R": "sents_r",
|
||||
"SENT F": "sents_f",
|
||||
"SPAN P": f"spans_{spans_key}_p",
|
||||
"SPAN R": f"spans_{spans_key}_r",
|
||||
"SPAN F": f"spans_{spans_key}_f",
|
||||
"SPEED": "speed",
|
||||
}
|
||||
results = {}
|
||||
data = {}
|
||||
for metric, key in metrics.items():
|
||||
if key in scores:
|
||||
if key == "cats_score":
|
||||
metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
|
||||
if isinstance(scores[key], (int, float)):
|
||||
if key == "speed":
|
||||
results[metric] = f"{scores[key]:.0f}"
|
||||
scores = nlp.evaluate(dev_dataset, per_component=per_component)
|
||||
if per_component:
|
||||
data = scores
|
||||
if output is None:
|
||||
msg.warn(
|
||||
"The per-component option is enabled but there is no output JSON file provided to save the scores to."
|
||||
)
|
||||
else:
|
||||
msg.info("Per-component scores will be saved to output JSON file.")
|
||||
else:
|
||||
metrics = {
|
||||
"TOK": "token_acc",
|
||||
"TAG": "tag_acc",
|
||||
"POS": "pos_acc",
|
||||
"MORPH": "morph_acc",
|
||||
"LEMMA": "lemma_acc",
|
||||
"UAS": "dep_uas",
|
||||
"LAS": "dep_las",
|
||||
"NER P": "ents_p",
|
||||
"NER R": "ents_r",
|
||||
"NER F": "ents_f",
|
||||
"TEXTCAT": "cats_score",
|
||||
"SENT P": "sents_p",
|
||||
"SENT R": "sents_r",
|
||||
"SENT F": "sents_f",
|
||||
"SPAN P": f"spans_{spans_key}_p",
|
||||
"SPAN R": f"spans_{spans_key}_r",
|
||||
"SPAN F": f"spans_{spans_key}_f",
|
||||
"SPEED": "speed",
|
||||
}
|
||||
results = {}
|
||||
data = {}
|
||||
for metric, key in metrics.items():
|
||||
if key in scores:
|
||||
if key == "cats_score":
|
||||
metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
|
||||
if isinstance(scores[key], (int, float)):
|
||||
if key == "speed":
|
||||
results[metric] = f"{scores[key]:.0f}"
|
||||
else:
|
||||
results[metric] = f"{scores[key]*100:.2f}"
|
||||
else:
|
||||
results[metric] = f"{scores[key]*100:.2f}"
|
||||
else:
|
||||
results[metric] = "-"
|
||||
data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
|
||||
results[metric] = "-"
|
||||
data[re.sub(r"[\s/]", "_", key.lower())] = scores[key]
|
||||
|
||||
msg.table(results, title="Results")
|
||||
data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)
|
||||
msg.table(results, title="Results")
|
||||
data = handle_scores_per_type(scores, data, spans_key=spans_key, silent=silent)
|
||||
|
||||
if displacy_path:
|
||||
factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
|
||||
docs = list(nlp.pipe(ex.reference.text for ex in dev_dataset[:displacy_limit]))
|
||||
render_deps = "parser" in factory_names
|
||||
render_ents = "ner" in factory_names
|
||||
render_spans = "spancat" in factory_names
|
||||
|
||||
render_parses(
|
||||
docs,
|
||||
displacy_path,
|
||||
|
@ -129,6 +143,7 @@ def evaluate(
|
|||
limit=displacy_limit,
|
||||
deps=render_deps,
|
||||
ents=render_ents,
|
||||
spans=render_spans,
|
||||
)
|
||||
msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
|
||||
|
||||
|
@ -182,6 +197,7 @@ def render_parses(
|
|||
limit: int = 250,
|
||||
deps: bool = True,
|
||||
ents: bool = True,
|
||||
spans: bool = True,
|
||||
):
|
||||
docs[0].user_data["title"] = model_name
|
||||
if ents:
|
||||
|
@ -195,6 +211,11 @@ def render_parses(
|
|||
with (output_path / "parses.html").open("w", encoding="utf8") as file_:
|
||||
file_.write(html)
|
||||
|
||||
if spans:
|
||||
html = displacy.render(docs[:limit], style="span", page=True)
|
||||
with (output_path / "spans.html").open("w", encoding="utf8") as file_:
|
||||
file_.write(html)
|
||||
|
||||
|
||||
def print_prf_per_type(
|
||||
msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str
|
||||
|
|
|
@ -35,7 +35,7 @@ def find_threshold_cli(
|
|||
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
use_gpu: int = Opt(_DEFAULTS["use_gpu"], "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
||||
gold_preproc: bool = Opt(_DEFAULTS["gold_preproc"], "--gold-preproc", "-G", help="Use gold preprocessing"),
|
||||
verbose: bool = Opt(False, "--silent", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
from typing import Optional, Dict, Any, Union, List
|
||||
import platform
|
||||
import pkg_resources
|
||||
import json
|
||||
from pathlib import Path
|
||||
from wasabi import Printer, MarkdownRenderer
|
||||
|
@ -10,6 +9,7 @@ from ._util import app, Arg, Opt, string_to_list
|
|||
from .download import get_model_filename, get_latest_version
|
||||
from .. import util
|
||||
from .. import about
|
||||
from ..compat import importlib_metadata
|
||||
|
||||
|
||||
@app.command("info")
|
||||
|
@ -137,15 +137,14 @@ def info_installed_model_url(model: str) -> Optional[str]:
|
|||
dist-info available.
|
||||
"""
|
||||
try:
|
||||
dist = pkg_resources.get_distribution(model)
|
||||
data = json.loads(dist.get_metadata("direct_url.json"))
|
||||
return data["url"]
|
||||
except pkg_resources.DistributionNotFound:
|
||||
# no such package
|
||||
return None
|
||||
dist = importlib_metadata.distribution(model)
|
||||
text = dist.read_text("direct_url.json")
|
||||
if isinstance(text, str):
|
||||
data = json.loads(text)
|
||||
return data["url"]
|
||||
except Exception:
|
||||
# something else, like no file or invalid JSON
|
||||
return None
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def info_model_url(model: str) -> Dict[str, Any]:
|
||||
|
|
|
@ -252,7 +252,7 @@ def get_third_party_dependencies(
|
|||
raise regerr from None
|
||||
module_name = func_info.get("module") # type: ignore[attr-defined]
|
||||
if module_name: # the code is part of a module, not a --code file
|
||||
modules.add(func_info["module"].split(".")[0]) # type: ignore[index]
|
||||
modules.add(func_info["module"].split(".")[0]) # type: ignore[union-attr]
|
||||
dependencies = []
|
||||
for module_name in modules:
|
||||
if module_name in distributions:
|
||||
|
|
|
@ -23,6 +23,7 @@ def pretrain_cli(
|
|||
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
|
||||
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
|
||||
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
||||
skip_last: bool = Opt(False, "--skip-last", "-L", help="Skip saving model-last.bin"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
|
@ -74,6 +75,7 @@ def pretrain_cli(
|
|||
epoch_resume=epoch_resume,
|
||||
use_gpu=use_gpu,
|
||||
silent=False,
|
||||
skip_last=skip_last,
|
||||
)
|
||||
msg.good("Successfully finished pretrain")
|
||||
|
||||
|
|
|
@ -39,14 +39,17 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
|
|||
# in the list.
|
||||
while commands:
|
||||
for i, cmd in enumerate(list(commands)):
|
||||
logger.debug(f"CMD: {cmd['name']}.")
|
||||
logger.debug("CMD: %s.", cmd["name"])
|
||||
deps = [project_dir / dep for dep in cmd.get("deps", [])]
|
||||
if all(dep.exists() for dep in deps):
|
||||
cmd_hash = get_command_hash("", "", deps, cmd["script"])
|
||||
for output_path in cmd.get("outputs", []):
|
||||
url = storage.pull(output_path, command_hash=cmd_hash)
|
||||
logger.debug(
|
||||
f"URL: {url} for {output_path} with command hash {cmd_hash}"
|
||||
"URL: %s for %s with command hash %s",
|
||||
url,
|
||||
output_path,
|
||||
cmd_hash,
|
||||
)
|
||||
yield url, output_path
|
||||
|
||||
|
@ -58,7 +61,7 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
|
|||
commands.pop(i)
|
||||
break
|
||||
else:
|
||||
logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs.")
|
||||
logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"])
|
||||
else:
|
||||
# If we didn't break the for loop, break the while loop.
|
||||
break
|
||||
|
|
|
@ -37,15 +37,15 @@ def project_push(project_dir: Path, remote: str):
|
|||
remote = config["remotes"][remote]
|
||||
storage = RemoteStorage(project_dir, remote)
|
||||
for cmd in config.get("commands", []):
|
||||
logger.debug(f"CMD: cmd['name']")
|
||||
logger.debug("CMD: %s", cmd["name"])
|
||||
deps = [project_dir / dep for dep in cmd.get("deps", [])]
|
||||
if any(not dep.exists() for dep in deps):
|
||||
logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs")
|
||||
logger.debug("Dependency missing. Skipping %s outputs", cmd["name"])
|
||||
continue
|
||||
cmd_hash = get_command_hash(
|
||||
"", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
|
||||
)
|
||||
logger.debug(f"CMD_HASH: {cmd_hash}")
|
||||
logger.debug("CMD_HASH: %s", cmd_hash)
|
||||
for output_path in cmd.get("outputs", []):
|
||||
output_loc = project_dir / output_path
|
||||
if output_loc.exists() and _is_not_empty_dir(output_loc):
|
||||
|
@ -55,7 +55,7 @@ def project_push(project_dir: Path, remote: str):
|
|||
content_hash=get_content_hash(output_loc),
|
||||
)
|
||||
logger.debug(
|
||||
f"URL: {url} for output {output_path} with cmd_hash {cmd_hash}"
|
||||
"URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash
|
||||
)
|
||||
yield output_path, url
|
||||
|
||||
|
|
|
@ -2,7 +2,6 @@ from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple
|
|||
import os.path
|
||||
from pathlib import Path
|
||||
|
||||
import pkg_resources
|
||||
from wasabi import msg
|
||||
from wasabi.util import locale_escape
|
||||
import sys
|
||||
|
@ -331,6 +330,7 @@ def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]:
|
|||
RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts
|
||||
exist.
|
||||
"""
|
||||
import pkg_resources
|
||||
|
||||
failed_pkgs_msgs: List[str] = []
|
||||
conflicting_pkgs_msgs: List[str] = []
|
||||
|
|
|
@ -3,7 +3,7 @@ the docs and the init config command. It encodes various best practices and
|
|||
can help generate the best possible configuration, given a user's requirements. #}
|
||||
{%- set use_transformer = hardware != "cpu" and transformer_data -%}
|
||||
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
|
||||
{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%}
|
||||
{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%}
|
||||
[paths]
|
||||
train = null
|
||||
dev = null
|
||||
|
@ -24,8 +24,11 @@ gpu_allocator = null
|
|||
lang = "{{ lang }}"
|
||||
{%- set has_textcat = ("textcat" in components or "textcat_multilabel" in components) -%}
|
||||
{%- set with_accuracy = optimize == "accuracy" -%}
|
||||
{%- set has_accurate_textcat = has_textcat and with_accuracy -%}
|
||||
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "trainable_lemmatizer" in components or "entity_linker" in components or has_accurate_textcat) -%}
|
||||
{# The BOW textcat doesn't need a source of features, so it can omit the
|
||||
tok2vec/transformer. #}
|
||||
{%- set with_accuracy_or_transformer = (use_transformer or with_accuracy) -%}
|
||||
{%- set textcat_needs_features = has_textcat and with_accuracy_or_transformer -%}
|
||||
{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%}
|
||||
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%}
|
||||
{%- else -%}
|
||||
{%- set full_pipeline = components -%}
|
||||
|
@ -156,6 +159,36 @@ grad_factor = 1.0
|
|||
sizes = [1,2,3]
|
||||
{% endif -%}
|
||||
|
||||
{% if "spancat_singlelabel" in components %}
|
||||
[components.spancat_singlelabel]
|
||||
factory = "spancat_singlelabel"
|
||||
negative_weight = 1.0
|
||||
allow_overlap = true
|
||||
scorer = {"@scorers":"spacy.spancat_scorer.v1"}
|
||||
spans_key = "sc"
|
||||
|
||||
[components.spancat_singlelabel.model]
|
||||
@architectures = "spacy.SpanCategorizer.v1"
|
||||
|
||||
[components.spancat_singlelabel.model.reducer]
|
||||
@layers = "spacy.mean_max_reducer.v1"
|
||||
hidden_size = 128
|
||||
|
||||
[components.spancat_singlelabel.model.scorer]
|
||||
@layers = "Softmax.v2"
|
||||
|
||||
[components.spancat_singlelabel.model.tok2vec]
|
||||
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||
grad_factor = 1.0
|
||||
|
||||
[components.spancat_singlelabel.model.tok2vec.pooling]
|
||||
@layers = "reduce_mean.v1"
|
||||
|
||||
[components.spancat_singlelabel.suggester]
|
||||
@misc = "spacy.ngram_suggester.v1"
|
||||
sizes = [1,2,3]
|
||||
{% endif %}
|
||||
|
||||
{% if "trainable_lemmatizer" in components -%}
|
||||
[components.trainable_lemmatizer]
|
||||
factory = "trainable_lemmatizer"
|
||||
|
@ -221,10 +254,16 @@ no_output_layer = false
|
|||
|
||||
{% else -%}
|
||||
[components.textcat.model]
|
||||
@architectures = "spacy.TextCatBOW.v2"
|
||||
@architectures = "spacy.TextCatCNN.v2"
|
||||
exclusive_classes = true
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
nO = null
|
||||
|
||||
[components.textcat.model.tok2vec]
|
||||
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||
grad_factor = 1.0
|
||||
|
||||
[components.textcat.model.tok2vec.pooling]
|
||||
@layers = "reduce_mean.v1"
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
|
||||
|
@ -252,10 +291,16 @@ no_output_layer = false
|
|||
|
||||
{% else -%}
|
||||
[components.textcat_multilabel.model]
|
||||
@architectures = "spacy.TextCatBOW.v2"
|
||||
@architectures = "spacy.TextCatCNN.v2"
|
||||
exclusive_classes = false
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
nO = null
|
||||
|
||||
[components.textcat_multilabel.model.tok2vec]
|
||||
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||
grad_factor = 1.0
|
||||
|
||||
[components.textcat_multilabel.model.tok2vec.pooling]
|
||||
@layers = "reduce_mean.v1"
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
|
||||
|
@ -286,6 +331,7 @@ maxout_pieces = 3
|
|||
{% if "morphologizer" in components %}
|
||||
[components.morphologizer]
|
||||
factory = "morphologizer"
|
||||
label_smoothing = 0.05
|
||||
|
||||
[components.morphologizer.model]
|
||||
@architectures = "spacy.Tagger.v2"
|
||||
|
@ -299,6 +345,7 @@ width = ${components.tok2vec.model.encode.width}
|
|||
{% if "tagger" in components %}
|
||||
[components.tagger]
|
||||
factory = "tagger"
|
||||
label_smoothing = 0.05
|
||||
|
||||
[components.tagger.model]
|
||||
@architectures = "spacy.Tagger.v2"
|
||||
|
@ -374,6 +421,33 @@ width = ${components.tok2vec.model.encode.width}
|
|||
sizes = [1,2,3]
|
||||
{% endif %}
|
||||
|
||||
{% if "spancat_singlelabel" in components %}
|
||||
[components.spancat_singlelabel]
|
||||
factory = "spancat_singlelabel"
|
||||
negative_weight = 1.0
|
||||
allow_overlap = true
|
||||
scorer = {"@scorers":"spacy.spancat_scorer.v1"}
|
||||
spans_key = "sc"
|
||||
|
||||
[components.spancat_singlelabel.model]
|
||||
@architectures = "spacy.SpanCategorizer.v1"
|
||||
|
||||
[components.spancat_singlelabel.model.reducer]
|
||||
@layers = "spacy.mean_max_reducer.v1"
|
||||
hidden_size = 128
|
||||
|
||||
[components.spancat_singlelabel.model.scorer]
|
||||
@layers = "Softmax.v2"
|
||||
|
||||
[components.spancat_singlelabel.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
width = ${components.tok2vec.model.encode.width}
|
||||
|
||||
[components.spancat_singlelabel.suggester]
|
||||
@misc = "spacy.ngram_suggester.v1"
|
||||
sizes = [1,2,3]
|
||||
{% endif %}
|
||||
|
||||
{% if "trainable_lemmatizer" in components -%}
|
||||
[components.trainable_lemmatizer]
|
||||
factory = "trainable_lemmatizer"
|
||||
|
|
|
@ -125,13 +125,17 @@ def app(environ, start_response):
|
|||
return [res]
|
||||
|
||||
|
||||
def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
|
||||
def parse_deps(
|
||||
orig_doc: Union[Doc, Span], options: Dict[str, Any] = {}
|
||||
) -> Dict[str, Any]:
|
||||
"""Generate dependency parse in {'words': [], 'arcs': []} format.
|
||||
|
||||
orig_doc (Doc): Document to parse.
|
||||
orig_doc (Union[Doc, Span]): Document to parse.
|
||||
options (Dict[str, Any]): Dependency parse specific visualisation options.
|
||||
RETURNS (dict): Generated dependency parse keyed by words and arcs.
|
||||
"""
|
||||
if isinstance(orig_doc, Span):
|
||||
orig_doc = orig_doc.as_doc()
|
||||
doc = Doc(orig_doc.vocab).from_bytes(
|
||||
orig_doc.to_bytes(exclude=["user_data", "user_hooks"])
|
||||
)
|
||||
|
|
|
@ -444,8 +444,7 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
E133 = ("The sum of prior probabilities for alias '{alias}' should not "
|
||||
"exceed 1, but found {sum}.")
|
||||
E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
|
||||
E139 = ("Knowledge base for component '{name}' is empty. Use the methods "
|
||||
"`kb.add_entity` and `kb.add_alias` to add entries.")
|
||||
E139 = ("Knowledge base for component '{name}' is empty.")
|
||||
E140 = ("The list of entities, prior probabilities and entity vectors "
|
||||
"should be of equal length.")
|
||||
E141 = ("Entity vectors should be of length {required} instead of the "
|
||||
|
@ -550,6 +549,8 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
"during training, make sure to include it in 'annotating components'")
|
||||
|
||||
# New errors added in v3.x
|
||||
E850 = ("The PretrainVectors objective currently only supports default or "
|
||||
"floret vectors, not {mode} vectors.")
|
||||
E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
|
||||
"but found value of '{val}'.")
|
||||
E852 = ("The tar file pulled from the remote attempted an unsafe path "
|
||||
|
@ -967,7 +968,8 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
E1049 = ("No available port found for displaCy on host {host}. Please specify an available port "
|
||||
"with `displacy.serve(doc, port=port)`")
|
||||
E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
|
||||
"or use `auto_switch_port=True` to pick an available port automatically.")
|
||||
"or use `auto_select_port=True` to pick an available port automatically.")
|
||||
E1051 = ("'allow_overlap' can only be False when max_positive is 1, but found 'max_positive': {max_positive}.")
|
||||
|
||||
|
||||
# Deprecated model shortcuts, only used in errors and warnings
|
||||
|
|
|
@ -46,6 +46,9 @@ cdef class InMemoryLookupKB(KnowledgeBase):
|
|||
self._alias_index = PreshMap(nr_aliases + 1)
|
||||
self._aliases_table = alias_vec(nr_aliases + 1)
|
||||
|
||||
def is_empty(self):
|
||||
return len(self) == 0
|
||||
|
||||
def __len__(self):
|
||||
return self.get_size_entities()
|
||||
|
||||
|
|
|
@ -2,12 +2,14 @@ from ...language import Language, BaseDefaults
|
|||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
|
||||
|
||||
class LatinDefaults(BaseDefaults):
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
stop_words = STOP_WORDS
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
|
||||
|
||||
class Latin(Language):
|
||||
|
|
22
spacy/lang/la/examples.py
Normal file
22
spacy/lang/la/examples.py
Normal file
|
@ -0,0 +1,22 @@
|
|||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
>>> from spacy.lang.la.examples import sentences
|
||||
>>> docs = nlp.pipe(sentences)
|
||||
"""
|
||||
|
||||
# > Caes. BG 1.1
|
||||
# > Cic. De Amic. 1
|
||||
# > V. Georg. 1.1-5
|
||||
# > Gen. 1:1
|
||||
# > Galileo, Sid. Nunc.
|
||||
# > van Schurman, Opusc. arg. 1
|
||||
|
||||
sentences = [
|
||||
"Gallia est omnis divisa in partes tres, quarum unam incolunt Belgae, aliam Aquitani, tertiam qui ipsorum lingua Celtae, nostra Galli appellantur.",
|
||||
"Q. Mucius augur multa narrare de C. Laelio socero suo memoriter et iucunde solebat nec dubitare illum in omni sermone appellare sapientem.",
|
||||
"Quid faciat laetas segetes, quo sidere terram uertere, Maecenas, ulmisque adiungere uitis conueniat, quae cura boum, qui cultus habendo sit pecori, apibus quanta experientia parcis, hinc canere incipiam",
|
||||
"In principio creavit Deus caelum et terram.",
|
||||
"Quo sumpto, intelligatur lunaris globus, cuius maximus circulus CAF, centrum vero E, dimetiens CF, qui ad Terre diametrum est ut duo ad septem.",
|
||||
"Cuicunque natura indita sunt principia, seu potentiae principiorum omnium artium, ac scientiarum, ei conveniunt omnes artes ac scientiae.",
|
||||
]
|
|
@ -6,17 +6,16 @@ roman_numerals_compile = re.compile(
|
|||
r"(?i)^(?=[MDCLXVI])M*(C[MD]|D?C{0,4})(X[CL]|L?X{0,4})(I[XV]|V?I{0,4})$"
|
||||
)
|
||||
|
||||
_num_words = set(
|
||||
"""
|
||||
unus una unum duo duae tres tria quattuor quinque sex septem octo novem decem
|
||||
_num_words = """unus una unum duo duae tres tria quattuor quinque sex septem octo novem decem undecim duodecim tredecim quattuordecim quindecim sedecim septendecim duodeviginti undeviginti viginti triginta quadraginta quinquaginta sexaginta septuaginta octoginta nonaginta centum ducenti ducentae ducenta trecenti trecentae trecenta quadringenti quadringentae quadringenta quingenti quingentae quingenta sescenti sescentae sescenta septingenti septingentae septingenta octingenti octingentae octingenta nongenti nongentae nongenta mille
|
||||
""".split()
|
||||
)
|
||||
|
||||
_ordinal_words = set(
|
||||
"""
|
||||
primus prima primum secundus secunda secundum tertius tertia tertium
|
||||
""".split()
|
||||
)
|
||||
_num_words += [item.replace("v", "u") for item in _num_words]
|
||||
_num_words = set(_num_words)
|
||||
|
||||
_ordinal_words = """primus prima primum secundus secunda secundum tertius tertia tertium quartus quarta quartum quintus quinta quintum sextus sexta sextum septimus septima septimum octavus octava octavum nonus nona nonum decimus decima decimum undecimus undecima undecimum duodecimus duodecima duodecimum duodevicesimus duodevicesima duodevicesimum undevicesimus undevicesima undevicesimum vicesimus vicesima vicesimum tricesimus tricesima tricesimum quadragesimus quadragesima quadragesimum quinquagesimus quinquagesima quinquagesimum sexagesimus sexagesima sexagesimum septuagesimus septuagesima septuagesimum octogesimus octogesima octogesimum nonagesimus nonagesima nonagesimum centesimus centesima centesimum ducentesimus ducentesima ducentesimum trecentesimus trecentesima trecentesimum quadringentesimus quadringentesima quadringentesimum quingentesimus quingentesima quingentesimum sescentesimus sescentesima sescentesimum septingentesimus septingentesima septingentesimum octingentesimus octingentesima octingentesimum nongentesimus nongentesima nongentesimum millesimus millesima millesimum""".split()
|
||||
|
||||
_ordinal_words += [item.replace("v", "u") for item in _ordinal_words]
|
||||
_ordinal_words = set(_ordinal_words)
|
||||
|
||||
|
||||
def like_num(text):
|
||||
|
|
85
spacy/lang/la/syntax_iterators.py
Normal file
85
spacy/lang/la/syntax_iterators.py
Normal file
|
@ -0,0 +1,85 @@
|
|||
from typing import Union, Iterator, Tuple
|
||||
from ...tokens import Doc, Span
|
||||
from ...symbols import NOUN, PROPN, PRON, VERB, AUX
|
||||
from ...errors import Errors
|
||||
|
||||
# NB: Modified from da on suggestion from https://github.com/explosion/spaCy/issues/7457#issuecomment-800349751 [PJB]
|
||||
|
||||
|
||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||
def is_verb_token(tok):
|
||||
return tok.pos in [VERB, AUX]
|
||||
|
||||
def get_left_bound(root):
|
||||
left_bound = root
|
||||
for tok in reversed(list(root.lefts)):
|
||||
if tok.dep in np_left_deps:
|
||||
left_bound = tok
|
||||
return left_bound
|
||||
|
||||
def get_right_bound(doc, root):
|
||||
right_bound = root
|
||||
for tok in root.rights:
|
||||
if tok.dep in np_right_deps:
|
||||
right = get_right_bound(doc, tok)
|
||||
if list(
|
||||
filter(
|
||||
lambda t: is_verb_token(t) or t.dep in stop_deps,
|
||||
doc[root.i : right.i],
|
||||
)
|
||||
):
|
||||
break
|
||||
else:
|
||||
right_bound = right
|
||||
return right_bound
|
||||
|
||||
def get_bounds(doc, root):
|
||||
return get_left_bound(root), get_right_bound(doc, root)
|
||||
|
||||
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||
|
||||
if not doc.has_annotation("DEP"):
|
||||
raise ValueError(Errors.E029)
|
||||
|
||||
if not len(doc):
|
||||
return
|
||||
|
||||
left_labels = [
|
||||
"det",
|
||||
"fixed",
|
||||
"nmod:poss",
|
||||
"amod",
|
||||
"flat",
|
||||
"goeswith",
|
||||
"nummod",
|
||||
"appos",
|
||||
]
|
||||
right_labels = [
|
||||
"fixed",
|
||||
"nmod:poss",
|
||||
"amod",
|
||||
"flat",
|
||||
"goeswith",
|
||||
"nummod",
|
||||
"appos",
|
||||
"nmod",
|
||||
"det",
|
||||
]
|
||||
stop_labels = ["punct"]
|
||||
|
||||
np_label = doc.vocab.strings.add("NP")
|
||||
np_left_deps = [doc.vocab.strings.add(label) for label in left_labels]
|
||||
np_right_deps = [doc.vocab.strings.add(label) for label in right_labels]
|
||||
stop_deps = [doc.vocab.strings.add(label) for label in stop_labels]
|
||||
|
||||
prev_right = -1
|
||||
for token in doclike:
|
||||
if token.pos in [PROPN, NOUN, PRON]:
|
||||
left, right = get_bounds(doc, token)
|
||||
if left.i <= prev_right:
|
||||
continue
|
||||
yield left.i, right.i + 1, np_label
|
||||
prev_right = right.i
|
||||
|
||||
|
||||
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
|
|
@ -12,65 +12,15 @@ _exc = {
|
|||
"uobiscum": [{ORTH: "uobis"}, {ORTH: "cum"}],
|
||||
}
|
||||
|
||||
for orth in [
|
||||
"A.",
|
||||
"Agr.",
|
||||
"Ap.",
|
||||
"C.",
|
||||
"Cn.",
|
||||
"D.",
|
||||
"F.",
|
||||
"K.",
|
||||
"L.",
|
||||
"M'.",
|
||||
"M.",
|
||||
"Mam.",
|
||||
"N.",
|
||||
"Oct.",
|
||||
"Opet.",
|
||||
"P.",
|
||||
"Paul.",
|
||||
"Post.",
|
||||
"Pro.",
|
||||
"Q.",
|
||||
"S.",
|
||||
"Ser.",
|
||||
"Sert.",
|
||||
"Sex.",
|
||||
"St.",
|
||||
"Sta.",
|
||||
"T.",
|
||||
"Ti.",
|
||||
"V.",
|
||||
"Vol.",
|
||||
"Vop.",
|
||||
"U.",
|
||||
"Uol.",
|
||||
"Uop.",
|
||||
"Ian.",
|
||||
"Febr.",
|
||||
"Mart.",
|
||||
"Apr.",
|
||||
"Mai.",
|
||||
"Iun.",
|
||||
"Iul.",
|
||||
"Aug.",
|
||||
"Sept.",
|
||||
"Oct.",
|
||||
"Nov.",
|
||||
"Nou.",
|
||||
"Dec.",
|
||||
"Non.",
|
||||
"Id.",
|
||||
"A.D.",
|
||||
"Coll.",
|
||||
"Cos.",
|
||||
"Ord.",
|
||||
"Pl.",
|
||||
"S.C.",
|
||||
"Suff.",
|
||||
"Trib.",
|
||||
]:
|
||||
_abbrev_exc = """A. A.D. Aa. Aaa. Acc. Agr. Ap. Apr. April. A.U.C. Aug. C. Caes. Caess. Cc. Cn. Coll. Cons. Conss. Cos. Coss. D. D.N. Dat. Dd. Dec. Decemb. Decembr. F. Feb. Febr. Februar. Ian. Id. Imp. Impp. Imppp. Iul. Iun. K. Kal. L. M'. M. Mai. Mam. Mar. Mart. Med. N. Nn. Nob. Non. Nov. Novemb. Oct. Octob. Opet. Ord. P. Paul. Pf. Pl. Plur. Post. Pp. Prid. Pro. Procos. Q. Quint. S. S.C. Scr. Sept. Septemb. Ser. Sert. Sex. Sext. St. Sta. Suff. T. Ti. Trib. V. Vol. Vop. Vv.""".split()
|
||||
|
||||
_abbrev_exc += [item.lower() for item in _abbrev_exc]
|
||||
_abbrev_exc += [item.upper() for item in _abbrev_exc]
|
||||
_abbrev_exc += [item.replace("v", "u").replace("V", "U") for item in _abbrev_exc]
|
||||
|
||||
_abbrev_exc += ["d.N."]
|
||||
|
||||
for orth in set(_abbrev_exc):
|
||||
_exc[orth] = [{ORTH: orth}]
|
||||
|
||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||
|
|
24
spacy/lang/ms/__init__.py
Normal file
24
spacy/lang/ms/__init__.py
Normal file
|
@ -0,0 +1,24 @@
|
|||
from .stop_words import STOP_WORDS
|
||||
from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ...language import Language, BaseDefaults
|
||||
|
||||
|
||||
class MalayDefaults(BaseDefaults):
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
prefixes = TOKENIZER_PREFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
infixes = TOKENIZER_INFIXES
|
||||
syntax_iterators = SYNTAX_ITERATORS
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
||||
class Malay(Language):
|
||||
lang = "ms"
|
||||
Defaults = MalayDefaults
|
||||
|
||||
|
||||
__all__ = ["Malay"]
|
1943
spacy/lang/ms/_tokenizer_exceptions_list.py
Normal file
1943
spacy/lang/ms/_tokenizer_exceptions_list.py
Normal file
File diff suppressed because it is too large
Load Diff
17
spacy/lang/ms/examples.py
Normal file
17
spacy/lang/ms/examples.py
Normal file
|
@ -0,0 +1,17 @@
|
|||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
>>> from spacy.lang.ms.examples import sentences
|
||||
>>> docs = nlp.pipe(sentences)
|
||||
"""
|
||||
|
||||
|
||||
sentences = [
|
||||
"Malaysia ialah sebuah negara yang terletak di Asia Tenggara.",
|
||||
"Berapa banyak pelajar yang akan menghadiri majlis perpisahan sekolah?",
|
||||
"Pengeluaran makanan berasal dari beberapa lokasi termasuk Cameron Highlands, Johor Bahru, dan Kuching.",
|
||||
"Syarikat XYZ telah menghasilkan 20,000 unit produk baharu dalam setahun terakhir",
|
||||
"Kuala Lumpur merupakan ibu negara Malaysia." "Kau berada di mana semalam?",
|
||||
"Siapa yang akan memimpin projek itu?",
|
||||
"Siapa perdana menteri Malaysia sekarang?",
|
||||
]
|
66
spacy/lang/ms/lex_attrs.py
Normal file
66
spacy/lang/ms/lex_attrs.py
Normal file
|
@ -0,0 +1,66 @@
|
|||
import unicodedata
|
||||
|
||||
from .punctuation import LIST_CURRENCY
|
||||
from ...attrs import IS_CURRENCY, LIKE_NUM
|
||||
|
||||
|
||||
_num_words = [
|
||||
"kosong",
|
||||
"satu",
|
||||
"dua",
|
||||
"tiga",
|
||||
"empat",
|
||||
"lima",
|
||||
"enam",
|
||||
"tujuh",
|
||||
"lapan",
|
||||
"sembilan",
|
||||
"sepuluh",
|
||||
"sebelas",
|
||||
"belas",
|
||||
"puluh",
|
||||
"ratus",
|
||||
"ribu",
|
||||
"juta",
|
||||
"billion",
|
||||
"trillion",
|
||||
"kuadrilion",
|
||||
"kuintilion",
|
||||
"sekstilion",
|
||||
"septilion",
|
||||
"oktilion",
|
||||
"nonilion",
|
||||
"desilion",
|
||||
]
|
||||
|
||||
|
||||
def like_num(text):
|
||||
if text.startswith(("+", "-", "±", "~")):
|
||||
text = text[1:]
|
||||
text = text.replace(",", "").replace(".", "")
|
||||
if text.isdigit():
|
||||
return True
|
||||
if text.count("/") == 1:
|
||||
num, denom = text.split("/")
|
||||
if num.isdigit() and denom.isdigit():
|
||||
return True
|
||||
if text.lower() in _num_words:
|
||||
return True
|
||||
if text.count("-") == 1:
|
||||
_, num = text.split("-")
|
||||
if num.isdigit() or num in _num_words:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def is_currency(text):
|
||||
if text in LIST_CURRENCY:
|
||||
return True
|
||||
|
||||
for char in text:
|
||||
if unicodedata.category(char) != "Sc":
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
LEX_ATTRS = {IS_CURRENCY: is_currency, LIKE_NUM: like_num}
|
61
spacy/lang/ms/punctuation.py
Normal file
61
spacy/lang/ms/punctuation.py
Normal file
|
@ -0,0 +1,61 @@
|
|||
from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
from ..char_classes import ALPHA, merge_chars, split_chars, _currency, _units
|
||||
|
||||
|
||||
_units = (
|
||||
_units + "s bit Gbps Mbps mbps Kbps kbps ƒ ppi px "
|
||||
"Hz kHz MHz GHz mAh "
|
||||
"ratus rb ribu ribuan "
|
||||
"juta jt jutaan mill?iar million bil[l]?iun bilyun billion "
|
||||
)
|
||||
_currency = _currency + r" USD RM MYR Rp IDR RMB SGD S\$"
|
||||
_months = (
|
||||
"Januari Februari Mac April Mei Jun Julai Ogos September "
|
||||
"Oktober November Disember Januari Februari Mac Mei Jun "
|
||||
"Julai Ogos Oktober Disember Jan Feb Mac Jun Julai Ogos Sept "
|
||||
"Okt Nov Dis"
|
||||
)
|
||||
|
||||
|
||||
UNITS = merge_chars(_units)
|
||||
CURRENCY = merge_chars(_currency)
|
||||
HTML_PREFIX = r"<(b|strong|i|em|p|span|div|br)\s?/>|<a([^>]+)>"
|
||||
HTML_SUFFIX = r"</(b|strong|i|em|p|span|div|a)>"
|
||||
MONTHS = merge_chars(_months)
|
||||
LIST_CURRENCY = split_chars(_currency)
|
||||
|
||||
_prefixes = list(TOKENIZER_PREFIXES)
|
||||
_prefixes.remove("#") # hashtag
|
||||
_prefixes = _prefixes + LIST_CURRENCY + [HTML_PREFIX] + ["/", "—"]
|
||||
|
||||
_suffixes = (
|
||||
TOKENIZER_SUFFIXES
|
||||
+ [r"\-[Nn]ya", "-[KkMm]u", "[—-]"]
|
||||
+ [
|
||||
# disabled: variable width currency variable
|
||||
# r"(?<={c})(?:[0-9]+)".format(c=CURRENCY),
|
||||
r"(?<=[0-9])(?:{u})".format(u=UNITS),
|
||||
r"(?<=[0-9])%",
|
||||
# disabled: variable width HTML_SUFFIX variable
|
||||
# r"(?<=[0-9{a}]{h})(?:[\.,:-])".format(a=ALPHA, h=HTML_SUFFIX),
|
||||
r"(?<=[0-9{a}])(?:{h})".format(a=ALPHA, h=HTML_SUFFIX),
|
||||
]
|
||||
)
|
||||
|
||||
_infixes = TOKENIZER_INFIXES + [
|
||||
r"(?<=[0-9])[\\/](?=[0-9%-])",
|
||||
r"(?<=[0-9])%(?=[{a}0-9/])".format(a=ALPHA),
|
||||
# disabled: variable width units variable
|
||||
# r"(?<={u})[\/-](?=[0-9])".format(u=UNITS),
|
||||
# disabled: variable width months variable
|
||||
# r"(?<={m})[\/-](?=[0-9])".format(m=MONTHS),
|
||||
r'(?<=[0-9)][.,])"(?=[0-9])',
|
||||
r'(?<=[{a})][.,\'])["—](?=[{a}])'.format(a=ALPHA),
|
||||
r"(?<=[{a}])-(?=[0-9])".format(a=ALPHA),
|
||||
r"(?<=[0-9])-(?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}])[\/-](?={c}|[{a}])".format(a=ALPHA, c=CURRENCY),
|
||||
]
|
||||
|
||||
TOKENIZER_PREFIXES = _prefixes
|
||||
TOKENIZER_SUFFIXES = _suffixes
|
||||
TOKENIZER_INFIXES = _infixes
|
118
spacy/lang/ms/stop_words.py
Normal file
118
spacy/lang/ms/stop_words.py
Normal file
|
@ -0,0 +1,118 @@
|
|||
STOP_WORDS = set(
|
||||
"""
|
||||
ada adalah adanya adapun agak agaknya agar akan akankah akhir akhiri akhirnya
|
||||
aku akulah amat amatlah anda andalah antar antara antaranya apa apaan apabila
|
||||
apakah apalagi apatah artinya asal asalkan atas atau ataukah ataupun awal
|
||||
awalnya
|
||||
|
||||
bagai bagaikan bagaimana bagaimanakah bagaimanapun bagi bagian bahkan bahwa
|
||||
bahwasanya baik bakal bakalan balik banyak bapak baru bawah beberapa begini
|
||||
beginian beginikah beginilah begitu begitukah begitulah begitupun bekerja
|
||||
belakang belakangan belum belumlah benar benarkah benarlah berada berakhir
|
||||
berakhirlah berakhirnya berapa berapakah berapalah berapapun berarti berawal
|
||||
berbagai berdatangan beri berikan berikut berikutnya berjumlah berkali-kali
|
||||
berkata berkehendak berkeinginan berkenaan berlainan berlalu berlangsung
|
||||
berlebihan bermacam bermacam-macam bermaksud bermula bersama bersama-sama
|
||||
bersiap bersiap-siap bertanya bertanya-tanya berturut berturut-turut bertutur
|
||||
berujar berupa besar betul betulkah biasa biasanya bila bilakah bisa bisakah
|
||||
boleh bolehkah bolehlah buat bukan bukankah bukanlah bukannya bulan bung
|
||||
|
||||
cara caranya cukup cukupkah cukuplah cuma
|
||||
|
||||
dahulu dalam dan dapat dari daripada datang dekat demi demikian demikianlah
|
||||
dengan depan di dia diakhiri diakhirinya dialah diantara diantaranya diberi
|
||||
diberikan diberikannya dibuat dibuatnya didapat didatangkan digunakan
|
||||
diibaratkan diibaratkannya diingat diingatkan diinginkan dijawab dijelaskan
|
||||
dijelaskannya dikarenakan dikatakan dikatakannya dikerjakan diketahui
|
||||
diketahuinya dikira dilakukan dilalui dilihat dimaksud dimaksudkan
|
||||
dimaksudkannya dimaksudnya diminta dimintai dimisalkan dimulai dimulailah
|
||||
dimulainya dimungkinkan dini dipastikan diperbuat diperbuatnya dipergunakan
|
||||
diperkirakan diperlihatkan diperlukan diperlukannya dipersoalkan dipertanyakan
|
||||
dipunyai diri dirinya disampaikan disebut disebutkan disebutkannya disini
|
||||
disinilah ditambahkan ditandaskan ditanya ditanyai ditanyakan ditegaskan
|
||||
ditujukan ditunjuk ditunjuki ditunjukkan ditunjukkannya ditunjuknya dituturkan
|
||||
dituturkannya diucapkan diucapkannya diungkapkan dong dua dulu
|
||||
|
||||
empat enggak enggaknya entah entahlah
|
||||
|
||||
guna gunakan
|
||||
|
||||
hal hampir hanya hanyalah hari harus haruslah harusnya hendak hendaklah
|
||||
hendaknya hingga
|
||||
|
||||
ia ialah ibarat ibaratkan ibaratnya ibu ikut ingat ingat-ingat ingin inginkah
|
||||
inginkan ini inikah inilah itu itukah itulah
|
||||
|
||||
jadi jadilah jadinya jangan jangankan janganlah jauh jawab jawaban jawabnya
|
||||
jelas jelaskan jelaslah jelasnya jika jikalau juga jumlah jumlahnya justru
|
||||
|
||||
kala kalau kalaulah kalaupun kalian kami kamilah kamu kamulah kan kapan
|
||||
kapankah kapanpun karena karenanya kasus kata katakan katakanlah katanya ke
|
||||
keadaan kebetulan kecil kedua keduanya keinginan kelamaan kelihatan
|
||||
kelihatannya kelima keluar kembali kemudian kemungkinan kemungkinannya kenapa
|
||||
kepada kepadanya kesampaian keseluruhan keseluruhannya keterlaluan ketika
|
||||
khususnya kini kinilah kira kira-kira kiranya kita kitalah kok kurang
|
||||
|
||||
lagi lagian lah lain lainnya lalu lama lamanya lanjut lanjutnya lebih lewat
|
||||
lima luar
|
||||
|
||||
macam maka makanya makin malah malahan mampu mampukah mana manakala manalagi
|
||||
masa masalah masalahnya masih masihkah masing masing-masing mau maupun
|
||||
melainkan melakukan melalui melihat melihatnya memang memastikan memberi
|
||||
memberikan membuat memerlukan memihak meminta memintakan memisalkan memperbuat
|
||||
mempergunakan memperkirakan memperlihatkan mempersiapkan mempersoalkan
|
||||
mempertanyakan mempunyai memulai memungkinkan menaiki menambahkan menandaskan
|
||||
menanti menanti-nanti menantikan menanya menanyai menanyakan mendapat
|
||||
mendapatkan mendatang mendatangi mendatangkan menegaskan mengakhiri mengapa
|
||||
mengatakan mengatakannya mengenai mengerjakan mengetahui menggunakan
|
||||
menghendaki mengibaratkan mengibaratkannya mengingat mengingatkan menginginkan
|
||||
mengira mengucapkan mengucapkannya mengungkapkan menjadi menjawab menjelaskan
|
||||
menuju menunjuk menunjuki menunjukkan menunjuknya menurut menuturkan
|
||||
menyampaikan menyangkut menyatakan menyebutkan menyeluruh menyiapkan merasa
|
||||
mereka merekalah merupakan meski meskipun meyakini meyakinkan minta mirip
|
||||
misal misalkan misalnya mula mulai mulailah mulanya mungkin mungkinkah
|
||||
|
||||
nah naik namun nanti nantinya nyaris nyatanya
|
||||
|
||||
oleh olehnya
|
||||
|
||||
pada padahal padanya pak paling panjang pantas para pasti pastilah penting
|
||||
pentingnya per percuma perlu perlukah perlunya pernah persoalan pertama
|
||||
pertama-tama pertanyaan pertanyakan pihak pihaknya pukul pula pun punya
|
||||
|
||||
rasa rasanya rata rupanya
|
||||
|
||||
saat saatnya saja sajalah saling sama sama-sama sambil sampai sampai-sampai
|
||||
sampaikan sana sangat sangatlah satu saya sayalah se sebab sebabnya sebagai
|
||||
sebagaimana sebagainya sebagian sebaik sebaik-baiknya sebaiknya sebaliknya
|
||||
sebanyak sebegini sebegitu sebelum sebelumnya sebenarnya seberapa sebesar
|
||||
sebetulnya sebisanya sebuah sebut sebutlah sebutnya secara secukupnya sedang
|
||||
sedangkan sedemikian sedikit sedikitnya seenaknya segala segalanya segera
|
||||
seharusnya sehingga seingat sejak sejauh sejenak sejumlah sekadar sekadarnya
|
||||
sekali sekali-kali sekalian sekaligus sekalipun sekarang sekarang sekecil
|
||||
seketika sekiranya sekitar sekitarnya sekurang-kurangnya sekurangnya sela
|
||||
selain selaku selalu selama selama-lamanya selamanya selanjutnya seluruh
|
||||
seluruhnya semacam semakin semampu semampunya semasa semasih semata semata-mata
|
||||
semaunya sementara semisal semisalnya sempat semua semuanya semula sendiri
|
||||
sendirian sendirinya seolah seolah-olah seorang sepanjang sepantasnya
|
||||
sepantasnyalah seperlunya seperti sepertinya sepihak sering seringnya serta
|
||||
serupa sesaat sesama sesampai sesegera sesekali seseorang sesuatu sesuatunya
|
||||
sesudah sesudahnya setelah setempat setengah seterusnya setiap setiba setibanya
|
||||
setidak-tidaknya setidaknya setinggi seusai sewaktu siap siapa siapakah
|
||||
siapapun sini sinilah soal soalnya suatu sudah sudahkah sudahlah supaya
|
||||
|
||||
tadi tadinya tahu tahun tak tambah tambahnya tampak tampaknya tandas tandasnya
|
||||
tanpa tanya tanyakan tanyanya tapi tegas tegasnya telah tempat tengah tentang
|
||||
tentu tentulah tentunya tepat terakhir terasa terbanyak terdahulu terdapat
|
||||
terdiri terhadap terhadapnya teringat teringat-ingat terjadi terjadilah
|
||||
terjadinya terkira terlalu terlebih terlihat termasuk ternyata tersampaikan
|
||||
tersebut tersebutlah tertentu tertuju terus terutama tetap tetapi tiap tiba
|
||||
tiba-tiba tidak tidakkah tidaklah tiga tinggi toh tunjuk turut tutur tuturnya
|
||||
|
||||
ucap ucapnya ujar ujarnya umum umumnya ungkap ungkapnya untuk usah usai
|
||||
|
||||
waduh wah wahai waktu waktunya walau walaupun wong
|
||||
|
||||
yaitu yakin yakni yang
|
||||
""".split()
|
||||
)
|
41
spacy/lang/ms/syntax_iterators.py
Normal file
41
spacy/lang/ms/syntax_iterators.py
Normal file
|
@ -0,0 +1,41 @@
|
|||
from typing import Union, Iterator, Tuple
|
||||
|
||||
from ...symbols import NOUN, PROPN, PRON
|
||||
from ...errors import Errors
|
||||
from ...tokens import Doc, Span
|
||||
|
||||
|
||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
||||
"""
|
||||
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||
"""
|
||||
# fmt: off
|
||||
labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
|
||||
# fmt: on
|
||||
doc = doclike.doc # Ensure works on both Doc and Span.
|
||||
if not doc.has_annotation("DEP"):
|
||||
raise ValueError(Errors.E029)
|
||||
np_deps = [doc.vocab.strings[label] for label in labels]
|
||||
conj = doc.vocab.strings.add("conj")
|
||||
np_label = doc.vocab.strings.add("NP")
|
||||
prev_end = -1
|
||||
for i, word in enumerate(doclike):
|
||||
if word.pos not in (NOUN, PROPN, PRON):
|
||||
continue
|
||||
# Prevent nested chunks from being produced
|
||||
if word.left_edge.i <= prev_end:
|
||||
continue
|
||||
if word.dep in np_deps:
|
||||
prev_end = word.right_edge.i
|
||||
yield word.left_edge.i, word.right_edge.i + 1, np_label
|
||||
elif word.dep == conj:
|
||||
head = word.head
|
||||
while head.dep == conj and head.head.i < head.i:
|
||||
head = head.head
|
||||
# If the head is an NP, and we're coordinated to it, we're an NP
|
||||
if head.dep in np_deps:
|
||||
prev_end = word.right_edge.i
|
||||
yield word.left_edge.i, word.right_edge.i + 1, np_label
|
||||
|
||||
|
||||
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
|
1533
spacy/lang/ms/tokenizer_exceptions.py
Normal file
1533
spacy/lang/ms/tokenizer_exceptions.py
Normal file
File diff suppressed because it is too large
Load Diff
|
@ -1,11 +1,14 @@
|
|||
from .stop_words import STOP_WORDS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||
from ...language import Language, BaseDefaults
|
||||
|
||||
|
||||
class SerbianDefaults(BaseDefaults):
|
||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
||||
infixes = TOKENIZER_INFIXES
|
||||
suffixes = TOKENIZER_SUFFIXES
|
||||
lex_attr_getters = LEX_ATTRS
|
||||
stop_words = STOP_WORDS
|
||||
|
||||
|
|
36
spacy/lang/sr/punctuation.py
Normal file
36
spacy/lang/sr/punctuation.py
Normal file
|
@ -0,0 +1,36 @@
|
|||
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
|
||||
from ..char_classes import CURRENCY, UNITS, PUNCT
|
||||
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
||||
|
||||
|
||||
_infixes = (
|
||||
LIST_ELLIPSES
|
||||
+ LIST_ICONS
|
||||
+ [
|
||||
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
|
||||
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
|
||||
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
|
||||
),
|
||||
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
|
||||
]
|
||||
)
|
||||
|
||||
_suffixes = (
|
||||
LIST_PUNCT
|
||||
+ LIST_ELLIPSES
|
||||
+ LIST_QUOTES
|
||||
+ LIST_ICONS
|
||||
+ [
|
||||
r"(?<=[0-9])\+",
|
||||
r"(?<=°[FfCcKk])\.",
|
||||
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
|
||||
r"(?<=[0-9])(?:{u})".format(u=UNITS),
|
||||
r"(?<=[{a}{e}{p}(?:{q})])\.".format(
|
||||
a=ALPHA, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
TOKENIZER_INFIXES = _infixes
|
||||
TOKENIZER_SUFFIXES = _suffixes
|
|
@ -6,10 +6,7 @@ from .lex_attrs import LEX_ATTRS
|
|||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ...language import Language, BaseDefaults
|
||||
from ...pipeline import Lemmatizer
|
||||
|
||||
|
||||
# Punctuation stolen from Danish
|
||||
from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
||||
|
||||
|
||||
class SwedishDefaults(BaseDefaults):
|
||||
|
|
33
spacy/lang/sv/punctuation.py
Normal file
33
spacy/lang/sv/punctuation.py
Normal file
|
@ -0,0 +1,33 @@
|
|||
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
|
||||
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
|
||||
from ..punctuation import TOKENIZER_SUFFIXES
|
||||
|
||||
|
||||
_quotes = CONCAT_QUOTES.replace("'", "")
|
||||
|
||||
_infixes = (
|
||||
LIST_ELLIPSES
|
||||
+ LIST_ICONS
|
||||
+ [
|
||||
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
|
||||
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}])[<>=](?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}]):(?=[{a}])".format(a=ALPHA_UPPER),
|
||||
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
|
||||
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}0-9])[<>=/](?=[{a}])".format(a=ALPHA),
|
||||
r"(?<=[{a}0-9]):(?=[{a}])".format(a=ALPHA_UPPER),
|
||||
]
|
||||
)
|
||||
|
||||
_suffixes = [
|
||||
suffix
|
||||
for suffix in TOKENIZER_SUFFIXES
|
||||
if suffix not in ["'s", "'S", "’s", "’S", r"\'"]
|
||||
]
|
||||
_suffixes += [r"(?<=[^sSxXzZ])\'"]
|
||||
|
||||
|
||||
TOKENIZER_INFIXES = _infixes
|
||||
TOKENIZER_SUFFIXES = _suffixes
|
|
@ -104,7 +104,7 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
|
|||
|
||||
@registry.misc("spacy.LookupsDataLoader.v1")
|
||||
def load_lookups_data(lang, tables):
|
||||
util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
|
||||
util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables)
|
||||
lookups = load_lookups(lang=lang, tables=tables)
|
||||
return lookups
|
||||
|
||||
|
@ -1372,6 +1372,7 @@ class Language:
|
|||
scorer: Optional[Scorer] = None,
|
||||
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
|
||||
scorer_cfg: Optional[Dict[str, Any]] = None,
|
||||
per_component: bool = False,
|
||||
) -> Dict[str, Any]:
|
||||
"""Evaluate a model's pipeline components.
|
||||
|
||||
|
@ -1383,6 +1384,8 @@ class Language:
|
|||
arguments for specific components.
|
||||
scorer_cfg (dict): An optional dictionary with extra keyword arguments
|
||||
for the scorer.
|
||||
per_component (bool): Whether to return the scores keyed by component
|
||||
name. Defaults to False.
|
||||
|
||||
RETURNS (Scorer): The scorer containing the evaluation results.
|
||||
|
||||
|
@ -1415,7 +1418,7 @@ class Language:
|
|||
for eg, doc in zip(examples, docs):
|
||||
eg.predicted = doc
|
||||
end_time = timer()
|
||||
results = scorer.score(examples)
|
||||
results = scorer.score(examples, per_component=per_component)
|
||||
n_words = sum(len(eg.predicted) for eg in examples)
|
||||
results["speed"] = n_words / (end_time - start_time)
|
||||
return results
|
||||
|
@ -1969,7 +1972,7 @@ class Language:
|
|||
pipe = self.get_pipe(pipe_name)
|
||||
pipe_cfg = self._pipe_configs[pipe_name]
|
||||
if listeners:
|
||||
util.logger.debug(f"Replacing listeners of component '{pipe_name}'")
|
||||
util.logger.debug("Replacing listeners of component '%s'", pipe_name)
|
||||
if len(list(listeners)) != len(pipe_listeners):
|
||||
# The number of listeners defined in the component model doesn't
|
||||
# match the listeners to replace, so we won't be able to update
|
||||
|
|
|
@ -25,7 +25,8 @@ class Lexeme:
|
|||
def orth_(self) -> str: ...
|
||||
@property
|
||||
def text(self) -> str: ...
|
||||
lower: str
|
||||
orth: int
|
||||
lower: int
|
||||
norm: int
|
||||
shape: int
|
||||
prefix: int
|
||||
|
|
|
@ -199,7 +199,7 @@ cdef class Lexeme:
|
|||
return self.orth_
|
||||
|
||||
property lower:
|
||||
"""RETURNS (str): Lowercase form of the lexeme."""
|
||||
"""RETURNS (uint64): Lowercase form of the lexeme."""
|
||||
def __get__(self):
|
||||
return self.c.lower
|
||||
|
||||
|
|
|
@ -82,8 +82,12 @@ cdef class DependencyMatcher:
|
|||
"$-": self._imm_left_sib,
|
||||
"$++": self._right_sib,
|
||||
"$--": self._left_sib,
|
||||
">+": self._imm_right_child,
|
||||
">-": self._imm_left_child,
|
||||
">++": self._right_child,
|
||||
">--": self._left_child,
|
||||
"<+": self._imm_right_parent,
|
||||
"<-": self._imm_left_parent,
|
||||
"<++": self._right_parent,
|
||||
"<--": self._left_parent,
|
||||
}
|
||||
|
@ -427,11 +431,33 @@ cdef class DependencyMatcher:
|
|||
def _left_sib(self, doc, node):
|
||||
return [doc[child.i] for child in doc[node].head.children if child.i < node]
|
||||
|
||||
def _imm_right_child(self, doc, node):
|
||||
for child in doc[node].rights:
|
||||
if child.i == node + 1:
|
||||
return [doc[child.i]]
|
||||
return []
|
||||
|
||||
def _imm_left_child(self, doc, node):
|
||||
for child in doc[node].lefts:
|
||||
if child.i == node - 1:
|
||||
return [doc[child.i]]
|
||||
return []
|
||||
|
||||
def _right_child(self, doc, node):
|
||||
return [doc[child.i] for child in doc[node].children if child.i > node]
|
||||
return [child for child in doc[node].rights]
|
||||
|
||||
def _left_child(self, doc, node):
|
||||
return [doc[child.i] for child in doc[node].children if child.i < node]
|
||||
return [child for child in doc[node].lefts]
|
||||
|
||||
def _imm_right_parent(self, doc, node):
|
||||
if doc[node].head.i == node + 1:
|
||||
return [doc[node].head]
|
||||
return []
|
||||
|
||||
def _imm_left_parent(self, doc, node):
|
||||
if doc[node].head.i == node - 1:
|
||||
return [doc[node].head]
|
||||
return []
|
||||
|
||||
def _right_parent(self, doc, node):
|
||||
if doc[node].head.i > node:
|
||||
|
|
|
@ -828,6 +828,11 @@ def _get_attr_values(spec, string_store):
|
|||
return attr_values
|
||||
|
||||
|
||||
def _predicate_cache_key(attr, predicate, value, *, regex=False, fuzzy=None):
|
||||
# tuple order affects performance
|
||||
return (attr, regex, fuzzy, predicate, srsly.json_dumps(value, sort_keys=True))
|
||||
|
||||
|
||||
# These predicate helper classes are used to match the REGEX, IN, >= etc
|
||||
# extensions to the matcher introduced in #3173.
|
||||
|
||||
|
@ -847,7 +852,7 @@ class _FuzzyPredicate:
|
|||
fuzz = self.predicate[len("FUZZY"):] # number after prefix
|
||||
self.fuzzy = int(fuzz) if fuzz else -1
|
||||
self.fuzzy_compare = fuzzy_compare
|
||||
self.key = (self.attr, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
||||
self.key = _predicate_cache_key(self.attr, self.predicate, value, fuzzy=self.fuzzy)
|
||||
|
||||
def __call__(self, Token token):
|
||||
if self.is_extension:
|
||||
|
@ -869,7 +874,7 @@ class _RegexPredicate:
|
|||
self.value = re.compile(value)
|
||||
self.predicate = predicate
|
||||
self.is_extension = is_extension
|
||||
self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
||||
self.key = _predicate_cache_key(self.attr, self.predicate, value)
|
||||
if self.predicate not in self.operators:
|
||||
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
|
||||
|
||||
|
@ -905,7 +910,7 @@ class _SetPredicate:
|
|||
self.value = set(get_string_id(v) for v in value)
|
||||
self.predicate = predicate
|
||||
self.is_extension = is_extension
|
||||
self.key = (self.attr, self.regex, self.fuzzy, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
||||
self.key = _predicate_cache_key(self.attr, self.predicate, value, regex=self.regex, fuzzy=self.fuzzy)
|
||||
if self.predicate not in self.operators:
|
||||
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
|
||||
|
||||
|
@ -977,7 +982,7 @@ class _ComparisonPredicate:
|
|||
self.value = value
|
||||
self.predicate = predicate
|
||||
self.is_extension = is_extension
|
||||
self.key = (self.attr, self.predicate, srsly.json_dumps(value, sort_keys=True))
|
||||
self.key = _predicate_cache_key(self.attr, self.predicate, value)
|
||||
if self.predicate not in self.operators:
|
||||
raise ValueError(Errors.E126.format(good=self.operators, bad=self.predicate))
|
||||
|
||||
|
@ -1092,7 +1097,7 @@ def _get_extension_extra_predicates(spec, extra_predicates, predicate_types,
|
|||
if isinstance(value, dict):
|
||||
for type_, cls in predicate_types.items():
|
||||
if type_ in value:
|
||||
key = (attr, type_, srsly.json_dumps(value[type_], sort_keys=True))
|
||||
key = _predicate_cache_key(attr, type_, value[type_])
|
||||
if key in seen_predicates:
|
||||
output.append(seen_predicates[key])
|
||||
else:
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Tuple, Callable
|
||||
from typing import List, Tuple, Callable
|
||||
from thinc.api import Model, to_numpy
|
||||
from thinc.types import Ragged, Ints1d
|
||||
|
||||
|
@ -52,14 +52,14 @@ def _get_span_indices(ops, spans: Ragged, lengths: Ints1d) -> Ints1d:
|
|||
indices will be [5, 6, 7, 8, 8, 9].
|
||||
"""
|
||||
spans, lengths = _ensure_cpu(spans, lengths)
|
||||
indices = []
|
||||
indices: List[int] = []
|
||||
offset = 0
|
||||
for i, length in enumerate(lengths):
|
||||
spans_i = spans[i].dataXd + offset
|
||||
for j in range(spans_i.shape[0]):
|
||||
indices.append(ops.xp.arange(spans_i[j, 0], spans_i[j, 1])) # type: ignore[call-overload, index]
|
||||
indices.extend(range(spans_i[j, 0], spans_i[j, 1])) # type: ignore[arg-type, call-overload]
|
||||
offset += length
|
||||
return ops.flatten(indices, dtype="i", ndim_if_empty=1)
|
||||
return ops.asarray1i(indices)
|
||||
|
||||
|
||||
def _ensure_cpu(spans: Ragged, lengths: Ints1d) -> Tuple[Ragged, Ints1d]:
|
||||
|
|
|
@ -89,6 +89,14 @@ def load_kb(
|
|||
return kb_from_file
|
||||
|
||||
|
||||
@registry.misc("spacy.EmptyKB.v2")
|
||||
def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]:
|
||||
def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
|
||||
return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length)
|
||||
|
||||
return empty_kb_factory
|
||||
|
||||
|
||||
@registry.misc("spacy.EmptyKB.v1")
|
||||
def empty_kb(
|
||||
entity_vector_length: int,
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from typing import Any, Optional, Iterable, Tuple, List, Callable, TYPE_CHECKING, cast
|
||||
from thinc.types import Floats2d
|
||||
from thinc.types import Floats2d, Ints1d
|
||||
from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
|
||||
from thinc.api import MultiSoftmax, list2array
|
||||
from thinc.api import to_categorical, CosineDistance, L2Distance
|
||||
|
@ -7,7 +7,8 @@ from thinc.loss import Loss
|
|||
|
||||
from ...util import registry, OOV_RANK
|
||||
from ...errors import Errors
|
||||
from ...attrs import ID
|
||||
from ...attrs import ID, ORTH
|
||||
from ...vectors import Mode as VectorsMode
|
||||
|
||||
import numpy
|
||||
from functools import partial
|
||||
|
@ -67,14 +68,23 @@ def get_vectors_loss(ops, docs, prediction, distance):
|
|||
"""Compute a loss based on a distance between the documents' vectors and
|
||||
the prediction.
|
||||
"""
|
||||
# The simplest way to implement this would be to vstack the
|
||||
# token.vector values, but that's a bit inefficient, especially on GPU.
|
||||
# Instead we fetch the index into the vectors table for each of our tokens,
|
||||
# and look them up all at once. This prevents data copying.
|
||||
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
|
||||
target = docs[0].vocab.vectors.data[ids]
|
||||
target[ids == OOV_RANK] = 0
|
||||
d_target, loss = distance(prediction, target)
|
||||
vocab = docs[0].vocab
|
||||
if vocab.vectors.mode == VectorsMode.default:
|
||||
# The simplest way to implement this would be to vstack the
|
||||
# token.vector values, but that's a bit inefficient, especially on GPU.
|
||||
# Instead we fetch the index into the vectors table for each of our
|
||||
# tokens, and look them up all at once. This prevents data copying.
|
||||
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
|
||||
target = docs[0].vocab.vectors.data[ids]
|
||||
target[ids == OOV_RANK] = 0
|
||||
d_target, loss = distance(prediction, target)
|
||||
elif vocab.vectors.mode == VectorsMode.floret:
|
||||
keys = ops.flatten([cast(Ints1d, doc.to_array(ORTH)) for doc in docs])
|
||||
target = vocab.vectors.get_batch(keys)
|
||||
target = ops.as_contig(target)
|
||||
d_target, loss = distance(prediction, target)
|
||||
else:
|
||||
raise ValueError(Errors.E850.format(mode=vocab.vectors.mode))
|
||||
return loss, d_target
|
||||
|
||||
|
||||
|
|
|
@ -54,6 +54,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"]
|
|||
"entity_vector_length": 64,
|
||||
"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"},
|
||||
"get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"},
|
||||
"generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"},
|
||||
"overwrite": True,
|
||||
"scorer": {"@scorers": "spacy.entity_linker_scorer.v1"},
|
||||
"use_gold_ents": True,
|
||||
|
@ -80,6 +81,7 @@ def make_entity_linker(
|
|||
get_candidates_batch: Callable[
|
||||
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
||||
],
|
||||
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
|
||||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
use_gold_ents: bool,
|
||||
|
@ -101,6 +103,7 @@ def make_entity_linker(
|
|||
get_candidates_batch (
|
||||
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
|
||||
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
|
||||
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
|
||||
scorer (Optional[Callable]): The scoring method.
|
||||
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
|
||||
component must provide entity annotations.
|
||||
|
@ -135,6 +138,7 @@ def make_entity_linker(
|
|||
entity_vector_length=entity_vector_length,
|
||||
get_candidates=get_candidates,
|
||||
get_candidates_batch=get_candidates_batch,
|
||||
generate_empty_kb=generate_empty_kb,
|
||||
overwrite=overwrite,
|
||||
scorer=scorer,
|
||||
use_gold_ents=use_gold_ents,
|
||||
|
@ -175,6 +179,7 @@ class EntityLinker(TrainablePipe):
|
|||
get_candidates_batch: Callable[
|
||||
[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
|
||||
],
|
||||
generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
|
||||
overwrite: bool = BACKWARD_OVERWRITE,
|
||||
scorer: Optional[Callable] = entity_linker_score,
|
||||
use_gold_ents: bool,
|
||||
|
@ -198,6 +203,7 @@ class EntityLinker(TrainablePipe):
|
|||
Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
|
||||
Iterable[Candidate]]
|
||||
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
|
||||
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
|
||||
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
|
||||
component must provide entity annotations.
|
||||
|
@ -220,6 +226,7 @@ class EntityLinker(TrainablePipe):
|
|||
self.model = model
|
||||
self.name = name
|
||||
self.labels_discard = list(labels_discard)
|
||||
# how many neighbour sentences to take into account
|
||||
self.n_sents = n_sents
|
||||
self.incl_prior = incl_prior
|
||||
self.incl_context = incl_context
|
||||
|
@ -227,9 +234,7 @@ class EntityLinker(TrainablePipe):
|
|||
self.get_candidates_batch = get_candidates_batch
|
||||
self.cfg: Dict[str, Any] = {"overwrite": overwrite}
|
||||
self.distance = CosineDistance(normalize=False)
|
||||
# how many neighbour sentences to take into account
|
||||
# create an empty KB by default
|
||||
self.kb = empty_kb(entity_vector_length)(self.vocab)
|
||||
self.kb = generate_empty_kb(self.vocab, entity_vector_length)
|
||||
self.scorer = scorer
|
||||
self.use_gold_ents = use_gold_ents
|
||||
self.candidates_batch_size = candidates_batch_size
|
||||
|
@ -250,7 +255,7 @@ class EntityLinker(TrainablePipe):
|
|||
# Raise an error if the knowledge base is not initialized.
|
||||
if self.kb is None:
|
||||
raise ValueError(Errors.E1018.format(name=self.name))
|
||||
if len(self.kb) == 0:
|
||||
if hasattr(self.kb, "is_empty") and self.kb.is_empty():
|
||||
raise ValueError(Errors.E139.format(name=self.name))
|
||||
|
||||
def initialize(
|
||||
|
@ -469,18 +474,24 @@ class EntityLinker(TrainablePipe):
|
|||
|
||||
# Looping through each entity in batch (TODO: rewrite)
|
||||
for j, ent in enumerate(ent_batch):
|
||||
sent_index = sentences.index(ent.sent)
|
||||
assert sent_index >= 0
|
||||
assert hasattr(ent, "sents")
|
||||
sents = list(ent.sents)
|
||||
sent_indices = (
|
||||
sentences.index(sents[0]),
|
||||
sentences.index(sents[-1]),
|
||||
)
|
||||
assert sent_indices[1] >= sent_indices[0] >= 0
|
||||
|
||||
if self.incl_context:
|
||||
# get n_neighbour sentences, clipped to the length of the document
|
||||
start_sentence = max(0, sent_index - self.n_sents)
|
||||
start_sentence = max(0, sent_indices[0] - self.n_sents)
|
||||
end_sentence = min(
|
||||
len(sentences) - 1, sent_index + self.n_sents
|
||||
len(sentences) - 1, sent_indices[1] + self.n_sents
|
||||
)
|
||||
start_token = sentences[start_sentence].start
|
||||
end_token = sentences[end_sentence].end
|
||||
sent_doc = doc[start_token:end_token].as_doc()
|
||||
|
||||
# currently, the context is the same for each entity in a sentence (should be refined)
|
||||
sentence_encoding = self.model.predict([sent_doc])[0]
|
||||
sentence_encoding_t = sentence_encoding.T
|
||||
|
|
|
@ -52,7 +52,8 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
|
|||
@Language.factory(
|
||||
"morphologizer",
|
||||
assigns=["token.morph", "token.pos"],
|
||||
default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False, "scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}},
|
||||
default_config={"model": DEFAULT_MORPH_MODEL, "overwrite": True, "extend": False,
|
||||
"scorer": {"@scorers": "spacy.morphologizer_scorer.v1"}, "label_smoothing": 0.0},
|
||||
default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
|
||||
)
|
||||
def make_morphologizer(
|
||||
|
@ -61,9 +62,10 @@ def make_morphologizer(
|
|||
name: str,
|
||||
overwrite: bool,
|
||||
extend: bool,
|
||||
label_smoothing: float,
|
||||
scorer: Optional[Callable],
|
||||
):
|
||||
return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, scorer=scorer)
|
||||
return Morphologizer(nlp.vocab, model, name, overwrite=overwrite, extend=extend, label_smoothing=label_smoothing, scorer=scorer)
|
||||
|
||||
|
||||
def morphologizer_score(examples, **kwargs):
|
||||
|
@ -94,6 +96,7 @@ class Morphologizer(Tagger):
|
|||
*,
|
||||
overwrite: bool = BACKWARD_OVERWRITE,
|
||||
extend: bool = BACKWARD_EXTEND,
|
||||
label_smoothing: float = 0.0,
|
||||
scorer: Optional[Callable] = morphologizer_score,
|
||||
):
|
||||
"""Initialize a morphologizer.
|
||||
|
@ -121,6 +124,7 @@ class Morphologizer(Tagger):
|
|||
"labels_pos": {},
|
||||
"overwrite": overwrite,
|
||||
"extend": extend,
|
||||
"label_smoothing": label_smoothing,
|
||||
}
|
||||
self.cfg = dict(sorted(cfg.items()))
|
||||
self.scorer = scorer
|
||||
|
@ -270,7 +274,8 @@ class Morphologizer(Tagger):
|
|||
DOCS: https://spacy.io/api/morphologizer#get_loss
|
||||
"""
|
||||
validate_examples(examples, "Morphologizer.get_loss")
|
||||
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
|
||||
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False,
|
||||
label_smoothing=self.cfg["label_smoothing"])
|
||||
truths = []
|
||||
for eg in examples:
|
||||
eg_truths = []
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any
|
||||
from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast, Union
|
||||
from dataclasses import dataclass
|
||||
from functools import partial
|
||||
from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
|
||||
from thinc.api import Optimizer
|
||||
from thinc.types import Ragged, Ints2d, Floats2d
|
||||
|
@ -43,7 +45,36 @@ maxout_pieces = 3
|
|||
depth = 4
|
||||
"""
|
||||
|
||||
spancat_singlelabel_default_config = """
|
||||
[model]
|
||||
@architectures = "spacy.SpanCategorizer.v1"
|
||||
scorer = {"@layers": "Softmax.v2"}
|
||||
|
||||
[model.reducer]
|
||||
@layers = spacy.mean_max_reducer.v1
|
||||
hidden_size = 128
|
||||
|
||||
[model.tok2vec]
|
||||
@architectures = "spacy.Tok2Vec.v2"
|
||||
[model.tok2vec.embed]
|
||||
@architectures = "spacy.MultiHashEmbed.v1"
|
||||
width = 96
|
||||
rows = [5000, 1000, 2500, 1000]
|
||||
attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
|
||||
include_static_vectors = false
|
||||
|
||||
[model.tok2vec.encode]
|
||||
@architectures = "spacy.MaxoutWindowEncoder.v2"
|
||||
width = ${model.tok2vec.embed.width}
|
||||
window_size = 1
|
||||
maxout_pieces = 3
|
||||
depth = 4
|
||||
"""
|
||||
|
||||
DEFAULT_SPANCAT_MODEL = Config().from_str(spancat_default_config)["model"]
|
||||
DEFAULT_SPANCAT_SINGLELABEL_MODEL = Config().from_str(
|
||||
spancat_singlelabel_default_config
|
||||
)["model"]
|
||||
|
||||
|
||||
@runtime_checkable
|
||||
|
@ -52,39 +83,42 @@ class Suggester(Protocol):
|
|||
...
|
||||
|
||||
|
||||
def ngram_suggester(
|
||||
docs: Iterable[Doc], sizes: List[int], *, ops: Optional[Ops] = None
|
||||
) -> Ragged:
|
||||
if ops is None:
|
||||
ops = get_current_ops()
|
||||
spans = []
|
||||
lengths = []
|
||||
for doc in docs:
|
||||
starts = ops.xp.arange(len(doc), dtype="i")
|
||||
starts = starts.reshape((-1, 1))
|
||||
length = 0
|
||||
for size in sizes:
|
||||
if size <= len(doc):
|
||||
starts_size = starts[: len(doc) - (size - 1)]
|
||||
spans.append(ops.xp.hstack((starts_size, starts_size + size)))
|
||||
length += spans[-1].shape[0]
|
||||
if spans:
|
||||
assert spans[-1].ndim == 2, spans[-1].shape
|
||||
lengths.append(length)
|
||||
lengths_array = ops.asarray1i(lengths)
|
||||
if len(spans) > 0:
|
||||
output = Ragged(ops.xp.vstack(spans), lengths_array)
|
||||
else:
|
||||
output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)
|
||||
|
||||
assert output.dataXd.ndim == 2
|
||||
return output
|
||||
|
||||
|
||||
@registry.misc("spacy.ngram_suggester.v1")
|
||||
def build_ngram_suggester(sizes: List[int]) -> Suggester:
|
||||
"""Suggest all spans of the given lengths. Spans are returned as a ragged
|
||||
array of integers. The array has two columns, indicating the start and end
|
||||
position."""
|
||||
|
||||
def ngram_suggester(docs: Iterable[Doc], *, ops: Optional[Ops] = None) -> Ragged:
|
||||
if ops is None:
|
||||
ops = get_current_ops()
|
||||
spans = []
|
||||
lengths = []
|
||||
for doc in docs:
|
||||
starts = ops.xp.arange(len(doc), dtype="i")
|
||||
starts = starts.reshape((-1, 1))
|
||||
length = 0
|
||||
for size in sizes:
|
||||
if size <= len(doc):
|
||||
starts_size = starts[: len(doc) - (size - 1)]
|
||||
spans.append(ops.xp.hstack((starts_size, starts_size + size)))
|
||||
length += spans[-1].shape[0]
|
||||
if spans:
|
||||
assert spans[-1].ndim == 2, spans[-1].shape
|
||||
lengths.append(length)
|
||||
lengths_array = ops.asarray1i(lengths)
|
||||
if len(spans) > 0:
|
||||
output = Ragged(ops.xp.vstack(spans), lengths_array)
|
||||
else:
|
||||
output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)
|
||||
|
||||
assert output.dataXd.ndim == 2
|
||||
return output
|
||||
|
||||
return ngram_suggester
|
||||
return partial(ngram_suggester, sizes=sizes)
|
||||
|
||||
|
||||
@registry.misc("spacy.ngram_range_suggester.v1")
|
||||
|
@ -119,10 +153,14 @@ def make_spancat(
|
|||
threshold: float,
|
||||
max_positive: Optional[int],
|
||||
) -> "SpanCategorizer":
|
||||
"""Create a SpanCategorizer component. The span categorizer consists of two
|
||||
"""Create a SpanCategorizer component and configure it for multi-label
|
||||
classification to be able to assign multiple labels for each span.
|
||||
The span categorizer consists of two
|
||||
parts: a suggester function that proposes candidate spans, and a labeller
|
||||
model that predicts one or more labels for each span.
|
||||
|
||||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans.
|
||||
Spans are returned as a ragged array with two integer columns, for the
|
||||
start and end positions.
|
||||
|
@ -144,12 +182,80 @@ def make_spancat(
|
|||
"""
|
||||
return SpanCategorizer(
|
||||
nlp.vocab,
|
||||
suggester=suggester,
|
||||
model=model,
|
||||
spans_key=spans_key,
|
||||
threshold=threshold,
|
||||
max_positive=max_positive,
|
||||
suggester=suggester,
|
||||
name=name,
|
||||
spans_key=spans_key,
|
||||
negative_weight=None,
|
||||
allow_overlap=True,
|
||||
max_positive=max_positive,
|
||||
threshold=threshold,
|
||||
scorer=scorer,
|
||||
add_negative_label=False,
|
||||
)
|
||||
|
||||
|
||||
@Language.factory(
|
||||
"spancat_singlelabel",
|
||||
assigns=["doc.spans"],
|
||||
default_config={
|
||||
"spans_key": "sc",
|
||||
"model": DEFAULT_SPANCAT_SINGLELABEL_MODEL,
|
||||
"negative_weight": 1.0,
|
||||
"suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
|
||||
"scorer": {"@scorers": "spacy.spancat_scorer.v1"},
|
||||
"allow_overlap": True,
|
||||
},
|
||||
default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0},
|
||||
)
|
||||
def make_spancat_singlelabel(
|
||||
nlp: Language,
|
||||
name: str,
|
||||
suggester: Suggester,
|
||||
model: Model[Tuple[List[Doc], Ragged], Floats2d],
|
||||
spans_key: str,
|
||||
negative_weight: float,
|
||||
allow_overlap: bool,
|
||||
scorer: Optional[Callable],
|
||||
) -> "SpanCategorizer":
|
||||
"""Create a SpanCategorizer component and configure it for multi-class
|
||||
classification. With this configuration each span can get at most one
|
||||
label. The span categorizer consists of two
|
||||
parts: a suggester function that proposes candidate spans, and a labeller
|
||||
model that predicts one or more labels for each span.
|
||||
|
||||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans.
|
||||
Spans are returned as a ragged array with two integer columns, for the
|
||||
start and end positions.
|
||||
model (Model[Tuple[List[Doc], Ragged], Floats2d]): A model instance that
|
||||
is given a list of documents and (start, end) indices representing
|
||||
candidate span offsets. The model predicts a probability for each category
|
||||
for each span.
|
||||
spans_key (str): Key of the doc.spans dict to save the spans under. During
|
||||
initialization and training, the component will look for spans on the
|
||||
reference document under the same key.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_spans for the Doc.spans[spans_key] with overlapping
|
||||
spans allowed.
|
||||
negative_weight (float): Multiplier for the loss terms.
|
||||
Can be used to downweight the negative samples if there are too many.
|
||||
allow_overlap (bool): If True the data is assumed to contain overlapping spans.
|
||||
Otherwise it produces non-overlapping spans greedily prioritizing
|
||||
higher assigned label scores.
|
||||
"""
|
||||
return SpanCategorizer(
|
||||
nlp.vocab,
|
||||
model=model,
|
||||
suggester=suggester,
|
||||
name=name,
|
||||
spans_key=spans_key,
|
||||
negative_weight=negative_weight,
|
||||
allow_overlap=allow_overlap,
|
||||
max_positive=1,
|
||||
add_negative_label=True,
|
||||
threshold=None,
|
||||
scorer=scorer,
|
||||
)
|
||||
|
||||
|
@ -172,6 +278,27 @@ def make_spancat_scorer():
|
|||
return spancat_score
|
||||
|
||||
|
||||
@dataclass
|
||||
class _Intervals:
|
||||
"""
|
||||
Helper class to avoid storing overlapping spans.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self.ranges = set()
|
||||
|
||||
def add(self, i, j):
|
||||
for e in range(i, j):
|
||||
self.ranges.add(e)
|
||||
|
||||
def __contains__(self, rang):
|
||||
i, j = rang
|
||||
for e in range(i, j):
|
||||
if e in self.ranges:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
class SpanCategorizer(TrainablePipe):
|
||||
"""Pipeline component to label spans of text.
|
||||
|
||||
|
@ -185,25 +312,43 @@ class SpanCategorizer(TrainablePipe):
|
|||
suggester: Suggester,
|
||||
name: str = "spancat",
|
||||
*,
|
||||
add_negative_label: bool = False,
|
||||
spans_key: str = "spans",
|
||||
threshold: float = 0.5,
|
||||
negative_weight: Optional[float] = 1.0,
|
||||
allow_overlap: Optional[bool] = True,
|
||||
max_positive: Optional[int] = None,
|
||||
threshold: Optional[float] = 0.5,
|
||||
scorer: Optional[Callable] = spancat_score,
|
||||
) -> None:
|
||||
"""Initialize the span categorizer.
|
||||
"""Initialize the multi-label or multi-class span categorizer.
|
||||
|
||||
vocab (Vocab): The shared vocabulary.
|
||||
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
||||
For multi-class classification (single label per span) we recommend
|
||||
using a Softmax classifier as a the final layer, while for multi-label
|
||||
classification (multiple possible labels per span) we recommend Logistic.
|
||||
suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans.
|
||||
Spans are returned as a ragged array with two integer columns, for the
|
||||
start and end positions.
|
||||
name (str): The component instance name, used to add entries to the
|
||||
losses during training.
|
||||
spans_key (str): Key of the Doc.spans dict to save the spans under.
|
||||
During initialization and training, the component will look for
|
||||
spans on the reference document under the same key. Defaults to
|
||||
`"spans"`.
|
||||
threshold (float): Minimum probability to consider a prediction
|
||||
positive. Spans with a positive prediction will be saved on the Doc.
|
||||
Defaults to 0.5.
|
||||
add_negative_label (bool): Learn to predict a special 'negative_label'
|
||||
when a Span is not annotated.
|
||||
threshold (Optional[float]): Minimum probability to consider a prediction
|
||||
positive. Defaults to 0.5. Spans with a positive prediction will be saved
|
||||
on the Doc.
|
||||
max_positive (Optional[int]): Maximum number of labels to consider
|
||||
positive per span. Defaults to None, indicating no limit.
|
||||
negative_weight (float): Multiplier for the loss terms.
|
||||
Can be used to downweight the negative samples if there are too many
|
||||
when add_negative_label is True. Otherwise its unused.
|
||||
allow_overlap (bool): If True the data is assumed to contain overlapping spans.
|
||||
Otherwise it produces non-overlapping spans greedily prioritizing
|
||||
higher assigned label scores. Only used when max_positive is 1.
|
||||
scorer (Optional[Callable]): The scoring method. Defaults to
|
||||
Scorer.score_spans for the Doc.spans[spans_key] with overlapping
|
||||
spans allowed.
|
||||
|
@ -215,12 +360,17 @@ class SpanCategorizer(TrainablePipe):
|
|||
"spans_key": spans_key,
|
||||
"threshold": threshold,
|
||||
"max_positive": max_positive,
|
||||
"negative_weight": negative_weight,
|
||||
"allow_overlap": allow_overlap,
|
||||
}
|
||||
self.vocab = vocab
|
||||
self.suggester = suggester
|
||||
self.model = model
|
||||
self.name = name
|
||||
self.scorer = scorer
|
||||
self.add_negative_label = add_negative_label
|
||||
if not allow_overlap and max_positive is not None and max_positive > 1:
|
||||
raise ValueError(Errors.E1051.format(max_positive=max_positive))
|
||||
|
||||
@property
|
||||
def key(self) -> str:
|
||||
|
@ -230,6 +380,21 @@ class SpanCategorizer(TrainablePipe):
|
|||
"""
|
||||
return str(self.cfg["spans_key"])
|
||||
|
||||
def _allow_extra_label(self) -> None:
|
||||
"""Raise an error if the component can not add any more labels."""
|
||||
nO = None
|
||||
if self.model.has_dim("nO"):
|
||||
nO = self.model.get_dim("nO")
|
||||
elif self.model.has_ref("output_layer") and self.model.get_ref(
|
||||
"output_layer"
|
||||
).has_dim("nO"):
|
||||
nO = self.model.get_ref("output_layer").get_dim("nO")
|
||||
if nO is not None and nO == self._n_labels:
|
||||
if not self.is_resizable:
|
||||
raise ValueError(
|
||||
Errors.E922.format(name=self.name, nO=self.model.get_dim("nO"))
|
||||
)
|
||||
|
||||
def add_label(self, label: str) -> int:
|
||||
"""Add a new label to the pipe.
|
||||
|
||||
|
@ -263,6 +428,27 @@ class SpanCategorizer(TrainablePipe):
|
|||
"""
|
||||
return list(self.labels)
|
||||
|
||||
@property
|
||||
def _label_map(self) -> Dict[str, int]:
|
||||
"""RETURNS (Dict[str, int]): The label map."""
|
||||
return {label: i for i, label in enumerate(self.labels)}
|
||||
|
||||
@property
|
||||
def _n_labels(self) -> int:
|
||||
"""RETURNS (int): Number of labels."""
|
||||
if self.add_negative_label:
|
||||
return len(self.labels) + 1
|
||||
else:
|
||||
return len(self.labels)
|
||||
|
||||
@property
|
||||
def _negative_label_i(self) -> Union[int, None]:
|
||||
"""RETURNS (Union[int, None]): Index of the negative label."""
|
||||
if self.add_negative_label:
|
||||
return len(self.label_data)
|
||||
else:
|
||||
return None
|
||||
|
||||
def predict(self, docs: Iterable[Doc]):
|
||||
"""Apply the pipeline's model to a batch of docs, without modifying them.
|
||||
|
||||
|
@ -304,14 +490,24 @@ class SpanCategorizer(TrainablePipe):
|
|||
|
||||
DOCS: https://spacy.io/api/spancategorizer#set_annotations
|
||||
"""
|
||||
labels = self.labels
|
||||
indices, scores = indices_scores
|
||||
offset = 0
|
||||
for i, doc in enumerate(docs):
|
||||
indices_i = indices[i].dataXd
|
||||
doc.spans[self.key] = self._make_span_group(
|
||||
doc, indices_i, scores[offset : offset + indices.lengths[i]], labels # type: ignore[arg-type]
|
||||
)
|
||||
allow_overlap = cast(bool, self.cfg["allow_overlap"])
|
||||
if self.cfg["max_positive"] == 1:
|
||||
doc.spans[self.key] = self._make_span_group_singlelabel(
|
||||
doc,
|
||||
indices_i,
|
||||
scores[offset : offset + indices.lengths[i]],
|
||||
allow_overlap,
|
||||
)
|
||||
else:
|
||||
doc.spans[self.key] = self._make_span_group_multilabel(
|
||||
doc,
|
||||
indices_i,
|
||||
scores[offset : offset + indices.lengths[i]],
|
||||
)
|
||||
offset += indices.lengths[i]
|
||||
|
||||
def update(
|
||||
|
@ -371,9 +567,11 @@ class SpanCategorizer(TrainablePipe):
|
|||
spans = Ragged(
|
||||
self.model.ops.to_numpy(spans.data), self.model.ops.to_numpy(spans.lengths)
|
||||
)
|
||||
label_map = {label: i for i, label in enumerate(self.labels)}
|
||||
target = numpy.zeros(scores.shape, dtype=scores.dtype)
|
||||
if self.add_negative_label:
|
||||
negative_spans = numpy.ones((scores.shape[0]))
|
||||
offset = 0
|
||||
label_map = self._label_map
|
||||
for i, eg in enumerate(examples):
|
||||
# Map (start, end) offset of spans to the row in the d_scores array,
|
||||
# so that we can adjust the gradient for predictions that were
|
||||
|
@ -390,10 +588,16 @@ class SpanCategorizer(TrainablePipe):
|
|||
row = spans_index[key]
|
||||
k = label_map[gold_span.label_]
|
||||
target[row, k] = 1.0
|
||||
if self.add_negative_label:
|
||||
# delete negative label target.
|
||||
negative_spans[row] = 0.0
|
||||
# The target is a flat array for all docs. Track the position
|
||||
# we're at within the flat array.
|
||||
offset += spans.lengths[i]
|
||||
target = self.model.ops.asarray(target, dtype="f") # type: ignore
|
||||
if self.add_negative_label:
|
||||
negative_samples = numpy.nonzero(negative_spans)[0]
|
||||
target[negative_samples, self._negative_label_i] = 1.0 # type: ignore
|
||||
# The target will have the values 0 (for untrue predictions) or 1
|
||||
# (for true predictions).
|
||||
# The scores should be in the range [0, 1].
|
||||
|
@ -402,6 +606,10 @@ class SpanCategorizer(TrainablePipe):
|
|||
# If the prediction is 0.9 and it's false, the gradient will be
|
||||
# 0.9 (0.9 - 0.0)
|
||||
d_scores = scores - target
|
||||
if self.add_negative_label:
|
||||
neg_weight = cast(float, self.cfg["negative_weight"])
|
||||
if neg_weight != 1.0:
|
||||
d_scores[negative_samples] *= neg_weight
|
||||
loss = float((d_scores**2).sum())
|
||||
return loss, d_scores
|
||||
|
||||
|
@ -438,7 +646,7 @@ class SpanCategorizer(TrainablePipe):
|
|||
if subbatch:
|
||||
docs = [eg.x for eg in subbatch]
|
||||
spans = build_ngram_suggester(sizes=[1])(docs)
|
||||
Y = self.model.ops.alloc2f(spans.dataXd.shape[0], len(self.labels))
|
||||
Y = self.model.ops.alloc2f(spans.dataXd.shape[0], self._n_labels)
|
||||
self.model.initialize(X=(docs, spans), Y=Y)
|
||||
else:
|
||||
self.model.initialize()
|
||||
|
@ -452,31 +660,98 @@ class SpanCategorizer(TrainablePipe):
|
|||
eg.reference.spans.get(self.key, []), allow_overlap=True
|
||||
)
|
||||
|
||||
def _make_span_group(
|
||||
self, doc: Doc, indices: Ints2d, scores: Floats2d, labels: List[str]
|
||||
def _make_span_group_multilabel(
|
||||
self,
|
||||
doc: Doc,
|
||||
indices: Ints2d,
|
||||
scores: Floats2d,
|
||||
) -> SpanGroup:
|
||||
"""Find the top-k labels for each span (k=max_positive)."""
|
||||
spans = SpanGroup(doc, name=self.key)
|
||||
max_positive = self.cfg["max_positive"]
|
||||
if scores.size == 0:
|
||||
return spans
|
||||
scores = self.model.ops.to_numpy(scores)
|
||||
indices = self.model.ops.to_numpy(indices)
|
||||
threshold = self.cfg["threshold"]
|
||||
max_positive = self.cfg["max_positive"]
|
||||
|
||||
keeps = scores >= threshold
|
||||
ranked = (scores * -1).argsort() # type: ignore
|
||||
if max_positive is not None:
|
||||
assert isinstance(max_positive, int)
|
||||
if self.add_negative_label:
|
||||
negative_scores = numpy.copy(scores[:, self._negative_label_i])
|
||||
scores[:, self._negative_label_i] = -numpy.inf
|
||||
ranked = (scores * -1).argsort() # type: ignore
|
||||
scores[:, self._negative_label_i] = negative_scores
|
||||
else:
|
||||
ranked = (scores * -1).argsort() # type: ignore
|
||||
span_filter = ranked[:, max_positive:]
|
||||
for i, row in enumerate(span_filter):
|
||||
keeps[i, row] = False
|
||||
spans.attrs["scores"] = scores[keeps].flatten()
|
||||
|
||||
indices = self.model.ops.to_numpy(indices)
|
||||
keeps = self.model.ops.to_numpy(keeps)
|
||||
|
||||
attrs_scores = []
|
||||
for i in range(indices.shape[0]):
|
||||
start = indices[i, 0]
|
||||
end = indices[i, 1]
|
||||
|
||||
for j, keep in enumerate(keeps[i]):
|
||||
if keep:
|
||||
spans.append(Span(doc, start, end, label=labels[j]))
|
||||
|
||||
if j != self._negative_label_i:
|
||||
spans.append(Span(doc, start, end, label=self.labels[j]))
|
||||
attrs_scores.append(scores[i, j])
|
||||
spans.attrs["scores"] = numpy.array(attrs_scores)
|
||||
return spans
|
||||
|
||||
def _make_span_group_singlelabel(
|
||||
self,
|
||||
doc: Doc,
|
||||
indices: Ints2d,
|
||||
scores: Floats2d,
|
||||
allow_overlap: bool = True,
|
||||
) -> SpanGroup:
|
||||
"""Find the argmax label for each span."""
|
||||
# Handle cases when there are zero suggestions
|
||||
if scores.size == 0:
|
||||
return SpanGroup(doc, name=self.key)
|
||||
scores = self.model.ops.to_numpy(scores)
|
||||
indices = self.model.ops.to_numpy(indices)
|
||||
predicted = scores.argmax(axis=1)
|
||||
argmax_scores = numpy.take_along_axis(
|
||||
scores, numpy.expand_dims(predicted, 1), axis=1
|
||||
)
|
||||
keeps = numpy.ones(predicted.shape, dtype=bool)
|
||||
# Remove samples where the negative label is the argmax.
|
||||
if self.add_negative_label:
|
||||
keeps = numpy.logical_and(keeps, predicted != self._negative_label_i)
|
||||
# Filter samples according to threshold.
|
||||
threshold = self.cfg["threshold"]
|
||||
if threshold is not None:
|
||||
keeps = numpy.logical_and(keeps, (argmax_scores >= threshold).squeeze())
|
||||
# Sort spans according to argmax probability
|
||||
if not allow_overlap:
|
||||
# Get the probabilities
|
||||
sort_idx = (argmax_scores.squeeze() * -1).argsort()
|
||||
argmax_scores = argmax_scores[sort_idx]
|
||||
predicted = predicted[sort_idx]
|
||||
indices = indices[sort_idx]
|
||||
keeps = keeps[sort_idx]
|
||||
seen = _Intervals()
|
||||
spans = SpanGroup(doc, name=self.key)
|
||||
attrs_scores = []
|
||||
for i in range(indices.shape[0]):
|
||||
if not keeps[i]:
|
||||
continue
|
||||
|
||||
label = predicted[i]
|
||||
start = indices[i, 0]
|
||||
end = indices[i, 1]
|
||||
|
||||
if not allow_overlap:
|
||||
if (start, end) in seen:
|
||||
continue
|
||||
else:
|
||||
seen.add(start, end)
|
||||
attrs_scores.append(argmax_scores[i])
|
||||
spans.append(Span(doc, start, end, label=self.labels[label]))
|
||||
|
||||
spans.attrs["scores"] = numpy.array(attrs_scores)
|
||||
return spans
|
||||
|
|
|
@ -45,7 +45,7 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
|
|||
@Language.factory(
|
||||
"tagger",
|
||||
assigns=["token.tag"],
|
||||
default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, "neg_prefix": "!"},
|
||||
default_config={"model": DEFAULT_TAGGER_MODEL, "overwrite": False, "scorer": {"@scorers": "spacy.tagger_scorer.v1"}, "neg_prefix": "!", "label_smoothing": 0.0},
|
||||
default_score_weights={"tag_acc": 1.0},
|
||||
)
|
||||
def make_tagger(
|
||||
|
@ -55,6 +55,7 @@ def make_tagger(
|
|||
overwrite: bool,
|
||||
scorer: Optional[Callable],
|
||||
neg_prefix: str,
|
||||
label_smoothing: float,
|
||||
):
|
||||
"""Construct a part-of-speech tagger component.
|
||||
|
||||
|
@ -63,7 +64,7 @@ def make_tagger(
|
|||
in size, and be normalized as probabilities (all scores between 0 and 1,
|
||||
with the rows summing to 1).
|
||||
"""
|
||||
return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix)
|
||||
return Tagger(nlp.vocab, model, name, overwrite=overwrite, scorer=scorer, neg_prefix=neg_prefix, label_smoothing=label_smoothing)
|
||||
|
||||
|
||||
def tagger_score(examples, **kwargs):
|
||||
|
@ -89,6 +90,7 @@ class Tagger(TrainablePipe):
|
|||
overwrite=BACKWARD_OVERWRITE,
|
||||
scorer=tagger_score,
|
||||
neg_prefix="!",
|
||||
label_smoothing=0.0,
|
||||
):
|
||||
"""Initialize a part-of-speech tagger.
|
||||
|
||||
|
@ -105,7 +107,7 @@ class Tagger(TrainablePipe):
|
|||
self.model = model
|
||||
self.name = name
|
||||
self._rehearsal_model = None
|
||||
cfg = {"labels": [], "overwrite": overwrite, "neg_prefix": neg_prefix}
|
||||
cfg = {"labels": [], "overwrite": overwrite, "neg_prefix": neg_prefix, "label_smoothing": label_smoothing}
|
||||
self.cfg = dict(sorted(cfg.items()))
|
||||
self.scorer = scorer
|
||||
|
||||
|
@ -256,7 +258,7 @@ class Tagger(TrainablePipe):
|
|||
DOCS: https://spacy.io/api/tagger#get_loss
|
||||
"""
|
||||
validate_examples(examples, "Tagger.get_loss")
|
||||
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"])
|
||||
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False, neg_prefix=self.cfg["neg_prefix"], label_smoothing=self.cfg["label_smoothing"])
|
||||
# Convert empty tag "" to missing value None so that both misaligned
|
||||
# tokens and tokens with missing annotation have the default missing
|
||||
# value None.
|
||||
|
|
|
@ -121,20 +121,30 @@ class Scorer:
|
|||
nlp.add_pipe(pipe)
|
||||
self.nlp = nlp
|
||||
|
||||
def score(self, examples: Iterable[Example]) -> Dict[str, Any]:
|
||||
def score(
|
||||
self, examples: Iterable[Example], *, per_component: bool = False
|
||||
) -> Dict[str, Any]:
|
||||
"""Evaluate a list of Examples.
|
||||
|
||||
examples (Iterable[Example]): The predicted annotations + correct annotations.
|
||||
per_component (bool): Whether to return the scores keyed by component
|
||||
name. Defaults to False.
|
||||
RETURNS (Dict): A dictionary of scores.
|
||||
|
||||
DOCS: https://spacy.io/api/scorer#score
|
||||
"""
|
||||
scores = {}
|
||||
if hasattr(self.nlp.tokenizer, "score"):
|
||||
scores.update(self.nlp.tokenizer.score(examples, **self.cfg)) # type: ignore
|
||||
if per_component:
|
||||
scores["tokenizer"] = self.nlp.tokenizer.score(examples, **self.cfg)
|
||||
else:
|
||||
scores.update(self.nlp.tokenizer.score(examples, **self.cfg)) # type: ignore
|
||||
for name, component in self.nlp.pipeline:
|
||||
if hasattr(component, "score"):
|
||||
scores.update(component.score(examples, **self.cfg))
|
||||
if per_component:
|
||||
scores[name] = component.score(examples, **self.cfg)
|
||||
else:
|
||||
scores.update(component.score(examples, **self.cfg))
|
||||
return scores
|
||||
|
||||
@staticmethod
|
||||
|
|
|
@ -291,6 +291,11 @@ def ml_tokenizer():
|
|||
return get_lang_class("ml")().tokenizer
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def ms_tokenizer():
|
||||
return get_lang_class("ms")().tokenizer
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def nb_tokenizer():
|
||||
return get_lang_class("nb")().tokenizer
|
||||
|
|
|
@ -33,6 +33,8 @@ def test_token_morph_key(i_has):
|
|||
def test_morph_props(i_has):
|
||||
assert i_has[0].morph.get("PronType") == ["prs"]
|
||||
assert i_has[1].morph.get("PronType") == []
|
||||
assert i_has[1].morph.get("AsdfType", ["asdf"]) == ["asdf"]
|
||||
assert i_has[1].morph.get("AsdfType", default=["asdf", "qwer"]) == ["asdf", "qwer"]
|
||||
|
||||
|
||||
def test_morph_iter(i_has):
|
||||
|
|
|
@ -163,6 +163,18 @@ def test_char_span(doc, i_sent, i, j, text):
|
|||
assert span.text == text
|
||||
|
||||
|
||||
def test_char_span_attributes(doc):
|
||||
label = "LABEL"
|
||||
kb_id = "KB_ID"
|
||||
span_id = "SPAN_ID"
|
||||
span1 = doc.char_span(20, 45, label=label, kb_id=kb_id, span_id=span_id)
|
||||
span2 = doc[1:].char_span(15, 40, label=label, kb_id=kb_id, span_id=span_id)
|
||||
assert span1.text == span2.text
|
||||
assert span1.label_ == span2.label_ == label
|
||||
assert span1.kb_id_ == span2.kb_id_ == kb_id
|
||||
assert span1.id_ == span2.id_ == span_id
|
||||
|
||||
|
||||
def test_spans_sent_spans(doc):
|
||||
sents = list(doc.sents)
|
||||
assert sents[0].start == 0
|
||||
|
@ -367,6 +379,14 @@ def test_spans_by_character(doc):
|
|||
span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="unk"
|
||||
)
|
||||
|
||||
# Span.char_span + alignment mode "contract"
|
||||
span2 = doc[0:2].char_span(
|
||||
span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract"
|
||||
)
|
||||
assert span1.start_char == span2.start_char
|
||||
assert span1.end_char == span2.end_char
|
||||
assert span2.label_ == "GPE"
|
||||
|
||||
|
||||
def test_span_to_array(doc):
|
||||
span = doc[1:-2]
|
||||
|
@ -680,3 +700,34 @@ def test_span_group_copy(doc):
|
|||
assert len(doc.spans["test"]) == 3
|
||||
# check that the copy spans were not modified and this is an isolated doc
|
||||
assert len(doc_copy.spans["test"]) == 2
|
||||
|
||||
|
||||
def test_for_partial_ent_sents():
|
||||
"""Spans may be associated with multiple sentences. These .sents should always be complete, not partial, sentences,
|
||||
which this tests for.
|
||||
"""
|
||||
doc = Doc(
|
||||
English().vocab,
|
||||
words=["Mahler's", "Symphony", "No.", "8", "was", "beautiful."],
|
||||
sent_starts=[1, 0, 0, 1, 0, 0],
|
||||
)
|
||||
doc.set_ents([Span(doc, 1, 4, "WORK")])
|
||||
# The specified entity is associated with both sentences in this doc, so we expect all sentences in the doc to be
|
||||
# equal to the sentences referenced in ent.sents.
|
||||
for doc_sent, ent_sent in zip(doc.sents, doc.ents[0].sents):
|
||||
assert doc_sent == ent_sent
|
||||
|
||||
|
||||
def test_for_no_ent_sents():
|
||||
"""Span.sents() should set .sents correctly, even if Span in question is trailing and doesn't form a full
|
||||
sentence.
|
||||
"""
|
||||
doc = Doc(
|
||||
English().vocab,
|
||||
words=["This", "is", "a", "test.", "ENTITY"],
|
||||
sent_starts=[1, 0, 0, 0, 1],
|
||||
)
|
||||
doc.set_ents([Span(doc, 4, 5, "WORK")])
|
||||
sents = list(doc.ents[0].sents)
|
||||
assert len(sents) == 1
|
||||
assert str(sents[0]) == str(doc.ents[0].sent) == "ENTITY"
|
||||
|
|
52
spacy/tests/lang/la/test_noun_chunks.py
Normal file
52
spacy/tests/lang/la/test_noun_chunks.py
Normal file
|
@ -0,0 +1,52 @@
|
|||
import pytest
|
||||
from spacy.tokens import Doc
|
||||
|
||||
|
||||
def test_noun_chunks_is_parsed(la_tokenizer):
|
||||
"""Test that noun_chunks raises Value Error for 'la' language if Doc is not parsed.
|
||||
To check this test, we're constructing a Doc
|
||||
with a new Vocab here and forcing is_parsed to 'False'
|
||||
to make sure the noun chunks don't run.
|
||||
"""
|
||||
doc = la_tokenizer("Haec est sententia.")
|
||||
with pytest.raises(ValueError):
|
||||
list(doc.noun_chunks)
|
||||
|
||||
|
||||
LA_NP_TEST_EXAMPLES = [
|
||||
(
|
||||
"Haec narrantur a poetis de Perseo.",
|
||||
["DET", "VERB", "ADP", "NOUN", "ADP", "PROPN", "PUNCT"],
|
||||
["nsubj:pass", "ROOT", "case", "obl", "case", "obl", "punct"],
|
||||
[1, 0, -1, -1, -3, -1, -5],
|
||||
["poetis", "Perseo"],
|
||||
),
|
||||
(
|
||||
"Perseus autem in sinu matris dormiebat.",
|
||||
["NOUN", "ADV", "ADP", "NOUN", "NOUN", "VERB", "PUNCT"],
|
||||
["nsubj", "discourse", "case", "obl", "nmod", "ROOT", "punct"],
|
||||
[5, 4, 3, -1, -1, 0, -1],
|
||||
["Perseus", "sinu matris"],
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text,pos,deps,heads,expected_noun_chunks", LA_NP_TEST_EXAMPLES
|
||||
)
|
||||
def test_la_noun_chunks(la_tokenizer, text, pos, deps, heads, expected_noun_chunks):
|
||||
tokens = la_tokenizer(text)
|
||||
|
||||
assert len(heads) == len(pos)
|
||||
doc = Doc(
|
||||
tokens.vocab,
|
||||
words=[t.text for t in tokens],
|
||||
heads=[head + i for i, head in enumerate(heads)],
|
||||
deps=deps,
|
||||
pos=pos,
|
||||
)
|
||||
|
||||
noun_chunks = list(doc.noun_chunks)
|
||||
assert len(noun_chunks) == len(expected_noun_chunks)
|
||||
for i, np in enumerate(noun_chunks):
|
||||
assert np.text == expected_noun_chunks[i]
|
0
spacy/tests/lang/ms/__init__.py
Normal file
0
spacy/tests/lang/ms/__init__.py
Normal file
8
spacy/tests/lang/ms/test_noun_chunks.py
Normal file
8
spacy/tests/lang/ms/test_noun_chunks.py
Normal file
|
@ -0,0 +1,8 @@
|
|||
import pytest
|
||||
|
||||
|
||||
def test_noun_chunks_is_parsed_ms(ms_tokenizer):
|
||||
"""Test that noun_chunks raises Value Error for 'ms' language if Doc is not parsed."""
|
||||
doc = ms_tokenizer("sebelas")
|
||||
with pytest.raises(ValueError):
|
||||
list(doc.noun_chunks)
|
112
spacy/tests/lang/ms/test_prefix_suffix_infix.py
Normal file
112
spacy/tests/lang/ms/test_prefix_suffix_infix.py
Normal file
|
@ -0,0 +1,112 @@
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text", ["(Ma'arif)"])
|
||||
def test_ms_tokenizer_splits_no_special(id_tokenizer, text):
|
||||
tokens = id_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text", ["Ma'arif"])
|
||||
def test_ms_tokenizer_splits_no_punct(id_tokenizer, text):
|
||||
tokens = id_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text", ["(Ma'arif"])
|
||||
def test_ms_tokenizer_splits_prefix_punct(id_tokenizer, text):
|
||||
tokens = id_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text", ["Ma'arif)"])
|
||||
def test_ms_tokenizer_splits_suffix_punct(id_tokenizer, text):
|
||||
tokens = id_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text", ["(Ma'arif)"])
|
||||
def test_ms_tokenizer_splits_even_wrap(id_tokenizer, text):
|
||||
tokens = id_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text", ["(Ma'arif?)"])
|
||||
def test_tokenizer_splits_uneven_wrap(id_tokenizer, text):
|
||||
tokens = id_tokenizer(text)
|
||||
assert len(tokens) == 4
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text,length", [("S.Kom.", 1), ("SKom.", 2), ("(S.Kom.", 2)])
|
||||
def test_ms_tokenizer_splits_prefix_interact(id_tokenizer, text, length):
|
||||
tokens = id_tokenizer(text)
|
||||
assert len(tokens) == length
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text", ["S.Kom.)"])
|
||||
def test_ms_tokenizer_splits_suffix_interact(id_tokenizer, text):
|
||||
tokens = id_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text", ["(S.Kom.)"])
|
||||
def test_ms_tokenizer_splits_even_wrap_interact(id_tokenizer, text):
|
||||
tokens = id_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text", ["(S.Kom.?)"])
|
||||
def test_ms_tokenizer_splits_uneven_wrap_interact(id_tokenizer, text):
|
||||
tokens = id_tokenizer(text)
|
||||
assert len(tokens) == 4
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text,length",
|
||||
[("kerana", 1), ("Mahathir-Anwar", 3), ("Tun Dr. Ismail-Abdul Rahman", 6)],
|
||||
)
|
||||
def test_my_tokenizer_splits_hyphens(ms_tokenizer, text, length):
|
||||
tokens = ms_tokenizer(text)
|
||||
assert len(tokens) == length
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text", ["0.1-13.5", "0.0-0.1", "103.27-300"])
|
||||
def test_ms_tokenizer_splits_numeric_range(id_tokenizer, text):
|
||||
tokens = id_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text", ["ini.Sani", "Halo.Malaysia"])
|
||||
def test_ms_tokenizer_splits_period_infix(id_tokenizer, text):
|
||||
tokens = id_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text", ["Halo,Malaysia", "satu,dua"])
|
||||
def test_ms_tokenizer_splits_comma_infix(id_tokenizer, text):
|
||||
tokens = id_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
assert tokens[0].text == text.split(",")[0]
|
||||
assert tokens[1].text == ","
|
||||
assert tokens[2].text == text.split(",")[1]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text", ["halo...Malaysia", "dia...pergi"])
|
||||
def test_ms_tokenizer_splits_ellipsis_infix(id_tokenizer, text):
|
||||
tokens = id_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
def test_ms_tokenizer_splits_double_hyphen_infix(id_tokenizer):
|
||||
tokens = id_tokenizer("Arsene Wenger--pengurus Arsenal--mengadakan sidang media.")
|
||||
assert len(tokens) == 10
|
||||
assert tokens[0].text == "Arsene"
|
||||
assert tokens[1].text == "Wenger"
|
||||
assert tokens[2].text == "--"
|
||||
assert tokens[3].text == "pengurus"
|
||||
assert tokens[4].text == "Arsenal"
|
||||
assert tokens[5].text == "--"
|
||||
assert tokens[6].text == "mengadakan"
|
||||
assert tokens[7].text == "sidang"
|
||||
assert tokens[8].text == "media"
|
||||
assert tokens[9].text == "."
|
8
spacy/tests/lang/ms/test_text.py
Normal file
8
spacy/tests/lang/ms/test_text.py
Normal file
|
@ -0,0 +1,8 @@
|
|||
import pytest
|
||||
from spacy.lang.ms.lex_attrs import like_num
|
||||
|
||||
|
||||
@pytest.mark.parametrize("word", ["sebelas"])
|
||||
def test_ms_lex_attrs_capitals(word):
|
||||
assert like_num(word)
|
||||
assert like_num(word.upper())
|
|
@ -32,3 +32,10 @@ def test_tokenizer_splits_comma_infix(sv_tokenizer, text):
|
|||
def test_tokenizer_splits_ellipsis_infix(sv_tokenizer, text):
|
||||
tokens = sv_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.issue(12311)
|
||||
@pytest.mark.parametrize("text", ["99:e", "c:a", "EU:s", "Maj:t"])
|
||||
def test_sv_tokenizer_handles_colon(sv_tokenizer, text):
|
||||
tokens = sv_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
|
|
|
@ -316,16 +316,32 @@ def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches):
|
|||
("the", "brown", "$--", 0),
|
||||
("brown", "the", "$--", 1),
|
||||
("brown", "brown", "$--", 0),
|
||||
("over", "jumped", "<+", 0),
|
||||
("quick", "fox", "<+", 0),
|
||||
("the", "quick", "<+", 0),
|
||||
("brown", "fox", "<+", 1),
|
||||
("quick", "fox", "<++", 1),
|
||||
("quick", "over", "<++", 0),
|
||||
("over", "jumped", "<++", 0),
|
||||
("the", "fox", "<++", 2),
|
||||
("brown", "fox", "<-", 0),
|
||||
("fox", "over", "<-", 0),
|
||||
("the", "over", "<-", 0),
|
||||
("over", "jumped", "<-", 1),
|
||||
("brown", "fox", "<--", 0),
|
||||
("fox", "jumped", "<--", 0),
|
||||
("fox", "over", "<--", 1),
|
||||
("fox", "brown", ">+", 0),
|
||||
("over", "fox", ">+", 0),
|
||||
("over", "the", ">+", 0),
|
||||
("jumped", "over", ">+", 1),
|
||||
("jumped", "over", ">++", 1),
|
||||
("fox", "lazy", ">++", 0),
|
||||
("over", "the", ">++", 0),
|
||||
("jumped", "over", ">-", 0),
|
||||
("fox", "quick", ">-", 0),
|
||||
("brown", "quick", ">-", 0),
|
||||
("fox", "brown", ">-", 1),
|
||||
("brown", "fox", ">--", 0),
|
||||
("fox", "brown", ">--", 1),
|
||||
("jumped", "fox", ">--", 1),
|
||||
|
|
|
@ -9,6 +9,8 @@ from spacy.lang.en import English
|
|||
from spacy.lang.it import Italian
|
||||
from spacy.language import Language
|
||||
from spacy.lookups import Lookups
|
||||
from spacy.pipeline import EntityRecognizer
|
||||
from spacy.pipeline.ner import DEFAULT_NER_MODEL
|
||||
from spacy.pipeline._parser_internals.ner import BiluoPushDown
|
||||
from spacy.training import Example, iob_to_biluo, split_bilu_label
|
||||
from spacy.tokens import Doc, Span
|
||||
|
@ -16,8 +18,6 @@ from spacy.vocab import Vocab
|
|||
import logging
|
||||
|
||||
from ..util import make_tempdir
|
||||
from ...pipeline import EntityRecognizer
|
||||
from ...pipeline.ner import DEFAULT_NER_MODEL
|
||||
|
||||
TRAIN_DATA = [
|
||||
("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
|
||||
|
|
|
@ -8,11 +8,11 @@ from spacy.lang.en import English
|
|||
from spacy.tokens import Doc
|
||||
from spacy.training import Example
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.pipeline import DependencyParser
|
||||
from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
|
||||
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
|
||||
|
||||
from ...pipeline import DependencyParser
|
||||
from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL
|
||||
from ..util import apply_transition_sequence, make_tempdir
|
||||
from ...pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
|
||||
|
||||
TRAIN_DATA = [
|
||||
(
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
from typing import Callable, Iterable, Dict, Any
|
||||
from typing import Callable, Iterable, Dict, Any, Tuple
|
||||
|
||||
import pytest
|
||||
from numpy.testing import assert_equal
|
||||
|
||||
from spacy import registry, util
|
||||
from spacy import registry, util, Language
|
||||
from spacy.attrs import ENT_KB_ID
|
||||
from spacy.compat import pickle
|
||||
from spacy.kb import Candidate, InMemoryLookupKB, get_candidates, KnowledgeBase
|
||||
|
@ -108,18 +108,23 @@ def test_issue7065():
|
|||
|
||||
|
||||
@pytest.mark.issue(7065)
|
||||
def test_issue7065_b():
|
||||
@pytest.mark.parametrize("entity_in_first_sentence", [True, False])
|
||||
def test_sentence_crossing_ents(entity_in_first_sentence: bool):
|
||||
"""Tests if NEL crashes if entities cross sentence boundaries and the first associated sentence doesn't have an
|
||||
entity.
|
||||
entity_in_prior_sentence (bool): Whether to include an entity in the first sentence associated with the
|
||||
sentence-crossing entity.
|
||||
"""
|
||||
# Test that the NEL doesn't crash when an entity crosses a sentence boundary
|
||||
nlp = English()
|
||||
vector_length = 3
|
||||
nlp.add_pipe("sentencizer")
|
||||
text = "Mahler 's Symphony No. 8 was beautiful."
|
||||
entities = [(0, 6, "PERSON"), (10, 24, "WORK")]
|
||||
links = {
|
||||
(0, 6): {"Q7304": 1.0, "Q270853": 0.0},
|
||||
(10, 24): {"Q7304": 0.0, "Q270853": 1.0},
|
||||
}
|
||||
sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0]
|
||||
entities = [(10, 24, "WORK")]
|
||||
links = {(10, 24): {"Q7304": 0.0, "Q270853": 1.0}}
|
||||
if entity_in_first_sentence:
|
||||
entities.append((0, 6, "PERSON"))
|
||||
links[(0, 6)] = {"Q7304": 1.0, "Q270853": 0.0}
|
||||
sent_starts = [1, -1, 0, 0, 0, 1, 0, 0, 0]
|
||||
doc = nlp(text)
|
||||
example = Example.from_dict(
|
||||
doc, {"entities": entities, "links": links, "sent_starts": sent_starts}
|
||||
|
@ -145,31 +150,14 @@ def test_issue7065_b():
|
|||
|
||||
# Create the Entity Linker component and add it to the pipeline
|
||||
entity_linker = nlp.add_pipe("entity_linker", last=True)
|
||||
entity_linker.set_kb(create_kb)
|
||||
entity_linker.set_kb(create_kb) # type: ignore
|
||||
# train the NEL pipe
|
||||
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||
for i in range(2):
|
||||
losses = {}
|
||||
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
||||
nlp.update(train_examples, sgd=optimizer)
|
||||
|
||||
# Add a custom rule-based component to mimick NER
|
||||
patterns = [
|
||||
{"label": "PERSON", "pattern": [{"LOWER": "mahler"}]},
|
||||
{
|
||||
"label": "WORK",
|
||||
"pattern": [
|
||||
{"LOWER": "symphony"},
|
||||
{"LOWER": "no"},
|
||||
{"LOWER": "."},
|
||||
{"LOWER": "8"},
|
||||
],
|
||||
},
|
||||
]
|
||||
ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
|
||||
ruler.add_patterns(patterns)
|
||||
# test the trained model - this should not throw E148
|
||||
doc = nlp(text)
|
||||
assert doc
|
||||
# This shouldn't crash.
|
||||
entity_linker.predict([example.reference]) # type: ignore
|
||||
|
||||
|
||||
def test_no_entities():
|
||||
|
@ -353,6 +341,9 @@ def test_kb_default(nlp):
|
|||
"""Test that the default (empty) KB is loaded upon construction"""
|
||||
entity_linker = nlp.add_pipe("entity_linker", config={})
|
||||
assert len(entity_linker.kb) == 0
|
||||
with pytest.raises(ValueError, match="E139"):
|
||||
# this raises an error because the KB is empty
|
||||
entity_linker.validate_kb()
|
||||
assert entity_linker.kb.get_size_entities() == 0
|
||||
assert entity_linker.kb.get_size_aliases() == 0
|
||||
# 64 is the default value from pipeline.entity_linker
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
import pytest
|
||||
from numpy.testing import assert_equal
|
||||
from numpy.testing import assert_equal, assert_almost_equal
|
||||
|
||||
from thinc.api import get_current_ops
|
||||
|
||||
from spacy import util
|
||||
from spacy.training import Example
|
||||
|
@ -19,6 +21,8 @@ def test_label_types():
|
|||
morphologizer.add_label(9)
|
||||
|
||||
|
||||
TAGS = ["Feat=N", "Feat=V", "Feat=J"]
|
||||
|
||||
TRAIN_DATA = [
|
||||
(
|
||||
"I like green eggs",
|
||||
|
@ -32,6 +36,30 @@ TRAIN_DATA = [
|
|||
]
|
||||
|
||||
|
||||
def test_label_smoothing():
|
||||
nlp = Language()
|
||||
morph_no_ls = nlp.add_pipe("morphologizer", "no_label_smoothing")
|
||||
morph_ls = nlp.add_pipe(
|
||||
"morphologizer", "label_smoothing", config=dict(label_smoothing=0.05)
|
||||
)
|
||||
train_examples = []
|
||||
losses = {}
|
||||
for tag in TAGS:
|
||||
morph_no_ls.add_label(tag)
|
||||
morph_ls.add_label(tag)
|
||||
for t in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
|
||||
nlp.initialize(get_examples=lambda: train_examples)
|
||||
tag_scores, bp_tag_scores = morph_ls.model.begin_update(
|
||||
[eg.predicted for eg in train_examples]
|
||||
)
|
||||
ops = get_current_ops()
|
||||
no_ls_grads = ops.to_numpy(morph_no_ls.get_loss(train_examples, tag_scores)[1][0])
|
||||
ls_grads = ops.to_numpy(morph_ls.get_loss(train_examples, tag_scores)[1][0])
|
||||
assert_almost_equal(ls_grads / no_ls_grads, 0.94285715)
|
||||
|
||||
|
||||
def test_no_label():
|
||||
nlp = Language()
|
||||
nlp.add_pipe("morphologizer")
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import pytest
|
||||
import numpy
|
||||
from numpy.testing import assert_array_equal, assert_almost_equal
|
||||
from thinc.api import get_current_ops, Ragged
|
||||
from thinc.api import get_current_ops, NumpyOps, Ragged
|
||||
|
||||
from spacy import util
|
||||
from spacy.lang.en import English
|
||||
|
@ -15,6 +15,8 @@ OPS = get_current_ops()
|
|||
|
||||
SPAN_KEY = "labeled_spans"
|
||||
|
||||
SPANCAT_COMPONENTS = ["spancat", "spancat_singlelabel"]
|
||||
|
||||
TRAIN_DATA = [
|
||||
("Who is Shaka Khan?", {"spans": {SPAN_KEY: [(7, 17, "PERSON")]}}),
|
||||
(
|
||||
|
@ -41,38 +43,42 @@ def make_examples(nlp, data=TRAIN_DATA):
|
|||
return train_examples
|
||||
|
||||
|
||||
def test_no_label():
|
||||
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
|
||||
def test_no_label(name):
|
||||
nlp = Language()
|
||||
nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
|
||||
nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
|
||||
with pytest.raises(ValueError):
|
||||
nlp.initialize()
|
||||
|
||||
|
||||
def test_no_resize():
|
||||
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
|
||||
def test_no_resize(name):
|
||||
nlp = Language()
|
||||
spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
|
||||
spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
|
||||
spancat.add_label("Thing")
|
||||
spancat.add_label("Phrase")
|
||||
assert spancat.labels == ("Thing", "Phrase")
|
||||
nlp.initialize()
|
||||
assert spancat.model.get_dim("nO") == 2
|
||||
assert spancat.model.get_dim("nO") == spancat._n_labels
|
||||
# this throws an error because the spancat can't be resized after initialization
|
||||
with pytest.raises(ValueError):
|
||||
spancat.add_label("Stuff")
|
||||
|
||||
|
||||
def test_implicit_labels():
|
||||
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
|
||||
def test_implicit_labels(name):
|
||||
nlp = Language()
|
||||
spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
|
||||
spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
|
||||
assert len(spancat.labels) == 0
|
||||
train_examples = make_examples(nlp)
|
||||
nlp.initialize(get_examples=lambda: train_examples)
|
||||
assert spancat.labels == ("PERSON", "LOC")
|
||||
|
||||
|
||||
def test_explicit_labels():
|
||||
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
|
||||
def test_explicit_labels(name):
|
||||
nlp = Language()
|
||||
spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
|
||||
spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
|
||||
assert len(spancat.labels) == 0
|
||||
spancat.add_label("PERSON")
|
||||
spancat.add_label("LOC")
|
||||
|
@ -102,13 +108,13 @@ def test_doc_gc():
|
|||
# XXX This fails with length 0 sometimes
|
||||
assert len(spangroup) > 0
|
||||
with pytest.raises(RuntimeError):
|
||||
span = spangroup[0]
|
||||
spangroup[0]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"max_positive,nr_results", [(None, 4), (1, 2), (2, 3), (3, 4), (4, 4)]
|
||||
)
|
||||
def test_make_spangroup(max_positive, nr_results):
|
||||
def test_make_spangroup_multilabel(max_positive, nr_results):
|
||||
fix_random_seed(0)
|
||||
nlp = Language()
|
||||
spancat = nlp.add_pipe(
|
||||
|
@ -120,10 +126,12 @@ def test_make_spangroup(max_positive, nr_results):
|
|||
indices = ngram_suggester([doc])[0].dataXd
|
||||
assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]]))
|
||||
labels = ["Thing", "City", "Person", "GreatCity"]
|
||||
for label in labels:
|
||||
spancat.add_label(label)
|
||||
scores = numpy.asarray(
|
||||
[[0.2, 0.4, 0.3, 0.1], [0.1, 0.6, 0.2, 0.4], [0.8, 0.7, 0.3, 0.9]], dtype="f"
|
||||
)
|
||||
spangroup = spancat._make_span_group(doc, indices, scores, labels)
|
||||
spangroup = spancat._make_span_group_multilabel(doc, indices, scores)
|
||||
assert len(spangroup) == nr_results
|
||||
|
||||
# first span is always the second token "London"
|
||||
|
@ -154,6 +162,130 @@ def test_make_spangroup(max_positive, nr_results):
|
|||
assert_almost_equal(0.9, spangroup.attrs["scores"][-1], 5)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"threshold,allow_overlap,nr_results",
|
||||
[(0.05, True, 3), (0.05, False, 1), (0.5, True, 2), (0.5, False, 1)],
|
||||
)
|
||||
def test_make_spangroup_singlelabel(threshold, allow_overlap, nr_results):
|
||||
fix_random_seed(0)
|
||||
nlp = Language()
|
||||
spancat = nlp.add_pipe(
|
||||
"spancat",
|
||||
config={
|
||||
"spans_key": SPAN_KEY,
|
||||
"threshold": threshold,
|
||||
"max_positive": 1,
|
||||
},
|
||||
)
|
||||
doc = nlp.make_doc("Greater London")
|
||||
ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2])
|
||||
indices = ngram_suggester([doc])[0].dataXd
|
||||
assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]]))
|
||||
labels = ["Thing", "City", "Person", "GreatCity"]
|
||||
for label in labels:
|
||||
spancat.add_label(label)
|
||||
scores = numpy.asarray(
|
||||
[[0.2, 0.4, 0.3, 0.1], [0.1, 0.6, 0.2, 0.4], [0.8, 0.7, 0.3, 0.9]], dtype="f"
|
||||
)
|
||||
spangroup = spancat._make_span_group_singlelabel(
|
||||
doc, indices, scores, allow_overlap
|
||||
)
|
||||
if threshold > 0.4:
|
||||
if allow_overlap:
|
||||
assert spangroup[0].text == "London"
|
||||
assert spangroup[0].label_ == "City"
|
||||
assert_almost_equal(0.6, spangroup.attrs["scores"][0], 5)
|
||||
assert spangroup[1].text == "Greater London"
|
||||
assert spangroup[1].label_ == "GreatCity"
|
||||
assert spangroup.attrs["scores"][1] == 0.9
|
||||
assert_almost_equal(0.9, spangroup.attrs["scores"][1], 5)
|
||||
else:
|
||||
assert spangroup[0].text == "Greater London"
|
||||
assert spangroup[0].label_ == "GreatCity"
|
||||
assert spangroup.attrs["scores"][0] == 0.9
|
||||
else:
|
||||
if allow_overlap:
|
||||
assert spangroup[0].text == "Greater"
|
||||
assert spangroup[0].label_ == "City"
|
||||
assert spangroup[1].text == "London"
|
||||
assert spangroup[1].label_ == "City"
|
||||
assert spangroup[2].text == "Greater London"
|
||||
assert spangroup[2].label_ == "GreatCity"
|
||||
else:
|
||||
assert spangroup[0].text == "Greater London"
|
||||
|
||||
|
||||
def test_make_spangroup_negative_label():
|
||||
fix_random_seed(0)
|
||||
nlp_single = Language()
|
||||
nlp_multi = Language()
|
||||
spancat_single = nlp_single.add_pipe(
|
||||
"spancat",
|
||||
config={
|
||||
"spans_key": SPAN_KEY,
|
||||
"threshold": 0.1,
|
||||
"max_positive": 1,
|
||||
},
|
||||
)
|
||||
spancat_multi = nlp_multi.add_pipe(
|
||||
"spancat",
|
||||
config={
|
||||
"spans_key": SPAN_KEY,
|
||||
"threshold": 0.1,
|
||||
"max_positive": 2,
|
||||
},
|
||||
)
|
||||
spancat_single.add_negative_label = True
|
||||
spancat_multi.add_negative_label = True
|
||||
doc = nlp_single.make_doc("Greater London")
|
||||
labels = ["Thing", "City", "Person", "GreatCity"]
|
||||
for label in labels:
|
||||
spancat_multi.add_label(label)
|
||||
spancat_single.add_label(label)
|
||||
ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2])
|
||||
indices = ngram_suggester([doc])[0].dataXd
|
||||
assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]]))
|
||||
scores = numpy.asarray(
|
||||
[
|
||||
[0.2, 0.4, 0.3, 0.1, 0.1],
|
||||
[0.1, 0.6, 0.2, 0.4, 0.9],
|
||||
[0.8, 0.7, 0.3, 0.9, 0.1],
|
||||
],
|
||||
dtype="f",
|
||||
)
|
||||
spangroup_multi = spancat_multi._make_span_group_multilabel(doc, indices, scores)
|
||||
spangroup_single = spancat_single._make_span_group_singlelabel(doc, indices, scores)
|
||||
assert len(spangroup_single) == 2
|
||||
assert spangroup_single[0].text == "Greater"
|
||||
assert spangroup_single[0].label_ == "City"
|
||||
assert_almost_equal(0.4, spangroup_single.attrs["scores"][0], 5)
|
||||
assert spangroup_single[1].text == "Greater London"
|
||||
assert spangroup_single[1].label_ == "GreatCity"
|
||||
assert spangroup_single.attrs["scores"][1] == 0.9
|
||||
assert_almost_equal(0.9, spangroup_single.attrs["scores"][1], 5)
|
||||
|
||||
assert len(spangroup_multi) == 6
|
||||
assert spangroup_multi[0].text == "Greater"
|
||||
assert spangroup_multi[0].label_ == "City"
|
||||
assert_almost_equal(0.4, spangroup_multi.attrs["scores"][0], 5)
|
||||
assert spangroup_multi[1].text == "Greater"
|
||||
assert spangroup_multi[1].label_ == "Person"
|
||||
assert_almost_equal(0.3, spangroup_multi.attrs["scores"][1], 5)
|
||||
assert spangroup_multi[2].text == "London"
|
||||
assert spangroup_multi[2].label_ == "City"
|
||||
assert_almost_equal(0.6, spangroup_multi.attrs["scores"][2], 5)
|
||||
assert spangroup_multi[3].text == "London"
|
||||
assert spangroup_multi[3].label_ == "GreatCity"
|
||||
assert_almost_equal(0.4, spangroup_multi.attrs["scores"][3], 5)
|
||||
assert spangroup_multi[4].text == "Greater London"
|
||||
assert spangroup_multi[4].label_ == "Thing"
|
||||
assert spangroup_multi[4].text == "Greater London"
|
||||
assert_almost_equal(0.8, spangroup_multi.attrs["scores"][4], 5)
|
||||
assert spangroup_multi[5].text == "Greater London"
|
||||
assert spangroup_multi[5].label_ == "GreatCity"
|
||||
assert_almost_equal(0.9, spangroup_multi.attrs["scores"][5], 5)
|
||||
|
||||
|
||||
def test_ngram_suggester(en_tokenizer):
|
||||
# test different n-gram lengths
|
||||
for size in [1, 2, 3]:
|
||||
|
@ -371,9 +503,9 @@ def test_overfitting_IO_overlapping():
|
|||
assert set([span.label_ for span in spans2]) == {"LOC", "DOUBLE_LOC"}
|
||||
|
||||
|
||||
def test_zero_suggestions():
|
||||
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
|
||||
def test_zero_suggestions(name):
|
||||
# Test with a suggester that can return 0 suggestions
|
||||
|
||||
@registry.misc("test_mixed_zero_suggester")
|
||||
def make_mixed_zero_suggester():
|
||||
def mixed_zero_suggester(docs, *, ops=None):
|
||||
|
@ -400,7 +532,7 @@ def test_zero_suggestions():
|
|||
fix_random_seed(0)
|
||||
nlp = English()
|
||||
spancat = nlp.add_pipe(
|
||||
"spancat",
|
||||
name,
|
||||
config={
|
||||
"suggester": {"@misc": "test_mixed_zero_suggester"},
|
||||
"spans_key": SPAN_KEY,
|
||||
|
@ -408,7 +540,7 @@ def test_zero_suggestions():
|
|||
)
|
||||
train_examples = make_examples(nlp)
|
||||
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||
assert spancat.model.get_dim("nO") == 2
|
||||
assert spancat.model.get_dim("nO") == spancat._n_labels
|
||||
assert set(spancat.labels) == {"LOC", "PERSON"}
|
||||
|
||||
nlp.update(train_examples, sgd=optimizer)
|
||||
|
@ -424,9 +556,10 @@ def test_zero_suggestions():
|
|||
list(nlp.pipe(["", "one", "three three three"]))
|
||||
|
||||
|
||||
def test_set_candidates():
|
||||
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
|
||||
def test_set_candidates(name):
|
||||
nlp = Language()
|
||||
spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY})
|
||||
spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
|
||||
train_examples = make_examples(nlp)
|
||||
nlp.initialize(get_examples=lambda: train_examples)
|
||||
texts = [
|
||||
|
@ -444,3 +577,21 @@ def test_set_candidates():
|
|||
assert len(docs[0].spans["candidates"]) == 9
|
||||
assert docs[0].spans["candidates"][0].text == "Just"
|
||||
assert docs[0].spans["candidates"][4].text == "Just a"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("name", SPANCAT_COMPONENTS)
|
||||
@pytest.mark.parametrize("n_process", [1, 2])
|
||||
def test_spancat_multiprocessing(name, n_process):
|
||||
if isinstance(get_current_ops, NumpyOps) or n_process < 2:
|
||||
nlp = Language()
|
||||
spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY})
|
||||
train_examples = make_examples(nlp)
|
||||
nlp.initialize(get_examples=lambda: train_examples)
|
||||
texts = [
|
||||
"Just a sentence.",
|
||||
"I like London and Berlin",
|
||||
"I like Berlin",
|
||||
"I eat ham.",
|
||||
]
|
||||
docs = list(nlp.pipe(texts, n_process=n_process))
|
||||
assert len(docs) == len(texts)
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
import pytest
|
||||
from numpy.testing import assert_equal
|
||||
from numpy.testing import assert_equal, assert_almost_equal
|
||||
from spacy.attrs import TAG
|
||||
|
||||
from spacy import util
|
||||
from spacy.training import Example
|
||||
from spacy.lang.en import English
|
||||
from spacy.language import Language
|
||||
from thinc.api import compounding
|
||||
from thinc.api import compounding, get_current_ops
|
||||
|
||||
from ..util import make_tempdir
|
||||
|
||||
|
@ -67,6 +67,30 @@ PARTIAL_DATA = [
|
|||
]
|
||||
|
||||
|
||||
def test_label_smoothing():
|
||||
nlp = Language()
|
||||
tagger_no_ls = nlp.add_pipe("tagger", "no_label_smoothing")
|
||||
tagger_ls = nlp.add_pipe(
|
||||
"tagger", "label_smoothing", config=dict(label_smoothing=0.05)
|
||||
)
|
||||
train_examples = []
|
||||
losses = {}
|
||||
for tag in TAGS:
|
||||
tagger_no_ls.add_label(tag)
|
||||
tagger_ls.add_label(tag)
|
||||
for t in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
|
||||
nlp.initialize(get_examples=lambda: train_examples)
|
||||
tag_scores, bp_tag_scores = tagger_ls.model.begin_update(
|
||||
[eg.predicted for eg in train_examples]
|
||||
)
|
||||
ops = get_current_ops()
|
||||
no_ls_grads = ops.to_numpy(tagger_no_ls.get_loss(train_examples, tag_scores)[1][0])
|
||||
ls_grads = ops.to_numpy(tagger_ls.get_loss(train_examples, tag_scores)[1][0])
|
||||
assert_almost_equal(ls_grads / no_ls_grads, 0.925)
|
||||
|
||||
|
||||
def test_no_label():
|
||||
nlp = Language()
|
||||
nlp.add_pipe("tagger")
|
||||
|
|
|
@ -72,7 +72,7 @@ def entity_linker():
|
|||
|
||||
def create_kb(vocab):
|
||||
kb = InMemoryLookupKB(vocab, entity_vector_length=1)
|
||||
kb.add_entity("test", 0.0, zeros((1, 1), dtype="f"))
|
||||
kb.add_entity("test", 0.0, zeros((1,), dtype="f"))
|
||||
return kb
|
||||
|
||||
entity_linker = nlp.add_pipe("entity_linker")
|
||||
|
|
|
@ -213,6 +213,13 @@ def test_serialize_doc_exclude(en_vocab):
|
|||
|
||||
def test_serialize_doc_span_groups(en_vocab):
|
||||
doc = Doc(en_vocab, words=["hello", "world", "!"])
|
||||
doc.spans["content"] = [doc[0:2]]
|
||||
span = doc[0:2]
|
||||
span.label_ = "test_serialize_doc_span_groups_label"
|
||||
span.id_ = "test_serialize_doc_span_groups_id"
|
||||
span.kb_id_ = "test_serialize_doc_span_groups_kb_id"
|
||||
doc.spans["content"] = [span]
|
||||
new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
|
||||
assert len(new_doc.spans["content"]) == 1
|
||||
assert new_doc.spans["content"][0].label_ == "test_serialize_doc_span_groups_label"
|
||||
assert new_doc.spans["content"][0].id_ == "test_serialize_doc_span_groups_id"
|
||||
assert new_doc.spans["content"][0].kb_id_ == "test_serialize_doc_span_groups_kb_id"
|
||||
|
|
|
@ -49,7 +49,11 @@ def test_serialize_doc_bin():
|
|||
nlp = English()
|
||||
for doc in nlp.pipe(texts):
|
||||
doc.cats = cats
|
||||
doc.spans["start"] = [doc[0:2]]
|
||||
span = doc[0:2]
|
||||
span.label_ = "UNUSUAL_SPAN_LABEL"
|
||||
span.id_ = "UNUSUAL_SPAN_ID"
|
||||
span.kb_id_ = "UNUSUAL_SPAN_KB_ID"
|
||||
doc.spans["start"] = [span]
|
||||
doc[0].norm_ = "UNUSUAL_TOKEN_NORM"
|
||||
doc[0].ent_id_ = "UNUSUAL_TOKEN_ENT_ID"
|
||||
doc_bin.add(doc)
|
||||
|
@ -63,6 +67,9 @@ def test_serialize_doc_bin():
|
|||
assert doc.text == texts[i]
|
||||
assert doc.cats == cats
|
||||
assert len(doc.spans) == 1
|
||||
assert doc.spans["start"][0].label_ == "UNUSUAL_SPAN_LABEL"
|
||||
assert doc.spans["start"][0].id_ == "UNUSUAL_SPAN_ID"
|
||||
assert doc.spans["start"][0].kb_id_ == "UNUSUAL_SPAN_KB_ID"
|
||||
assert doc[0].norm_ == "UNUSUAL_TOKEN_NORM"
|
||||
assert doc[0].ent_id_ == "UNUSUAL_TOKEN_ENT_ID"
|
||||
|
||||
|
|
|
@ -1,7 +1,10 @@
|
|||
from typing import Callable
|
||||
from pathlib import Path
|
||||
from typing import Callable, Iterable, Any, Dict
|
||||
|
||||
from spacy import util
|
||||
from spacy.util import ensure_path, registry, load_model_from_config
|
||||
import srsly
|
||||
|
||||
from spacy import util, Errors
|
||||
from spacy.util import ensure_path, registry, load_model_from_config, SimpleFrozenList
|
||||
from spacy.kb.kb_in_memory import InMemoryLookupKB
|
||||
from spacy.vocab import Vocab
|
||||
from thinc.api import Config
|
||||
|
@ -91,7 +94,10 @@ def test_serialize_subclassed_kb():
|
|||
|
||||
[components.entity_linker]
|
||||
factory = "entity_linker"
|
||||
|
||||
|
||||
[components.entity_linker.generate_empty_kb]
|
||||
@misc = "kb_test.CustomEmptyKB.v1"
|
||||
|
||||
[initialize]
|
||||
|
||||
[initialize.components]
|
||||
|
@ -99,7 +105,7 @@ def test_serialize_subclassed_kb():
|
|||
[initialize.components.entity_linker]
|
||||
|
||||
[initialize.components.entity_linker.kb_loader]
|
||||
@misc = "spacy.CustomKB.v1"
|
||||
@misc = "kb_test.CustomKB.v1"
|
||||
entity_vector_length = 342
|
||||
custom_field = 666
|
||||
"""
|
||||
|
@ -109,10 +115,57 @@ def test_serialize_subclassed_kb():
|
|||
super().__init__(vocab, entity_vector_length)
|
||||
self.custom_field = custom_field
|
||||
|
||||
@registry.misc("spacy.CustomKB.v1")
|
||||
def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
|
||||
"""We overwrite InMemoryLookupKB.to_disk() to ensure that self.custom_field is stored as well."""
|
||||
path = ensure_path(path)
|
||||
if not path.exists():
|
||||
path.mkdir(parents=True)
|
||||
if not path.is_dir():
|
||||
raise ValueError(Errors.E928.format(loc=path))
|
||||
|
||||
def serialize_custom_fields(file_path: Path) -> None:
|
||||
srsly.write_json(file_path, {"custom_field": self.custom_field})
|
||||
|
||||
serialize = {
|
||||
"contents": lambda p: self.write_contents(p),
|
||||
"strings.json": lambda p: self.vocab.strings.to_disk(p),
|
||||
"custom_fields": lambda p: serialize_custom_fields(p),
|
||||
}
|
||||
util.to_disk(path, serialize, exclude)
|
||||
|
||||
def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()):
|
||||
"""We overwrite InMemoryLookupKB.from_disk() to ensure that self.custom_field is loaded as well."""
|
||||
path = ensure_path(path)
|
||||
if not path.exists():
|
||||
raise ValueError(Errors.E929.format(loc=path))
|
||||
if not path.is_dir():
|
||||
raise ValueError(Errors.E928.format(loc=path))
|
||||
|
||||
def deserialize_custom_fields(file_path: Path) -> None:
|
||||
self.custom_field = srsly.read_json(file_path)["custom_field"]
|
||||
|
||||
deserialize: Dict[str, Callable[[Any], Any]] = {
|
||||
"contents": lambda p: self.read_contents(p),
|
||||
"strings.json": lambda p: self.vocab.strings.from_disk(p),
|
||||
"custom_fields": lambda p: deserialize_custom_fields(p),
|
||||
}
|
||||
util.from_disk(path, deserialize, exclude)
|
||||
|
||||
@registry.misc("kb_test.CustomEmptyKB.v1")
|
||||
def empty_custom_kb() -> Callable[[Vocab, int], SubInMemoryLookupKB]:
|
||||
def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
|
||||
return SubInMemoryLookupKB(
|
||||
vocab=vocab,
|
||||
entity_vector_length=entity_vector_length,
|
||||
custom_field=0,
|
||||
)
|
||||
|
||||
return empty_kb_factory
|
||||
|
||||
@registry.misc("kb_test.CustomKB.v1")
|
||||
def custom_kb(
|
||||
entity_vector_length: int, custom_field: int
|
||||
) -> Callable[[Vocab], InMemoryLookupKB]:
|
||||
) -> Callable[[Vocab], SubInMemoryLookupKB]:
|
||||
def custom_kb_factory(vocab):
|
||||
kb = SubInMemoryLookupKB(
|
||||
vocab=vocab,
|
||||
|
@ -139,6 +192,6 @@ def test_serialize_subclassed_kb():
|
|||
nlp2 = util.load_model_from_path(tmp_dir)
|
||||
entity_linker2 = nlp2.get_pipe("entity_linker")
|
||||
# After IO, the KB is the standard one
|
||||
assert type(entity_linker2.kb) == InMemoryLookupKB
|
||||
assert type(entity_linker2.kb) == SubInMemoryLookupKB
|
||||
assert entity_linker2.kb.entity_vector_length == 342
|
||||
assert not hasattr(entity_linker2.kb, "custom_field")
|
||||
assert entity_linker2.kb.custom_field == 666
|
||||
|
|
|
@ -2,7 +2,6 @@ import os
|
|||
import math
|
||||
from collections import Counter
|
||||
from typing import Tuple, List, Dict, Any
|
||||
import pkg_resources
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
|
@ -13,6 +12,7 @@ import srsly
|
|||
from click import NoSuchOption
|
||||
from packaging.specifiers import SpecifierSet
|
||||
from thinc.api import Config, ConfigValidationError
|
||||
from spacy.tokens import DocBin
|
||||
|
||||
from spacy import about
|
||||
from spacy.cli import info
|
||||
|
@ -28,7 +28,9 @@ from spacy.cli.debug_data import _get_span_characteristics
|
|||
from spacy.cli.debug_data import _print_span_characteristics
|
||||
from spacy.cli.debug_data import _get_spans_length_freq_dist
|
||||
from spacy.cli.download import get_compatibility, get_version
|
||||
from spacy.cli.evaluate import render_parses
|
||||
from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
|
||||
from spacy.cli.init_pipeline import _init_labels
|
||||
from spacy.cli.package import get_third_party_dependencies
|
||||
from spacy.cli.package import _is_permitted_package_name
|
||||
from spacy.cli.project.remote_storage import RemoteStorage
|
||||
|
@ -47,7 +49,6 @@ from spacy.training.converters import conll_ner_to_docs, conllu_to_docs
|
|||
from spacy.training.converters import iob_to_docs
|
||||
from spacy.util import ENV_VARS, get_minor_version, load_model_from_config, load_config
|
||||
|
||||
from ..cli.init_pipeline import _init_labels
|
||||
from .util import make_tempdir
|
||||
|
||||
|
||||
|
@ -145,6 +146,70 @@ def test_issue11235():
|
|||
assert cfg["commands"][0]["script"][0] == f"hello {lang_var}"
|
||||
|
||||
|
||||
@pytest.mark.issue(12566)
|
||||
@pytest.mark.parametrize(
|
||||
"factory,output_file",
|
||||
[("deps", "parses.html"), ("ents", "entities.html"), ("spans", "spans.html")],
|
||||
)
|
||||
def test_issue12566(factory: str, output_file: str):
|
||||
"""
|
||||
Test if all displaCy types (ents, dep, spans) produce an HTML file
|
||||
"""
|
||||
with make_tempdir() as tmp_dir:
|
||||
# Create sample spaCy file
|
||||
doc_json = {
|
||||
"ents": [
|
||||
{"end": 54, "label": "nam_adj_country", "start": 44},
|
||||
{"end": 83, "label": "nam_liv_person", "start": 69},
|
||||
{"end": 100, "label": "nam_pro_title_book", "start": 86},
|
||||
],
|
||||
"spans": {
|
||||
"sc": [
|
||||
{"end": 54, "kb_id": "", "label": "nam_adj_country", "start": 44},
|
||||
{"end": 83, "kb_id": "", "label": "nam_liv_person", "start": 69},
|
||||
{
|
||||
"end": 100,
|
||||
"kb_id": "",
|
||||
"label": "nam_pro_title_book",
|
||||
"start": 86,
|
||||
},
|
||||
]
|
||||
},
|
||||
"text": "Niedawno czytał em nową książkę znakomitego szkockiego medioznawcy , "
|
||||
"Briana McNaira - Cultural Chaos .",
|
||||
"tokens": [
|
||||
# fmt: off
|
||||
{"id": 0, "start": 0, "end": 8, "tag": "ADV", "pos": "ADV", "morph": "Degree=Pos", "lemma": "niedawno", "dep": "advmod", "head": 1, },
|
||||
{"id": 1, "start": 9, "end": 15, "tag": "PRAET", "pos": "VERB", "morph": "Animacy=Hum|Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act", "lemma": "czytać", "dep": "ROOT", "head": 1, },
|
||||
{"id": 2, "start": 16, "end": 18, "tag": "AGLT", "pos": "NOUN", "morph": "Animacy=Inan|Case=Ins|Gender=Masc|Number=Sing", "lemma": "em", "dep": "iobj", "head": 1, },
|
||||
{"id": 3, "start": 19, "end": 23, "tag": "ADJ", "pos": "ADJ", "morph": "Case=Acc|Degree=Pos|Gender=Fem|Number=Sing", "lemma": "nowy", "dep": "amod", "head": 4, },
|
||||
{"id": 4, "start": 24, "end": 31, "tag": "SUBST", "pos": "NOUN", "morph": "Case=Acc|Gender=Fem|Number=Sing", "lemma": "książka", "dep": "obj", "head": 1, },
|
||||
{"id": 5, "start": 32, "end": 43, "tag": "ADJ", "pos": "ADJ", "morph": "Animacy=Nhum|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing", "lemma": "znakomit", "dep": "acl", "head": 4, },
|
||||
{"id": 6, "start": 44, "end": 54, "tag": "ADJ", "pos": "ADJ", "morph": "Animacy=Hum|Case=Gen|Degree=Pos|Gender=Masc|Number=Sing", "lemma": "szkockiy", "dep": "amod", "head": 7, },
|
||||
{"id": 7, "start": 55, "end": 66, "tag": "SUBST", "pos": "NOUN", "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", "lemma": "medioznawca", "dep": "iobj", "head": 5, },
|
||||
{"id": 8, "start": 67, "end": 68, "tag": "INTERP", "pos": "PUNCT", "morph": "PunctType=Comm", "lemma": ",", "dep": "punct", "head": 9, },
|
||||
{"id": 9, "start": 69, "end": 75, "tag": "SUBST", "pos": "PROPN", "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", "lemma": "Brian", "dep": "nmod", "head": 4, },
|
||||
{"id": 10, "start": 76, "end": 83, "tag": "SUBST", "pos": "PROPN", "morph": "Animacy=Hum|Case=Gen|Gender=Masc|Number=Sing", "lemma": "McNair", "dep": "flat", "head": 9, },
|
||||
{"id": 11, "start": 84, "end": 85, "tag": "INTERP", "pos": "PUNCT", "morph": "PunctType=Dash", "lemma": "-", "dep": "punct", "head": 12, },
|
||||
{"id": 12, "start": 86, "end": 94, "tag": "SUBST", "pos": "PROPN", "morph": "Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing", "lemma": "Cultural", "dep": "conj", "head": 4, },
|
||||
{"id": 13, "start": 95, "end": 100, "tag": "SUBST", "pos": "NOUN", "morph": "Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing", "lemma": "Chaos", "dep": "flat", "head": 12, },
|
||||
{"id": 14, "start": 101, "end": 102, "tag": "INTERP", "pos": "PUNCT", "morph": "PunctType=Peri", "lemma": ".", "dep": "punct", "head": 1, },
|
||||
# fmt: on
|
||||
],
|
||||
}
|
||||
|
||||
# Create a .spacy file
|
||||
nlp = spacy.blank("pl")
|
||||
doc = Doc(nlp.vocab).from_json(doc_json)
|
||||
|
||||
# Run the evaluate command and check if the html files exist
|
||||
render_parses(
|
||||
docs=[doc], output_path=tmp_dir, model_name="", limit=1, **{factory: True}
|
||||
)
|
||||
|
||||
assert (tmp_dir / output_file).is_file()
|
||||
|
||||
|
||||
def test_cli_info():
|
||||
nlp = Dutch()
|
||||
nlp.add_pipe("textcat")
|
||||
|
@ -553,7 +618,14 @@ def test_parse_cli_overrides():
|
|||
|
||||
@pytest.mark.parametrize("lang", ["en", "nl"])
|
||||
@pytest.mark.parametrize(
|
||||
"pipeline", [["tagger", "parser", "ner"], [], ["ner", "textcat", "sentencizer"]]
|
||||
"pipeline",
|
||||
[
|
||||
["tagger", "parser", "ner"],
|
||||
[],
|
||||
["ner", "textcat", "sentencizer"],
|
||||
["morphologizer", "spancat", "entity_linker"],
|
||||
["spancat_singlelabel", "textcat_multilabel"],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("optimize", ["efficiency", "accuracy"])
|
||||
@pytest.mark.parametrize("pretraining", [True, False])
|
||||
|
@ -1017,8 +1089,6 @@ def test_local_remote_storage_pull_missing():
|
|||
|
||||
|
||||
def test_cli_find_threshold(capsys):
|
||||
thresholds = numpy.linspace(0, 1, 10)
|
||||
|
||||
def make_examples(nlp: Language) -> List[Example]:
|
||||
docs: List[Example] = []
|
||||
|
||||
|
@ -1082,8 +1152,6 @@ def test_cli_find_threshold(capsys):
|
|||
scores_key="cats_macro_f",
|
||||
silent=True,
|
||||
)
|
||||
assert best_threshold != thresholds[0]
|
||||
assert thresholds[0] < best_threshold < thresholds[9]
|
||||
assert best_score == max(res.values())
|
||||
assert res[1.0] == 0.0
|
||||
|
||||
|
@ -1091,7 +1159,7 @@ def test_cli_find_threshold(capsys):
|
|||
nlp, _ = init_nlp((("spancat", {}),))
|
||||
with make_tempdir() as nlp_dir:
|
||||
nlp.to_disk(nlp_dir)
|
||||
res = find_threshold(
|
||||
best_threshold, best_score, res = find_threshold(
|
||||
model=nlp_dir,
|
||||
data_path=docs_dir / "docs.spacy",
|
||||
pipe_name="spancat",
|
||||
|
@ -1099,10 +1167,8 @@ def test_cli_find_threshold(capsys):
|
|||
scores_key="spans_sc_f",
|
||||
silent=True,
|
||||
)
|
||||
assert res[0] != thresholds[0]
|
||||
assert thresholds[0] < res[0] < thresholds[8]
|
||||
assert res[1] >= 0.6
|
||||
assert res[2][1.0] == 0.0
|
||||
assert best_score == max(res.values())
|
||||
assert res[1.0] == 0.0
|
||||
|
||||
# Having multiple textcat_multilabel components should work, since the name has to be specified.
|
||||
nlp, _ = init_nlp((("textcat_multilabel", {}),))
|
||||
|
@ -1132,6 +1198,7 @@ def test_cli_find_threshold(capsys):
|
|||
)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
|
||||
@pytest.mark.parametrize(
|
||||
"reqs,output",
|
||||
[
|
||||
|
@ -1164,6 +1231,8 @@ def test_cli_find_threshold(capsys):
|
|||
],
|
||||
)
|
||||
def test_project_check_requirements(reqs, output):
|
||||
import pkg_resources
|
||||
|
||||
# excessive guard against unlikely package name
|
||||
try:
|
||||
pkg_resources.require("spacyunknowndoesnotexist12345")
|
||||
|
@ -1207,3 +1276,69 @@ def test_walk_directory():
|
|||
assert (len(walk_directory(d, suffix="iob"))) == 2
|
||||
assert (len(walk_directory(d, suffix="conll"))) == 3
|
||||
assert (len(walk_directory(d, suffix="pdf"))) == 0
|
||||
|
||||
|
||||
def test_debug_data_trainable_lemmatizer_basic():
|
||||
examples = [
|
||||
("She likes green eggs", {"lemmas": ["she", "like", "green", "egg"]}),
|
||||
("Eat blue ham", {"lemmas": ["eat", "blue", "ham"]}),
|
||||
]
|
||||
nlp = Language()
|
||||
train_examples = []
|
||||
for t in examples:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
|
||||
data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)
|
||||
# ref test_edit_tree_lemmatizer::test_initialize_from_labels
|
||||
# this results in 4 trees
|
||||
assert len(data["lemmatizer_trees"]) == 4
|
||||
|
||||
|
||||
def test_debug_data_trainable_lemmatizer_partial():
|
||||
partial_examples = [
|
||||
# partial annotation
|
||||
("She likes green eggs", {"lemmas": ["", "like", "green", ""]}),
|
||||
# misaligned partial annotation
|
||||
(
|
||||
"He hates green eggs",
|
||||
{
|
||||
"words": ["He", "hat", "es", "green", "eggs"],
|
||||
"lemmas": ["", "hat", "e", "green", ""],
|
||||
},
|
||||
),
|
||||
]
|
||||
nlp = Language()
|
||||
train_examples = []
|
||||
for t in partial_examples:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
|
||||
data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)
|
||||
assert data["partial_lemma_annotations"] == 2
|
||||
|
||||
|
||||
def test_debug_data_trainable_lemmatizer_low_cardinality():
|
||||
low_cardinality_examples = [
|
||||
("She likes green eggs", {"lemmas": ["no", "no", "no", "no"]}),
|
||||
("Eat blue ham", {"lemmas": ["no", "no", "no"]}),
|
||||
]
|
||||
nlp = Language()
|
||||
train_examples = []
|
||||
for t in low_cardinality_examples:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
|
||||
data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)
|
||||
assert data["n_low_cardinality_lemmas"] == 2
|
||||
|
||||
|
||||
def test_debug_data_trainable_lemmatizer_not_annotated():
|
||||
unannotated_examples = [
|
||||
("She likes green eggs", {}),
|
||||
("Eat blue ham", {}),
|
||||
]
|
||||
nlp = Language()
|
||||
train_examples = []
|
||||
for t in unannotated_examples:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
|
||||
data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)
|
||||
assert data["no_lemma_annotations"] == 2
|
||||
|
|
|
@ -1,13 +1,21 @@
|
|||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
import srsly
|
||||
from typer.testing import CliRunner
|
||||
from spacy.tokens import DocBin, Doc, Span
|
||||
from spacy.lang.en import English
|
||||
|
||||
from spacy.cli._util import app
|
||||
from .util import make_tempdir
|
||||
from spacy.cli._util import app, get_git_version
|
||||
from .util import make_tempdir, normalize_whitespace
|
||||
|
||||
|
||||
def has_git():
|
||||
try:
|
||||
get_git_version()
|
||||
return True
|
||||
except RuntimeError:
|
||||
return False
|
||||
|
||||
|
||||
def test_convert_auto():
|
||||
|
@ -41,11 +49,195 @@ def test_benchmark_accuracy_alias():
|
|||
# Verify that the `evaluate` alias works correctly.
|
||||
result_benchmark = CliRunner().invoke(app, ["benchmark", "accuracy", "--help"])
|
||||
result_evaluate = CliRunner().invoke(app, ["evaluate", "--help"])
|
||||
assert result_benchmark.stdout == result_evaluate.stdout.replace(
|
||||
"spacy evaluate", "spacy benchmark accuracy"
|
||||
assert normalize_whitespace(result_benchmark.stdout) == normalize_whitespace(
|
||||
result_evaluate.stdout.replace("spacy evaluate", "spacy benchmark accuracy")
|
||||
)
|
||||
|
||||
|
||||
def test_debug_data_trainable_lemmatizer_cli(en_vocab):
|
||||
train_docs = [
|
||||
Doc(en_vocab, words=["I", "like", "cats"], lemmas=["I", "like", "cat"]),
|
||||
Doc(
|
||||
en_vocab,
|
||||
words=["Dogs", "are", "great", "too"],
|
||||
lemmas=["dog", "be", "great", "too"],
|
||||
),
|
||||
]
|
||||
dev_docs = [
|
||||
Doc(en_vocab, words=["Cats", "are", "cute"], lemmas=["cat", "be", "cute"]),
|
||||
Doc(en_vocab, words=["Pets", "are", "great"], lemmas=["pet", "be", "great"]),
|
||||
]
|
||||
with make_tempdir() as d_in:
|
||||
train_bin = DocBin(docs=train_docs)
|
||||
train_bin.to_disk(d_in / "train.spacy")
|
||||
dev_bin = DocBin(docs=dev_docs)
|
||||
dev_bin.to_disk(d_in / "dev.spacy")
|
||||
# `debug data` requires an input pipeline config
|
||||
CliRunner().invoke(
|
||||
app,
|
||||
[
|
||||
"init",
|
||||
"config",
|
||||
f"{d_in}/config.cfg",
|
||||
"--lang",
|
||||
"en",
|
||||
"--pipeline",
|
||||
"trainable_lemmatizer",
|
||||
],
|
||||
)
|
||||
result_debug_data = CliRunner().invoke(
|
||||
app,
|
||||
[
|
||||
"debug",
|
||||
"data",
|
||||
f"{d_in}/config.cfg",
|
||||
"--paths.train",
|
||||
f"{d_in}/train.spacy",
|
||||
"--paths.dev",
|
||||
f"{d_in}/dev.spacy",
|
||||
],
|
||||
)
|
||||
# Instead of checking specific wording of the output, which may change,
|
||||
# we'll check that this section of the debug output is present.
|
||||
assert "= Trainable Lemmatizer =" in result_debug_data.stdout
|
||||
|
||||
|
||||
# project tests
|
||||
|
||||
SAMPLE_PROJECT = {
|
||||
"title": "Sample project",
|
||||
"description": "This is a project for testing",
|
||||
"assets": [
|
||||
{
|
||||
"dest": "assets/spacy-readme.md",
|
||||
"url": "https://github.com/explosion/spaCy/raw/dec81508d28b47f09a06203c472b37f00db6c869/README.md",
|
||||
"checksum": "411b2c89ccf34288fae8ed126bf652f7",
|
||||
},
|
||||
{
|
||||
"dest": "assets/citation.cff",
|
||||
"url": "https://github.com/explosion/spaCy/raw/master/CITATION.cff",
|
||||
"checksum": "c996bfd80202d480eb2e592369714e5e",
|
||||
"extra": True,
|
||||
},
|
||||
],
|
||||
"commands": [
|
||||
{
|
||||
"name": "ok",
|
||||
"help": "print ok",
|
||||
"script": ["python -c \"print('okokok')\""],
|
||||
},
|
||||
{
|
||||
"name": "create",
|
||||
"help": "make a file",
|
||||
"script": ["touch abc.txt"],
|
||||
"outputs": ["abc.txt"],
|
||||
},
|
||||
{
|
||||
"name": "clean",
|
||||
"help": "remove test file",
|
||||
"script": ["rm abc.txt"],
|
||||
},
|
||||
],
|
||||
}
|
||||
|
||||
SAMPLE_PROJECT_TEXT = srsly.yaml_dumps(SAMPLE_PROJECT)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def project_dir():
|
||||
with make_tempdir() as pdir:
|
||||
(pdir / "project.yml").write_text(SAMPLE_PROJECT_TEXT)
|
||||
yield pdir
|
||||
|
||||
|
||||
def test_project_document(project_dir):
|
||||
readme_path = project_dir / "README.md"
|
||||
assert not readme_path.exists(), "README already exists"
|
||||
result = CliRunner().invoke(
|
||||
app, ["project", "document", str(project_dir), "-o", str(readme_path)]
|
||||
)
|
||||
assert result.exit_code == 0
|
||||
assert readme_path.is_file()
|
||||
text = readme_path.read_text("utf-8")
|
||||
assert SAMPLE_PROJECT["description"] in text
|
||||
|
||||
|
||||
def test_project_assets(project_dir):
|
||||
asset_dir = project_dir / "assets"
|
||||
assert not asset_dir.exists(), "Assets dir is already present"
|
||||
result = CliRunner().invoke(app, ["project", "assets", str(project_dir)])
|
||||
assert result.exit_code == 0
|
||||
assert (asset_dir / "spacy-readme.md").is_file(), "Assets not downloaded"
|
||||
# check that extras work
|
||||
result = CliRunner().invoke(app, ["project", "assets", "--extra", str(project_dir)])
|
||||
assert result.exit_code == 0
|
||||
assert (asset_dir / "citation.cff").is_file(), "Extras not downloaded"
|
||||
|
||||
|
||||
def test_project_run(project_dir):
|
||||
# make sure dry run works
|
||||
test_file = project_dir / "abc.txt"
|
||||
result = CliRunner().invoke(
|
||||
app, ["project", "run", "--dry", "create", str(project_dir)]
|
||||
)
|
||||
assert result.exit_code == 0
|
||||
assert not test_file.is_file()
|
||||
result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)])
|
||||
assert result.exit_code == 0
|
||||
assert test_file.is_file()
|
||||
result = CliRunner().invoke(app, ["project", "run", "ok", str(project_dir)])
|
||||
assert result.exit_code == 0
|
||||
assert "okokok" in result.stdout
|
||||
|
||||
|
||||
@pytest.mark.skipif(not has_git(), reason="git not installed")
|
||||
@pytest.mark.parametrize(
|
||||
"options",
|
||||
[
|
||||
"",
|
||||
# "--sparse",
|
||||
"--branch v3",
|
||||
"--repo https://github.com/explosion/projects --branch v3",
|
||||
],
|
||||
)
|
||||
def test_project_clone(options):
|
||||
with make_tempdir() as workspace:
|
||||
out = workspace / "project"
|
||||
target = "benchmarks/ner_conll03"
|
||||
if not options:
|
||||
options = []
|
||||
else:
|
||||
options = options.split()
|
||||
result = CliRunner().invoke(
|
||||
app, ["project", "clone", target, *options, str(out)]
|
||||
)
|
||||
assert result.exit_code == 0
|
||||
assert (out / "README.md").is_file()
|
||||
|
||||
|
||||
def test_project_push_pull(project_dir):
|
||||
proj = dict(SAMPLE_PROJECT)
|
||||
remote = "xyz"
|
||||
|
||||
with make_tempdir() as remote_dir:
|
||||
proj["remotes"] = {remote: str(remote_dir)}
|
||||
proj_text = srsly.yaml_dumps(proj)
|
||||
(project_dir / "project.yml").write_text(proj_text)
|
||||
|
||||
test_file = project_dir / "abc.txt"
|
||||
result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)])
|
||||
assert result.exit_code == 0
|
||||
assert test_file.is_file()
|
||||
result = CliRunner().invoke(app, ["project", "push", remote, str(project_dir)])
|
||||
assert result.exit_code == 0
|
||||
result = CliRunner().invoke(app, ["project", "run", "clean", str(project_dir)])
|
||||
assert result.exit_code == 0
|
||||
assert not test_file.exists()
|
||||
result = CliRunner().invoke(app, ["project", "pull", remote, str(project_dir)])
|
||||
assert result.exit_code == 0
|
||||
assert test_file.is_file()
|
||||
|
||||
|
||||
example_words_1 = ["I", "like", "cats"]
|
||||
example_words_2 = ["I", "like", "dogs"]
|
||||
example_lemmas_1 = ["I", "like", "cat"]
|
||||
|
|
|
@ -275,6 +275,20 @@ def test_displacy_parse_deps(en_vocab):
|
|||
{"start": 2, "end": 3, "label": "det", "dir": "left"},
|
||||
{"start": 1, "end": 3, "label": "attr", "dir": "right"},
|
||||
]
|
||||
# Test that displacy.parse_deps converts Span to Doc
|
||||
deps = displacy.parse_deps(doc[:])
|
||||
assert isinstance(deps, dict)
|
||||
assert deps["words"] == [
|
||||
{"lemma": None, "text": words[0], "tag": pos[0]},
|
||||
{"lemma": None, "text": words[1], "tag": pos[1]},
|
||||
{"lemma": None, "text": words[2], "tag": pos[2]},
|
||||
{"lemma": None, "text": words[3], "tag": pos[3]},
|
||||
]
|
||||
assert deps["arcs"] == [
|
||||
{"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
|
||||
{"start": 2, "end": 3, "label": "det", "dir": "left"},
|
||||
{"start": 1, "end": 3, "label": "attr", "dir": "right"},
|
||||
]
|
||||
|
||||
|
||||
def test_displacy_invalid_arcs():
|
||||
|
|
|
@ -46,7 +46,7 @@ def assert_sents_error(doc):
|
|||
|
||||
def warn_error(proc_name, proc, docs, e):
|
||||
logger = logging.getLogger("spacy")
|
||||
logger.warning(f"Trouble with component {proc_name}.")
|
||||
logger.warning("Trouble with component %s.", proc_name)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
|
|
@ -115,6 +115,14 @@ def test_tokenization(sented_doc):
|
|||
assert scores["token_r"] == approx(0.33333333)
|
||||
assert scores["token_f"] == 0.4
|
||||
|
||||
# per-component scoring
|
||||
scorer = Scorer()
|
||||
scores = scorer.score([example], per_component=True)
|
||||
assert scores["tokenizer"]["token_acc"] == 0.5
|
||||
assert scores["tokenizer"]["token_p"] == 0.5
|
||||
assert scores["tokenizer"]["token_r"] == approx(0.33333333)
|
||||
assert scores["tokenizer"]["token_f"] == 0.4
|
||||
|
||||
|
||||
def test_sents(sented_doc):
|
||||
scorer = Scorer()
|
||||
|
@ -278,6 +286,13 @@ def test_tag_score(tagged_doc):
|
|||
assert results["morph_per_feat"]["Poss"]["f"] == 0.0
|
||||
assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272)
|
||||
|
||||
# per-component scoring
|
||||
scorer = Scorer()
|
||||
results = scorer.score([example], per_component=True)
|
||||
assert results["tagger"]["tag_acc"] == 0.9
|
||||
assert results["morphologizer"]["pos_acc"] == 0.9
|
||||
assert results["morphologizer"]["morph_acc"] == approx(0.8)
|
||||
|
||||
|
||||
def test_partial_annotation(en_tokenizer):
|
||||
pred_doc = en_tokenizer("a b c d e")
|
||||
|
|
78
spacy/tests/training/test_corpus.py
Normal file
78
spacy/tests/training/test_corpus.py
Normal file
|
@ -0,0 +1,78 @@
|
|||
from typing import IO, Generator, Iterable, List, TextIO, Tuple
|
||||
from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
import tempfile
|
||||
|
||||
from spacy.lang.en import English
|
||||
from spacy.training import Example, PlainTextCorpus
|
||||
from spacy.util import make_tempdir
|
||||
|
||||
# Intentional newlines to check that they are skipped.
|
||||
PLAIN_TEXT_DOC = """
|
||||
|
||||
This is a doc. It contains two sentences.
|
||||
This is another doc.
|
||||
|
||||
A third doc.
|
||||
|
||||
"""
|
||||
|
||||
PLAIN_TEXT_DOC_TOKENIZED = [
|
||||
[
|
||||
"This",
|
||||
"is",
|
||||
"a",
|
||||
"doc",
|
||||
".",
|
||||
"It",
|
||||
"contains",
|
||||
"two",
|
||||
"sentences",
|
||||
".",
|
||||
],
|
||||
["This", "is", "another", "doc", "."],
|
||||
["A", "third", "doc", "."],
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("min_length", [0, 5])
|
||||
@pytest.mark.parametrize("max_length", [0, 5])
|
||||
def test_plain_text_reader(min_length, max_length):
|
||||
nlp = English()
|
||||
with _string_to_tmp_file(PLAIN_TEXT_DOC) as file_path:
|
||||
corpus = PlainTextCorpus(
|
||||
file_path, min_length=min_length, max_length=max_length
|
||||
)
|
||||
|
||||
check = [
|
||||
doc
|
||||
for doc in PLAIN_TEXT_DOC_TOKENIZED
|
||||
if len(doc) >= min_length and (max_length == 0 or len(doc) <= max_length)
|
||||
]
|
||||
reference, predicted = _examples_to_tokens(corpus(nlp))
|
||||
|
||||
assert reference == check
|
||||
assert predicted == check
|
||||
|
||||
|
||||
@contextmanager
|
||||
def _string_to_tmp_file(s: str) -> Generator[Path, None, None]:
|
||||
with make_tempdir() as d:
|
||||
file_path = Path(d) / "string.txt"
|
||||
with open(file_path, "w", encoding="utf-8") as f:
|
||||
f.write(s)
|
||||
yield file_path
|
||||
|
||||
|
||||
def _examples_to_tokens(
|
||||
examples: Iterable[Example],
|
||||
) -> Tuple[List[List[str]], List[List[str]]]:
|
||||
reference = []
|
||||
predicted = []
|
||||
|
||||
for eg in examples:
|
||||
reference.append([t.text for t in eg.reference])
|
||||
predicted.append([t.text for t in eg.predicted])
|
||||
|
||||
return reference, predicted
|
|
@ -2,17 +2,19 @@ from pathlib import Path
|
|||
import numpy as np
|
||||
import pytest
|
||||
import srsly
|
||||
from spacy.vocab import Vocab
|
||||
from thinc.api import Config
|
||||
from thinc.api import Config, get_current_ops
|
||||
|
||||
from spacy import util
|
||||
from spacy.lang.en import English
|
||||
from spacy.training.initialize import init_nlp
|
||||
from spacy.training.loop import train
|
||||
from spacy.training.pretrain import pretrain
|
||||
from spacy.tokens import Doc, DocBin
|
||||
from spacy.language import DEFAULT_CONFIG_PRETRAIN_PATH, DEFAULT_CONFIG_PATH
|
||||
from spacy.ml.models.multi_task import create_pretrain_vectors
|
||||
from spacy.vectors import Vectors
|
||||
from spacy.vocab import Vocab
|
||||
from ..util import make_tempdir
|
||||
from ... import util
|
||||
from ...lang.en import English
|
||||
from ...training.initialize import init_nlp
|
||||
from ...training.loop import train
|
||||
from ...training.pretrain import pretrain
|
||||
from ...tokens import Doc, DocBin
|
||||
from ...language import DEFAULT_CONFIG_PRETRAIN_PATH, DEFAULT_CONFIG_PATH
|
||||
|
||||
pretrain_string_listener = """
|
||||
[nlp]
|
||||
|
@ -163,7 +165,8 @@ def test_pretraining_default():
|
|||
|
||||
|
||||
@pytest.mark.parametrize("objective", CHAR_OBJECTIVES)
|
||||
def test_pretraining_tok2vec_characters(objective):
|
||||
@pytest.mark.parametrize("skip_last", (True, False))
|
||||
def test_pretraining_tok2vec_characters(objective, skip_last):
|
||||
"""Test that pretraining works with the character objective"""
|
||||
config = Config().from_str(pretrain_string_listener)
|
||||
config["pretraining"]["objective"] = objective
|
||||
|
@ -176,10 +179,14 @@ def test_pretraining_tok2vec_characters(objective):
|
|||
filled["paths"]["raw_text"] = file_path
|
||||
filled = filled.interpolate()
|
||||
assert filled["pretraining"]["component"] == "tok2vec"
|
||||
pretrain(filled, tmp_dir)
|
||||
pretrain(filled, tmp_dir, skip_last=skip_last)
|
||||
assert Path(tmp_dir / "model0.bin").exists()
|
||||
assert Path(tmp_dir / "model4.bin").exists()
|
||||
assert not Path(tmp_dir / "model5.bin").exists()
|
||||
if skip_last:
|
||||
assert not Path(tmp_dir / "model-last.bin").exists()
|
||||
else:
|
||||
assert Path(tmp_dir / "model-last.bin").exists()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("objective", VECTOR_OBJECTIVES)
|
||||
|
@ -235,6 +242,7 @@ def test_pretraining_tagger_tok2vec(config):
|
|||
pretrain(filled, tmp_dir)
|
||||
assert Path(tmp_dir / "model0.bin").exists()
|
||||
assert Path(tmp_dir / "model4.bin").exists()
|
||||
assert Path(tmp_dir / "model-last.bin").exists()
|
||||
assert not Path(tmp_dir / "model5.bin").exists()
|
||||
|
||||
|
||||
|
@ -346,3 +354,26 @@ def write_vectors_model(tmp_dir):
|
|||
nlp = English(vocab)
|
||||
nlp.to_disk(nlp_path)
|
||||
return str(nlp_path)
|
||||
|
||||
|
||||
def test_pretrain_default_vectors():
|
||||
nlp = English()
|
||||
nlp.add_pipe("tok2vec")
|
||||
nlp.initialize()
|
||||
|
||||
# default vectors are supported
|
||||
nlp.vocab.vectors = Vectors(shape=(10, 10))
|
||||
create_pretrain_vectors(1, 1, "cosine")(nlp.vocab, nlp.get_pipe("tok2vec").model)
|
||||
|
||||
# floret vectors are supported
|
||||
nlp.vocab.vectors = Vectors(
|
||||
data=get_current_ops().xp.zeros((10, 10)), mode="floret", hash_count=1
|
||||
)
|
||||
create_pretrain_vectors(1, 1, "cosine")(nlp.vocab, nlp.get_pipe("tok2vec").model)
|
||||
|
||||
# error for no vectors
|
||||
with pytest.raises(ValueError, match="E875"):
|
||||
nlp.vocab.vectors = Vectors()
|
||||
create_pretrain_vectors(1, 1, "cosine")(
|
||||
nlp.vocab, nlp.get_pipe("tok2vec").model
|
||||
)
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import numpy
|
||||
import tempfile
|
||||
import contextlib
|
||||
import re
|
||||
import srsly
|
||||
from spacy.tokens import Doc
|
||||
from spacy.vocab import Vocab
|
||||
|
@ -95,3 +96,7 @@ def assert_packed_msg_equal(b1, b2):
|
|||
for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())):
|
||||
assert k1 == k2
|
||||
assert v1 == v2
|
||||
|
||||
|
||||
def normalize_whitespace(s):
|
||||
return re.sub(r"\s+", " ", s)
|
||||
|
|
|
@ -834,10 +834,12 @@ cdef class Tokenizer:
|
|||
self.token_match = re.compile(data["token_match"]).match
|
||||
if "url_match" in data and isinstance(data["url_match"], str):
|
||||
self.url_match = re.compile(data["url_match"]).match
|
||||
if "rules" in data and isinstance(data["rules"], dict):
|
||||
self.rules = data["rules"]
|
||||
if "faster_heuristics" in data:
|
||||
self.faster_heuristics = data["faster_heuristics"]
|
||||
# always load rules last so that all other settings are set before the
|
||||
# internal tokenization for the phrase matcher
|
||||
if "rules" in data and isinstance(data["rules"], dict):
|
||||
self.rules = data["rules"]
|
||||
return self
|
||||
|
||||
|
||||
|
|
|
@ -124,6 +124,10 @@ class DocBin:
|
|||
for key, group in doc.spans.items():
|
||||
for span in group:
|
||||
self.strings.add(span.label_)
|
||||
if span.kb_id in span.doc.vocab.strings:
|
||||
self.strings.add(span.kb_id_)
|
||||
if span.id in span.doc.vocab.strings:
|
||||
self.strings.add(span.id_)
|
||||
|
||||
def get_docs(self, vocab: Vocab) -> Iterator[Doc]:
|
||||
"""Recover Doc objects from the annotations, using the given vocab.
|
||||
|
|
|
@ -108,6 +108,7 @@ class Doc:
|
|||
kb_id: Union[int, str] = ...,
|
||||
vector: Optional[Floats1d] = ...,
|
||||
alignment_mode: str = ...,
|
||||
span_id: Union[int, str] = ...,
|
||||
) -> Span: ...
|
||||
def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ...
|
||||
@property
|
||||
|
|
|
@ -528,9 +528,9 @@ cdef class Doc:
|
|||
doc (Doc): The parent document.
|
||||
start_idx (int): The index of the first character of the span.
|
||||
end_idx (int): The index of the first character after the span.
|
||||
label (uint64 or string): A label to attach to the Span, e.g. for
|
||||
label (Union[int, str]): A label to attach to the Span, e.g. for
|
||||
named entities.
|
||||
kb_id (uint64 or string): An ID from a KB to capture the meaning of a
|
||||
kb_id (Union[int, str]): An ID from a KB to capture the meaning of a
|
||||
named entity.
|
||||
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
|
||||
the span.
|
||||
|
@ -539,14 +539,11 @@ cdef class Doc:
|
|||
with token boundaries), "contract" (span of all tokens completely
|
||||
within the character span), "expand" (span of all tokens at least
|
||||
partially covered by the character span). Defaults to "strict".
|
||||
span_id (Union[int, str]): An identifier to associate with the span.
|
||||
RETURNS (Span): The newly constructed object.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#char_span
|
||||
"""
|
||||
if not isinstance(label, int):
|
||||
label = self.vocab.strings.add(label)
|
||||
if not isinstance(kb_id, int):
|
||||
kb_id = self.vocab.strings.add(kb_id)
|
||||
alignment_modes = ("strict", "contract", "expand")
|
||||
if alignment_mode not in alignment_modes:
|
||||
raise ValueError(
|
||||
|
@ -1349,6 +1346,10 @@ cdef class Doc:
|
|||
for group in self.spans.values():
|
||||
for span in group:
|
||||
strings.add(span.label_)
|
||||
if span.kb_id in span.doc.vocab.strings:
|
||||
strings.add(span.kb_id_)
|
||||
if span.id in span.doc.vocab.strings:
|
||||
strings.add(span.id_)
|
||||
# Msgpack doesn't distinguish between lists and tuples, which is
|
||||
# vexing for user data. As a best guess, we *know* that within
|
||||
# keys, we must have tuples. In values we just have to hope
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Any, Dict, Iterator, List, Union
|
||||
from typing import Any, Dict, Iterator, List, Optional, Union
|
||||
from ..vocab import Vocab
|
||||
|
||||
class MorphAnalysis:
|
||||
|
@ -13,7 +13,7 @@ class MorphAnalysis:
|
|||
def __hash__(self) -> int: ...
|
||||
def __eq__(self, other: MorphAnalysis) -> bool: ... # type: ignore[override]
|
||||
def __ne__(self, other: MorphAnalysis) -> bool: ... # type: ignore[override]
|
||||
def get(self, field: Any) -> List[str]: ...
|
||||
def get(self, field: Any, default: Optional[List[str]]) -> List[str]: ...
|
||||
def to_json(self) -> str: ...
|
||||
def to_dict(self) -> Dict[str, str]: ...
|
||||
def __str__(self) -> str: ...
|
||||
|
|
|
@ -58,10 +58,14 @@ cdef class MorphAnalysis:
|
|||
def __ne__(self, other):
|
||||
return self.key != other.key
|
||||
|
||||
def get(self, field):
|
||||
def get(self, field, default=None):
|
||||
"""Retrieve feature values by field."""
|
||||
cdef attr_t field_id = self.vocab.strings.as_int(field)
|
||||
cdef np.ndarray results = get_by_field(&self.c, field_id)
|
||||
if len(results) == 0:
|
||||
if default is None:
|
||||
default = []
|
||||
return default
|
||||
features = [self.vocab.strings[result] for result in results]
|
||||
return [f.split(Morphology.FIELD_SEP)[1] for f in features]
|
||||
|
||||
|
|
|
@ -1,10 +1,12 @@
|
|||
from typing import Callable, Protocol, Iterator, Optional, Union, Tuple, Any, overload
|
||||
from thinc.types import Floats1d, Ints2d, FloatsXd
|
||||
from typing import Any, Callable, Iterator, Optional, Protocol, Tuple, Union, overload
|
||||
|
||||
from thinc.types import Floats1d, FloatsXd, Ints2d
|
||||
|
||||
from ..lexeme import Lexeme
|
||||
from ..vocab import Vocab
|
||||
from .doc import Doc
|
||||
from .token import Token
|
||||
from .underscore import Underscore
|
||||
from ..lexeme import Lexeme
|
||||
from ..vocab import Vocab
|
||||
|
||||
class SpanMethod(Protocol):
|
||||
def __call__(self: Span, *args: Any, **kwargs: Any) -> Any: ... # type: ignore[misc]
|
||||
|
@ -51,7 +53,12 @@ class Span:
|
|||
kb_id: Union[str, int] = ...,
|
||||
span_id: Union[str, int] = ...,
|
||||
) -> None: ...
|
||||
def __richcmp__(self, other: Span, op: int) -> bool: ...
|
||||
def __lt__(self, other: Any) -> bool: ...
|
||||
def __le__(self, other: Any) -> bool: ...
|
||||
def __eq__(self, other: Any) -> bool: ...
|
||||
def __ne__(self, other: Any) -> bool: ...
|
||||
def __gt__(self, other: Any) -> bool: ...
|
||||
def __ge__(self, other: Any) -> bool: ...
|
||||
def __hash__(self) -> int: ...
|
||||
def __len__(self) -> int: ...
|
||||
def __repr__(self) -> str: ...
|
||||
|
@ -98,6 +105,9 @@ class Span:
|
|||
label: Union[int, str] = ...,
|
||||
kb_id: Union[int, str] = ...,
|
||||
vector: Optional[Floats1d] = ...,
|
||||
id: Union[int, str] = ...,
|
||||
alignment_mode: str = ...,
|
||||
span_id: Union[int, str] = ...,
|
||||
) -> Span: ...
|
||||
@property
|
||||
def conjuncts(self) -> Tuple[Token]: ...
|
||||
|
|
|
@ -362,7 +362,7 @@ cdef class Span:
|
|||
result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||
# ensure we get a scalar back (numpy does this automatically but cupy doesn't)
|
||||
return result.item()
|
||||
|
||||
|
||||
cpdef np.ndarray to_array(self, object py_attr_ids):
|
||||
"""Given a list of M attribute IDs, export the tokens to a numpy
|
||||
`ndarray` of shape `(N, M)`, where `N` is the length of the document.
|
||||
|
@ -460,9 +460,12 @@ cdef class Span:
|
|||
start = i
|
||||
if start >= self.end:
|
||||
break
|
||||
if start < self.end:
|
||||
yield Span(self.doc, start, self.end)
|
||||
elif i == self.doc.length - 1:
|
||||
yield Span(self.doc, start, self.doc.length)
|
||||
|
||||
# Ensure that trailing parts of the Span instance are included in last element of .sents.
|
||||
if start == self.doc.length - 1:
|
||||
yield Span(self.doc, start, self.doc.length)
|
||||
|
||||
@property
|
||||
def ents(self):
|
||||
|
@ -639,21 +642,28 @@ cdef class Span:
|
|||
else:
|
||||
return self.doc[root]
|
||||
|
||||
def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0):
|
||||
def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0, alignment_mode="strict", span_id=0):
|
||||
"""Create a `Span` object from the slice `span.text[start : end]`.
|
||||
|
||||
start (int): The index of the first character of the span.
|
||||
end (int): The index of the first character after the span.
|
||||
label (uint64 or string): A label to attach to the Span, e.g. for
|
||||
label (Union[int, str]): A label to attach to the Span, e.g. for
|
||||
named entities.
|
||||
kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity.
|
||||
kb_id (Union[int, str]): An ID from a KB to capture the meaning of a named entity.
|
||||
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
|
||||
the span.
|
||||
id (Union[int, str]): Unused.
|
||||
alignment_mode (str): How character indices are aligned to token
|
||||
boundaries. Options: "strict" (character indices must be aligned
|
||||
with token boundaries), "contract" (span of all tokens completely
|
||||
within the character span), "expand" (span of all tokens at least
|
||||
partially covered by the character span). Defaults to "strict".
|
||||
span_id (Union[int, str]): An identifier to associate with the span.
|
||||
RETURNS (Span): The newly constructed object.
|
||||
"""
|
||||
start_idx += self.c.start_char
|
||||
end_idx += self.c.start_char
|
||||
return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector)
|
||||
return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector, alignment_mode=alignment_mode, span_id=span_id)
|
||||
|
||||
@property
|
||||
def conjuncts(self):
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
from typing import Any, Dict, Iterable, Optional
|
||||
from typing import Any, Dict, Iterable, Iterator, Optional
|
||||
|
||||
from .doc import Doc
|
||||
from .span import Span
|
||||
|
||||
|
@ -18,7 +19,7 @@ class SpanGroup:
|
|||
def doc(self) -> Doc: ...
|
||||
@property
|
||||
def has_overlap(self) -> bool: ...
|
||||
def __iter__(self): ...
|
||||
def __iter__(self) -> Iterator[Span]: ...
|
||||
def __len__(self) -> int: ...
|
||||
def append(self, span: Span) -> None: ...
|
||||
def extend(self, spans: Iterable[Span]) -> None: ...
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from .corpus import Corpus, JsonlCorpus # noqa: F401
|
||||
from .corpus import Corpus, JsonlCorpus, PlainTextCorpus # noqa: F401
|
||||
from .example import Example, validate_examples, validate_get_examples # noqa: F401
|
||||
from .alignment import Alignment # noqa: F401
|
||||
from .augment import dont_augment, orth_variants_augmenter # noqa: F401
|
||||
|
|
|
@ -11,7 +11,7 @@ def create_copy_from_base_model(
|
|||
) -> Callable[[Language], Language]:
|
||||
def copy_from_base_model(nlp):
|
||||
if tokenizer:
|
||||
logger.info(f"Copying tokenizer from: {tokenizer}")
|
||||
logger.info("Copying tokenizer from: %s", tokenizer)
|
||||
base_nlp = load_model(tokenizer)
|
||||
if nlp.config["nlp"]["tokenizer"] == base_nlp.config["nlp"]["tokenizer"]:
|
||||
nlp.tokenizer.from_bytes(base_nlp.tokenizer.to_bytes(exclude=["vocab"]))
|
||||
|
@ -23,7 +23,7 @@ def create_copy_from_base_model(
|
|||
)
|
||||
)
|
||||
if vocab:
|
||||
logger.info(f"Copying vocab from: {vocab}")
|
||||
logger.info("Copying vocab from: %s", vocab)
|
||||
# only reload if the vocab is from a different model
|
||||
if tokenizer != vocab:
|
||||
base_nlp = load_model(vocab)
|
||||
|
|
|
@ -29,7 +29,7 @@ def create_docbin_reader(
|
|||
) -> Callable[["Language"], Iterable[Example]]:
|
||||
if path is None:
|
||||
raise ValueError(Errors.E913)
|
||||
util.logger.debug(f"Loading corpus from path: {path}")
|
||||
util.logger.debug("Loading corpus from path: %s", path)
|
||||
return Corpus(
|
||||
path,
|
||||
gold_preproc=gold_preproc,
|
||||
|
@ -58,6 +58,28 @@ def read_labels(path: Path, *, require: bool = False):
|
|||
return srsly.read_json(path)
|
||||
|
||||
|
||||
@util.registry.readers("spacy.PlainTextCorpus.v1")
|
||||
def create_plain_text_reader(
|
||||
path: Optional[Path],
|
||||
min_length: int = 0,
|
||||
max_length: int = 0,
|
||||
) -> Callable[["Language"], Iterable[Doc]]:
|
||||
"""Iterate Example objects from a file or directory of plain text
|
||||
UTF-8 files with one line per doc.
|
||||
|
||||
path (Path): The directory or filename to read from.
|
||||
min_length (int): Minimum document length (in tokens). Shorter documents
|
||||
will be skipped. Defaults to 0, which indicates no limit.
|
||||
max_length (int): Maximum document length (in tokens). Longer documents will
|
||||
be skipped. Defaults to 0, which indicates no limit.
|
||||
|
||||
DOCS: https://spacy.io/api/corpus#plaintextcorpus
|
||||
"""
|
||||
if path is None:
|
||||
raise ValueError(Errors.E913)
|
||||
return PlainTextCorpus(path, min_length=min_length, max_length=max_length)
|
||||
|
||||
|
||||
def walk_corpus(path: Union[str, Path], file_type) -> List[Path]:
|
||||
path = util.ensure_path(path)
|
||||
if not path.is_dir() and path.parts[-1].endswith(file_type):
|
||||
|
@ -257,3 +279,52 @@ class JsonlCorpus:
|
|||
# We don't *need* an example here, but it seems nice to
|
||||
# make it match the Corpus signature.
|
||||
yield Example(doc, Doc(nlp.vocab, words=words, spaces=spaces))
|
||||
|
||||
|
||||
class PlainTextCorpus:
|
||||
"""Iterate Example objects from a file or directory of plain text
|
||||
UTF-8 files with one line per doc.
|
||||
|
||||
path (Path): The directory or filename to read from.
|
||||
min_length (int): Minimum document length (in tokens). Shorter documents
|
||||
will be skipped. Defaults to 0, which indicates no limit.
|
||||
max_length (int): Maximum document length (in tokens). Longer documents will
|
||||
be skipped. Defaults to 0, which indicates no limit.
|
||||
|
||||
DOCS: https://spacy.io/api/corpus#plaintextcorpus
|
||||
"""
|
||||
|
||||
file_type = "txt"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path: Optional[Union[str, Path]],
|
||||
*,
|
||||
min_length: int = 0,
|
||||
max_length: int = 0,
|
||||
) -> None:
|
||||
self.path = util.ensure_path(path)
|
||||
self.min_length = min_length
|
||||
self.max_length = max_length
|
||||
|
||||
def __call__(self, nlp: "Language") -> Iterator[Example]:
|
||||
"""Yield examples from the data.
|
||||
|
||||
nlp (Language): The current nlp object.
|
||||
YIELDS (Example): The example objects.
|
||||
|
||||
DOCS: https://spacy.io/api/corpus#plaintextcorpus-call
|
||||
"""
|
||||
for loc in walk_corpus(self.path, ".txt"):
|
||||
with open(loc, encoding="utf-8") as f:
|
||||
for text in f:
|
||||
text = text.rstrip("\r\n")
|
||||
if len(text):
|
||||
doc = nlp.make_doc(text)
|
||||
if self.min_length >= 1 and len(doc) < self.min_length:
|
||||
continue
|
||||
elif self.max_length >= 1 and len(doc) > self.max_length:
|
||||
continue
|
||||
# We don't *need* an example here, but it seems nice to
|
||||
# make it match the Corpus signature.
|
||||
yield Example(doc, doc.copy())
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user