Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-02 18:06:46 +03:00)

Compare commits: master...v3.7.0

No commits in common. "master" and "v3.7.0" have entirely different histories.

.github/FUNDING.yml (1 changed line)
@@ -1 +0,0 @@
-custom: [https://explosion.ai/merch, https://explosion.ai/tailored-solutions]

.github/workflows/cibuildwheel.yml (99 changed lines)
@@ -1,99 +0,0 @@
-name: Build
-
-on:
-push:
-tags:
-# ytf did they invent their own syntax that's almost regex?
-# ** matches 'zero or more of any character'
-- 'release-v[0-9]+.[0-9]+.[0-9]+**'
-- 'prerelease-v[0-9]+.[0-9]+.[0-9]+**'
-jobs:
-build_wheels:
-name: Build wheels on ${{ matrix.os }}
-runs-on: ${{ matrix.os }}
-strategy:
-matrix:
-# macos-13 is an intel runner, macos-14 is apple silicon
-os: [ubuntu-latest, windows-latest, macos-13, macos-14, ubuntu-24.04-arm]
-
-steps:
-- uses: actions/checkout@v4
-# aarch64 (arm) is built via qemu emulation
-# QEMU is sadly too slow. We need to wait for public ARM support
-#- name: Set up QEMU
-# if: runner.os == 'Linux'
-# uses: docker/setup-qemu-action@v3
-# with:
-# platforms: all
-- name: Build wheels
-uses: pypa/cibuildwheel@v2.21.3
-env:
-CIBW_ARCHS_LINUX: auto
-with:
-package-dir: .
-output-dir: wheelhouse
-config-file: "{package}/pyproject.toml"
-- uses: actions/upload-artifact@v4
-with:
-name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }}
-path: ./wheelhouse/*.whl
-
-build_sdist:
-name: Build source distribution
-runs-on: ubuntu-latest
-steps:
-- uses: actions/checkout@v4
-
-- name: Build sdist
-run: pipx run build --sdist
-- uses: actions/upload-artifact@v4
-with:
-name: cibw-sdist
-path: dist/*.tar.gz
-create_release:
-needs: [build_wheels, build_sdist]
-runs-on: ubuntu-latest
-permissions:
-contents: write
-checks: write
-actions: read
-issues: read
-packages: write
-pull-requests: read
-repository-projects: read
-statuses: read
-steps:
-- name: Get the tag name and determine if it's a prerelease
-id: get_tag_info
-run: |
-FULL_TAG=${GITHUB_REF#refs/tags/}
-if [[ $FULL_TAG == release-* ]]; then
-TAG_NAME=${FULL_TAG#release-}
-IS_PRERELEASE=false
-elif [[ $FULL_TAG == prerelease-* ]]; then
-TAG_NAME=${FULL_TAG#prerelease-}
-IS_PRERELEASE=true
-else
-echo "Tag does not match expected patterns" >&2
-exit 1
-fi
-echo "FULL_TAG=$TAG_NAME" >> $GITHUB_ENV
-echo "TAG_NAME=$TAG_NAME" >> $GITHUB_ENV
-echo "IS_PRERELEASE=$IS_PRERELEASE" >> $GITHUB_ENV
-- uses: actions/download-artifact@v4
-with:
-# unpacks all CIBW artifacts into dist/
-pattern: cibw-*
-path: dist
-merge-multiple: true
-- name: Create Draft Release
-id: create_release
-uses: softprops/action-gh-release@v2
-if: startsWith(github.ref, 'refs/tags/')
-env:
-GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-with:
-name: ${{ env.TAG_NAME }}
-draft: true
-prerelease: ${{ env.IS_PRERELEASE }}
-files: "./dist/*"

.github/workflows/explosionbot.yml (2 changed lines)
@@ -15,7 +15,7 @@ jobs:
 env:
 GITHUB_CONTEXT: ${{ toJson(github) }}
 run: echo "$GITHUB_CONTEXT"
-- uses: actions/checkout@v4
+- uses: actions/checkout@v3
 - uses: actions/setup-python@v4
 - name: Install and run explosion-bot
 run: |

.github/workflows/lock.yml (2 changed lines)
@@ -16,7 +16,7 @@ jobs:
 if: github.repository_owner == 'explosion'
 runs-on: ubuntu-latest
 steps:
-- uses: dessant/lock-threads@v5
+- uses: dessant/lock-threads@v4
 with:
 process-only: 'issues'
 issue-inactive-days: '30'

.github/workflows/publish_pypi.yml (29 changed lines)
@@ -1,29 +0,0 @@
-# The cibuildwheel action triggers on creation of a release, this
-# triggers on publication.
-# The expected workflow is to create a draft release and let the wheels
-# upload, and then hit 'publish', which uploads to PyPi.
-
-on:
-release:
-types:
-- published
-
-jobs:
-upload_pypi:
-runs-on: ubuntu-latest
-environment:
-name: pypi
-url: https://pypi.org/p/spacy
-permissions:
-id-token: write
-contents: read
-if: github.event_name == 'release' && github.event.action == 'published'
-# or, alternatively, upload to PyPI on every tag starting with 'v' (remove on: release above to use this)
-# if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
-steps:
-- uses: robinraju/release-downloader@v1
-with:
-tag: ${{ github.event.release.tag_name }}
-fileName: '*'
-out-file-path: 'dist'
-- uses: pypa/gh-action-pypi-publish@release/v1

@@ -14,7 +14,7 @@ jobs:
 runs-on: ubuntu-latest
 steps:
 - name: Checkout
-uses: actions/checkout@v4
+uses: actions/checkout@v3
 with:
 ref: ${{ matrix.branch }}
 - name: Get commits from past 24 hours

.github/workflows/spacy_universe_alert.yml (2 changed lines)
@@ -18,7 +18,7 @@ jobs:
 run: |
 echo "$GITHUB_CONTEXT"
 
-- uses: actions/checkout@v4
+- uses: actions/checkout@v3
 - uses: actions/setup-python@v4
 with:
 python-version: '3.10'

.github/workflows/tests.yml (37 changed lines)
@@ -2,8 +2,6 @@ name: tests
 
 on:
 push:
-tags-ignore:
-- '**'
 branches-ignore:
 - "spacy.io"
 - "nightly.spacy.io"

@@ -12,6 +10,7 @@ on:
 - "*.md"
 - "*.mdx"
 - "website/**"
+- ".github/workflows/**"
 pull_request:
 types: [opened, synchronize, reopened, edited]
 paths-ignore:

@@ -26,12 +25,13 @@ jobs:
 runs-on: ubuntu-latest
 steps:
 - name: Check out repo
-uses: actions/checkout@v4
+uses: actions/checkout@v3
 
 - name: Configure Python version
 uses: actions/setup-python@v4
 with:
-python-version: "3.10"
+python-version: "3.7"
+architecture: x64
 
 - name: black
 run: |

@@ -45,12 +45,11 @@ jobs:
 run: |
 python -m pip install flake8==5.0.4
 python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
-# Unfortunately cython-lint isn't working after the shift to Cython 3.
-#- name: cython-lint
-# run: |
-# python -m pip install cython-lint -c requirements.txt
-# # E501: line too log, W291: trailing whitespace, E266: too many leading '#' for block comment
-# cython-lint spacy --ignore E501,W291,E266
+- name: cython-lint
+run: |
+python -m pip install cython-lint -c requirements.txt
+# E501: line too log, W291: trailing whitespace, E266: too many leading '#' for block comment
+cython-lint spacy --ignore E501,W291,E266
 
 tests:
 name: Test

@@ -59,18 +58,28 @@ jobs:
 fail-fast: true
 matrix:
 os: [ubuntu-latest, windows-latest, macos-latest]
-python_version: ["3.9", "3.12", "3.13"]
+python_version: ["3.11", "3.12.0-rc.2"]
+include:
+- os: windows-latest
+python_version: "3.7"
+- os: macos-latest
+python_version: "3.8"
+- os: ubuntu-latest
+python_version: "3.9"
+- os: windows-latest
+python_version: "3.10"
 
 runs-on: ${{ matrix.os }}
 
 steps:
 - name: Check out repo
-uses: actions/checkout@v4
+uses: actions/checkout@v3
 
 - name: Configure Python version
 uses: actions/setup-python@v4
 with:
 python-version: ${{ matrix.python_version }}
+architecture: x64
 
 - name: Install dependencies
 run: |

@@ -148,9 +157,7 @@ jobs:
 - name: "Test assemble CLI"
 run: |
 python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-python -m spacy assemble ner_source_sm.cfg output_dir
-env:
-PYTHONWARNINGS: "error,ignore::DeprecationWarning"
+PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
 if: matrix.python_version == '3.9'
 
 - name: "Test assemble CLI vectors warning"

.github/workflows/universe_validation.yml (3 changed lines)
@@ -20,12 +20,13 @@ jobs:
 runs-on: ubuntu-latest
 steps:
 - name: Check out repo
-uses: actions/checkout@v4
+uses: actions/checkout@v3
 
 - name: Configure Python version
 uses: actions/setup-python@v4
 with:
 python-version: "3.7"
+architecture: x64
 
 - name: Validate website/meta/universe.json
 run: |

@@ -35,7 +35,7 @@ so that more people can benefit from it.
 
 When opening an issue, use a **descriptive title** and include your
 **environment** (operating system, Python version, spaCy version). Our
-[issue templates](https://github.com/explosion/spaCy/issues/new/choose) help you
+[issue template](https://github.com/explosion/spaCy/issues/new) helps you
 remember the most important details to include. If you've discovered a bug, you
 can also submit a [regression test](#fixing-bugs) straight away. When you're
 opening an issue to report the bug, simply refer to your pull request in the

@@ -449,12 +449,13 @@ and plugins in spaCy v3.0, and we can't wait to see what you build with it!
 [`spacy`](https://github.com/topics/spacy?o=desc&s=stars) and
 [`spacy-extensions`](https://github.com/topics/spacy-extension?o=desc&s=stars)
 to make it easier to find. Those are also the topics we're linking to from the
-spaCy website. If you're sharing your project on X, feel free to tag
-[@spacy_io](https://x.com/spacy_io) so we can check it out.
+spaCy website. If you're sharing your project on Twitter, feel free to tag
+[@spacy_io](https://twitter.com/spacy_io) so we can check it out.
 
-- Once your extension is published, you can open a
-[PR](https://github.com/explosion/spaCy/pulls) to suggest it for the
-[Universe](https://spacy.io/universe) page.
+- Once your extension is published, you can open an issue on the
+[issue tracker](https://github.com/explosion/spacy/issues) to suggest it for the
+[resources directory](https://spacy.io/usage/resources#extensions) on the
+website.
 
 📖 **For more tips and best practices, see the [checklist for developing spaCy extensions](https://spacy.io/usage/processing-pipelines#extensions).**
 

LICENSE (2 changed lines)
@@ -1,6 +1,6 @@
 The MIT License (MIT)
 
-Copyright (C) 2016-2024 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
+Copyright (C) 2016-2022 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

@@ -4,6 +4,5 @@ include README.md
 include pyproject.toml
 include spacy/py.typed
 recursive-include spacy/cli *.yml
-recursive-include spacy/tests *.json
 recursive-include licenses *
 recursive-exclude spacy *.cpp

README.md (23 changed lines)
@@ -16,7 +16,7 @@ model packaging, deployment and workflow management. spaCy is commercial
 open-source software, released under the
 [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
 
-💫 **Version 3.8 out now!**
+💫 **Version 3.7 out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)
 
 [](https://github.com/explosion/spaCy/actions/workflows/tests.yml)

@@ -28,6 +28,7 @@ open-source software, released under the
 <br />
 [](https://pypi.org/project/spacy/)
 [](https://anaconda.org/conda-forge/spacy)
+[](https://twitter.com/spacy_io)
 
 ## 📖 Documentation
 

@@ -38,37 +39,28 @@ open-source software, released under the
 | 🚀 **[New in v3.0]** | New features, backwards incompatibilities and migration guide. |
 | 🪐 **[Project Templates]** | End-to-end workflows you can clone, modify and run. |
 | 🎛 **[API Reference]** | The detailed reference for spaCy's API. |
-| ⏩ **[GPU Processing]** | Use spaCy with CUDA-compatible GPU processing. |
 | 📦 **[Models]** | Download trained pipelines for spaCy. |
-| 🦙 **[Large Language Models]** | Integrate LLMs into spaCy pipelines. |
 | 🌌 **[Universe]** | Plugins, extensions, demos and books from the spaCy ecosystem. |
 | ⚙️ **[spaCy VS Code Extension]** | Additional tooling and features for working with spaCy's config files. |
 | 👩🏫 **[Online Course]** | Learn spaCy in this free and interactive online course. |
-| 📰 **[Blog]** | Read about current spaCy and Prodigy development, releases, talks and more from Explosion. |
 | 📺 **[Videos]** | Our YouTube channel with video tutorials, talks and more. |
-| 🔴 **[Live Stream]** | Join Matt as he works on spaCy and chat about NLP, live every week. |
 | 🛠 **[Changelog]** | Changes and version history. |
 | 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
-| 👕 **[Swag]** | Support us and our work with unique, custom-designed swag! |
-| <a href="https://explosion.ai/tailored-solutions"><img src="https://github.com/explosion/spaCy/assets/13643239/36d2a42e-98c0-4599-90e1-788ef75181be" width="150" alt="Tailored Solutions"/></a> | Custom NLP consulting, implementation and strategic advice by spaCy’s core development team. Streamlined, production-ready, predictable and maintainable. Send us an email or take our 5-minute questionnaire, and well'be in touch! **[Learn more →](https://explosion.ai/tailored-solutions)** |
+| <a href="https://explosion.ai/spacy-tailored-pipelines"><img src="https://user-images.githubusercontent.com/13643239/152853098-1c761611-ccb0-4ec6-9066-b234552831fe.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
+| <a href="https://explosion.ai/spacy-tailored-analysis"><img src="https://user-images.githubusercontent.com/1019791/206151300-b00cd189-e503-4797-aa1e-1bb6344062c5.png" width="125" alt="spaCy Tailored Pipelines"/></a> | Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-analysis)** |
 
 [spacy 101]: https://spacy.io/usage/spacy-101
 [new in v3.0]: https://spacy.io/usage/v3
 [usage guides]: https://spacy.io/usage/
 [api reference]: https://spacy.io/api/
-[gpu processing]: https://spacy.io/usage#gpu
 [models]: https://spacy.io/models
-[large language models]: https://spacy.io/usage/large-language-models
 [universe]: https://spacy.io/universe
 [spacy vs code extension]: https://github.com/explosion/spacy-vscode
 [videos]: https://www.youtube.com/c/ExplosionAI
-[live stream]: https://www.youtube.com/playlist?list=PLBmcuObd5An5_iAxNYLJa_xWmNzsYce8c
 [online course]: https://course.spacy.io
-[blog]: https://explosion.ai
 [project templates]: https://github.com/explosion/projects
 [changelog]: https://spacy.io/usage#changelog
 [contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
-[swag]: https://explosion.ai/merch
 
 ## 💬 Where to ask questions
 

@@ -80,14 +72,13 @@ more people can benefit from it.
 | Type | Platforms |
 | ------------------------------- | --------------------------------------- |
 | 🚨 **Bug Reports** | [GitHub Issue Tracker] |
-| 🎁 **Feature Requests & Ideas** | [GitHub Discussions] · [Live Stream] |
+| 🎁 **Feature Requests & Ideas** | [GitHub Discussions] |
 | 👩💻 **Usage Questions** | [GitHub Discussions] · [Stack Overflow] |
-| 🗯 **General Discussion** | [GitHub Discussions] · [Live Stream] |
+| 🗯 **General Discussion** | [GitHub Discussions] |
 
 [github issue tracker]: https://github.com/explosion/spaCy/issues
 [github discussions]: https://github.com/explosion/spaCy/discussions
 [stack overflow]: https://stackoverflow.com/questions/tagged/spacy
-[live stream]: https://www.youtube.com/playlist?list=PLBmcuObd5An5_iAxNYLJa_xWmNzsYce8c
 
 ## Features
 

@@ -117,7 +108,7 @@ For detailed installation instructions, see the
 
 - **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual
 Studio)
-- **Python version**: Python >=3.7, <3.13 (only 64 bit)
+- **Python version**: Python 3.7+ (only 64 bit)
 - **Package managers**: [pip] · [conda] (via `conda-forge`)
 
 [pip]: https://pypi.org/project/spacy/

@@ -1,20 +0,0 @@
-#!/usr/bin/env bash
-
-set -e
-
-# Insist repository is clean
-git diff-index --quiet HEAD
-
-version=$(grep "__version__ = " spacy/about.py)
-version=${version/__version__ = }
-version=${version/\'/}
-version=${version/\'/}
-version=${version/\"/}
-version=${version/\"/}
-
-echo "Pushing release-v"$version
-
-git tag -d release-v$version || true
-git push origin :release-v$version || true
-git tag release-v$version
-git push origin release-v$version

@@ -1,2 +1,6 @@
 # build version constraints for use with wheelwright
-numpy>=2.0.0,<3.0.0
+numpy==1.15.0; python_version=='3.7' and platform_machine!='aarch64'
+numpy==1.19.2; python_version=='3.7' and platform_machine=='aarch64'
+numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
+numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
+numpy>=1.25.0; python_version>='3.9'

@@ -158,45 +158,3 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
-
-
-SciPy
------
-
-* Files: scorer.py
-
-The implementation of trapezoid() is adapted from SciPy, which is distributed
-under the following license:
-
-New BSD License
-
-Copyright (c) 2001-2002 Enthought, Inc. 2003-2023, SciPy Developers.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above
-copyright notice, this list of conditions and the following
-disclaimer in the documentation and/or other materials provided
-with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its
-contributors may be used to endorse or promote products derived
-from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

@@ -1,67 +1,15 @@
 [build-system]
 requires = [
 "setuptools",
-"cython>=3.0,<4.0",
+"cython>=0.25,<3.0",
 "cymem>=2.0.2,<2.1.0",
 "preshed>=3.0.2,<3.1.0",
 "murmurhash>=0.28.0,<1.1.0",
-"thinc>=8.3.4,<8.4.0",
-"numpy>=2.0.0,<3.0.0"
+"thinc>=8.1.8,<8.3.0",
+"numpy>=1.15.0; python_version < '3.9'",
+"numpy>=1.25.0; python_version >= '3.9'",
 ]
 build-backend = "setuptools.build_meta"
 
-[tool.cibuildwheel]
-build = "*"
-skip = "pp* cp36* cp37* cp38* *-win32 *i686*"
-test-skip = ""
-free-threaded-support = false
-
-archs = ["native"]
-
-build-frontend = "default"
-config-settings = {}
-dependency-versions = "pinned"
-environment = { PIP_CONSTRAINT = "build-constraints.txt" }
-
-environment-pass = []
-build-verbosity = 0
-
-before-all = "curl https://sh.rustup.rs -sSf | sh -s -- -y --profile minimal --default-toolchain stable"
-before-build = "pip install -r requirements.txt && python setup.py clean"
-repair-wheel-command = ""
-
-test-command = ""
-before-test = ""
-test-requires = []
-test-extras = []
-
-container-engine = "docker"
-
-manylinux-x86_64-image = "manylinux2014"
-manylinux-i686-image = "manylinux2014"
-manylinux-aarch64-image = "manylinux2014"
-manylinux-ppc64le-image = "manylinux2014"
-manylinux-s390x-image = "manylinux2014"
-manylinux-pypy_x86_64-image = "manylinux2014"
-manylinux-pypy_i686-image = "manylinux2014"
-manylinux-pypy_aarch64-image = "manylinux2014"
-
-musllinux-x86_64-image = "musllinux_1_2"
-musllinux-i686-image = "musllinux_1_2"
-musllinux-aarch64-image = "musllinux_1_2"
-musllinux-ppc64le-image = "musllinux_1_2"
-musllinux-s390x-image = "musllinux_1_2"
-
-[tool.cibuildwheel.linux]
-repair-wheel-command = "auditwheel repair -w {dest_dir} {wheel}"
-
-[tool.cibuildwheel.macos]
-repair-wheel-command = "delocate-wheel --require-archs {delocate_archs} -w {dest_dir} -v {wheel}"
-
-[tool.cibuildwheel.windows]
-
-[tool.cibuildwheel.pyodide]
-
 [tool.isort]
 profile = "black"

@@ -3,26 +3,31 @@ spacy-legacy>=3.0.11,<3.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.3.4,<8.4.0
+thinc>=8.1.8,<8.3.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
 srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
-typer-slim>=0.3.0,<1.0.0
-weasel>=0.1.0,<0.5.0
+typer>=0.3.0,<0.10.0
+pathy>=0.10.0
+smart-open>=5.2.1,<7.0.0
+weasel>=0.1.0,<0.4.0
 # Third party dependencies
-numpy>=2.0.0,<3.0.0
+numpy>=1.15.0; python_version < "3.9"
+numpy>=1.19.0; python_version >= "3.9"
 requests>=2.13.0,<3.0.0
 tqdm>=4.38.0,<5.0.0
 pydantic>=1.7.4,!=1.8,!=1.8.1,<3.0.0
 jinja2
+langcodes>=3.2.0,<4.0.0
 # Official Python utilities
 setuptools
 packaging>=20.0
+typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
 # Development dependencies
 pre-commit>=2.13.0
-cython>=3.0,<4.0
+cython>=0.25,<3.0
 pytest>=5.2.0,!=7.1.0
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0

setup.cfg (26 changed lines)
@@ -17,11 +17,11 @@ classifiers =
 Operating System :: Microsoft :: Windows
 Programming Language :: Cython
 Programming Language :: Python :: 3
+Programming Language :: Python :: 3.7
+Programming Language :: Python :: 3.8
 Programming Language :: Python :: 3.9
 Programming Language :: Python :: 3.10
 Programming Language :: Python :: 3.11
-Programming Language :: Python :: 3.12
-Programming Language :: Python :: 3.13
 Topic :: Scientific/Engineering
 project_urls =
 Release notes = https://github.com/explosion/spaCy/releases

@@ -30,18 +30,18 @@ project_urls =
 [options]
 zip_safe = false
 include_package_data = true
-python_requires = >=3.9,<3.14
+python_requires = >=3.7
 # NOTE: This section is superseded by pyproject.toml and will be removed in
 # spaCy v4
 setup_requires =
-cython>=3.0,<4.0
-numpy>=2.0.0,<3.0.0; python_version < "3.9"
-numpy>=2.0.0,<3.0.0; python_version >= "3.9"
+cython>=0.25,<3.0
+numpy>=1.15.0; python_version < "3.9"
+numpy>=1.19.0; python_version >= "3.9"
 # We also need our Cython packages here to compile against
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
 murmurhash>=0.28.0,<1.1.0
-thinc>=8.3.4,<8.4.0
+thinc>=8.1.8,<8.3.0
 install_requires =
 # Our libraries
 spacy-legacy>=3.0.11,<3.1.0

@@ -49,13 +49,15 @@ install_requires =
 murmurhash>=0.28.0,<1.1.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.3.4,<8.4.0
+thinc>=8.1.8,<8.3.0
 wasabi>=0.9.1,<1.2.0
 srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
-weasel>=0.1.0,<0.5.0
+weasel>=0.1.0,<0.4.0
 # Third-party dependencies
-typer-slim>=0.3.0,<1.0.0
+typer>=0.3.0,<0.10.0
+pathy>=0.10.0
+smart-open>=5.2.1,<7.0.0
 tqdm>=4.38.0,<5.0.0
 numpy>=1.15.0; python_version < "3.9"
 numpy>=1.19.0; python_version >= "3.9"

@@ -65,6 +67,8 @@ install_requires =
 # Official Python utilities
 setuptools
 packaging>=20.0
+typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
+langcodes>=3.2.0,<4.0.0
 
 [options.entry_points]
 console_scripts =

@@ -114,7 +118,7 @@ cuda12x =
 cuda-autodetect =
 cupy-wheel>=11.0.0,<13.0.0
 apple =
-thinc-apple-ops>=1.0.0,<2.0.0
+thinc-apple-ops>=0.1.0.dev0,<1.0.0
 # Language tokenizers with external dependencies
 ja =
 sudachipy>=0.5.2,!=0.6.1

@@ -13,11 +13,9 @@ from thinc.api import Config, prefer_gpu, require_cpu, require_gpu # noqa: F401
 from . import pipeline # noqa: F401
 from . import util
 from .about import __version__ # noqa: F401
-from .cli.info import info # noqa: F401
 from .errors import Errors
 from .glossary import explain # noqa: F401
 from .language import Language
-from .registrations import REGISTRY_POPULATED, populate_registry
 from .util import logger, registry # noqa: F401
 from .vocab import Vocab
 

@@ -78,3 +76,9 @@ def blank(
 # We should accept both dot notation and nested dict here for consistency
 config = util.dot_to_dict(config)
 return LangClass.from_config(config, vocab=vocab, meta=meta)
+
+
+def info(*args, **kwargs):
+from .cli.info import info as cli_info
+
+return cli_info(*args, **kwargs)

@@ -1,5 +1,5 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.8.7"
+__version__ = "3.7.0"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

@@ -1,7 +1,5 @@
 from wasabi import msg
 
-# Needed for testing
-from . import download as download_module # noqa: F401
 from ._util import app, setup_cli # noqa: F401
 from .apply import apply # noqa: F401
 from .assemble import assemble_cli # noqa: F401

@@ -24,17 +22,8 @@ from .init_pipeline import init_pipeline_cli # noqa: F401
 from .package import package # noqa: F401
 from .pretrain import pretrain # noqa: F401
 from .profile import profile # noqa: F401
-from .project.assets import project_assets # type: ignore[attr-defined] # noqa: F401
-from .project.clone import project_clone # type: ignore[attr-defined] # noqa: F401
-from .project.document import ( # type: ignore[attr-defined] # noqa: F401
-project_document,
-)
-from .project.dvc import project_update_dvc # type: ignore[attr-defined] # noqa: F401
-from .project.pull import project_pull # type: ignore[attr-defined] # noqa: F401
-from .project.push import project_push # type: ignore[attr-defined] # noqa: F401
-from .project.run import project_run # type: ignore[attr-defined] # noqa: F401
-from .train import train_cli # type: ignore[attr-defined] # noqa: F401
-from .validate import validate # type: ignore[attr-defined] # noqa: F401
+from .train import train_cli # noqa: F401
+from .validate import validate # noqa: F401
 
 
 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True)

@@ -41,6 +41,10 @@ from ..util import (
 run_command,
 )
 
+if TYPE_CHECKING:
+from pathy import FluidPath # noqa: F401
+
+
 SDIST_SUFFIX = ".tar.gz"
 WHEEL_SUFFIX = "-py3-none-any.whl"
 

@@ -13,7 +13,7 @@ from .. import util
 from ..language import Language
 from ..tokens import Doc
 from ..training import Corpus
-from ._util import Arg, Opt, benchmark_cli, import_code, setup_gpu
+from ._util import Arg, Opt, benchmark_cli, setup_gpu
 
 
 @benchmark_cli.command(

@@ -30,14 +30,12 @@ def benchmark_speed_cli(
 use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
 n_batches: int = Opt(50, "--batches", help="Minimum number of batches to benchmark", min=30,),
 warmup_epochs: int = Opt(3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup"),
-code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
 # fmt: on
 ):
 """
 Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark
 data in the binary .spacy format.
 """
-import_code(code_path)
 setup_gpu(use_gpu=use_gpu, silent=False)
 
 nlp = util.load_model(model)

@@ -173,5 +171,5 @@ def print_outliers(sample: numpy.ndarray):
 def warmup(
 nlp: Language, docs: List[Doc], warmup_epochs: int, batch_size: Optional[int]
 ) -> numpy.ndarray:
-docs = [doc.copy() for doc in docs * warmup_epochs]
+docs = warmup_epochs * docs
 return annotate(nlp, docs, batch_size)

@@ -170,7 +170,7 @@ def debug_model(
 msg.divider(f"STEP 3 - prediction")
 msg.info(str(prediction))
 
-msg.good(f"Successfully ended analysis - model looks good.")
+msg.good(f"Succesfully ended analysis - model looks good.")
 
 
 def _sentences():

@@ -1,6 +1,5 @@
 import sys
 from typing import Optional, Sequence
-from urllib.parse import urljoin
 
 import requests
 import typer

@@ -8,14 +7,7 @@ from wasabi import msg
 
 from .. import about
 from ..errors import OLD_MODEL_SHORTCUTS
-from ..util import (
-get_minor_version,
-is_in_interactive,
-is_in_jupyter,
-is_package,
-is_prerelease_version,
-run_command,
-)
+from ..util import get_minor_version, is_package, is_prerelease_version, run_command
 from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app
 
 

@@ -64,13 +56,6 @@ def download(
 )
 pip_args = pip_args + ("--no-deps",)
 if direct:
-# Reject model names with '/', in order to prevent shenanigans.
-if "/" in model:
-msg.fail(
-title="Model download rejected",
-text=f"Cannot download model '{model}'. Models are expected to be file names, not URLs or fragments",
-exits=True,
-)
 components = model.split("-")
 model_name = "".join(components[:-1])
 version = components[-1]

@@ -92,27 +77,6 @@ def download(
 "Download and installation successful",
 f"You can now load the package via spacy.load('{model_name}')",
 )
-if is_in_jupyter():
-reload_deps_msg = (
-"If you are in a Jupyter or Colab notebook, you may need to "
-"restart Python in order to load all the package's dependencies. "
-"You can do this by selecting the 'Restart kernel' or 'Restart "
-"runtime' option."
-)
-msg.warn(
-"Restart to reload dependencies",
-reload_deps_msg,
-)
-elif is_in_interactive():
-reload_deps_msg = (
-"If you are in an interactive Python session, you may need to "
-"exit and restart Python to load all the package's dependencies. "
-"You can exit with Ctrl-D (or Ctrl-Z and Enter on Windows)."
-)
-msg.warn(
-"Restart to reload dependencies",
-reload_deps_msg,
-)
 
 
 def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:

@@ -161,16 +125,7 @@ def get_latest_version(model: str) -> str:
 def download_model(
 filename: str, user_pip_args: Optional[Sequence[str]] = None
 ) -> None:
-# Construct the download URL carefully. We need to make sure we don't
-# allow relative paths or other shenanigans to trick us into download
-# from outside our own repo.
-base_url = about.__download_url__
-# urljoin requires that the path ends with /, or the last path part will be dropped
-if not base_url.endswith("/"):
-base_url = about.__download_url__ + "/"
-download_url = urljoin(base_url, filename)
-if not download_url.startswith(about.__download_url__):
-raise ValueError(f"Download from {filename} rejected. Was it a relative path?")
+download_url = about.__download_url__ + "/" + filename
 pip_args = list(user_pip_args) if user_pip_args is not None else []
 cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url]
 run_command(cmd)

@@ -39,7 +39,7 @@ def find_threshold_cli(
 # fmt: on
 ):
 """
-Runs prediction trials for a trained model with varying thresholds to maximize
+Runs prediction trials for a trained model with varying tresholds to maximize
 the specified metric. The search space for the threshold is traversed linearly
 from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout`
 (the corresponding API call to `spacy.cli.find_threshold.find_threshold()`

@@ -81,7 +81,7 @@ def find_threshold(
 silent: bool = True,
 ) -> Tuple[float, float, Dict[float, float]]:
 """
-Runs prediction trials for models with varying thresholds to maximize the specified metric.
+Runs prediction trials for models with varying tresholds to maximize the specified metric.
 model (Union[str, Path]): Pipeline to evaluate. Can be a package or a path to a data directory.
 data_path (Path): Path to file with DocBin with docs to use for threshold search.
 pipe_name (str): Name of pipe to examine thresholds for.

@ -1,7 +1,5 @@
|
||||||
import os
|
|
||||||
import re
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
|
||||||
import sys
|
import sys
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
@ -13,7 +11,6 @@ from thinc.api import Config
|
||||||
from wasabi import MarkdownRenderer, Printer, get_raw_input
|
from wasabi import MarkdownRenderer, Printer, get_raw_input
|
||||||
|
|
||||||
from .. import about, util
|
from .. import about, util
|
||||||
from ..compat import importlib_metadata
|
|
||||||
from ..schemas import ModelMetaSchema, validate
|
from ..schemas import ModelMetaSchema, validate
|
||||||
from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app, string_to_list
|
from ._util import SDIST_SUFFIX, WHEEL_SUFFIX, Arg, Opt, app, string_to_list
|
||||||
|
|
||||||
|
@ -30,7 +27,6 @@ def package_cli(
|
||||||
version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"),
|
version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"),
|
||||||
build: str = Opt("sdist", "--build", "-b", help="Comma-separated formats to build: sdist and/or wheel, or none."),
|
build: str = Opt("sdist", "--build", "-b", help="Comma-separated formats to build: sdist and/or wheel, or none."),
|
||||||
force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing data in output directory"),
|
force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing data in output directory"),
|
||||||
require_parent: bool = Opt(True, "--require-parent/--no-require-parent", "-R", "-R", help="Include the parent package (e.g. spacy) in the requirements"),
|
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
|
@ -39,7 +35,7 @@ def package_cli(
|
||||||
specified output directory, and the data will be copied over. If
|
specified output directory, and the data will be copied over. If
|
||||||
--create-meta is set and a meta.json already exists in the output directory,
|
--create-meta is set and a meta.json already exists in the output directory,
|
||||||
the existing values will be used as the defaults in the command-line prompt.
|
the existing values will be used as the defaults in the command-line prompt.
|
||||||
After packaging, "python -m build --sdist" is run in the package directory,
|
After packaging, "python setup.py sdist" is run in the package directory,
|
||||||
which will create a .tar.gz archive that can be installed via "pip install".
|
which will create a .tar.gz archive that can be installed via "pip install".
|
||||||
|
|
||||||
If additional code files are provided (e.g. Python files containing custom
|
If additional code files are provided (e.g. Python files containing custom
|
||||||
|
@ -61,7 +57,6 @@ def package_cli(
|
||||||
create_sdist=create_sdist,
|
create_sdist=create_sdist,
|
||||||
create_wheel=create_wheel,
|
create_wheel=create_wheel,
|
||||||
force=force,
|
force=force,
|
-        require_parent=require_parent,
         silent=False,
     )

@@ -76,7 +71,6 @@ def package(
     create_meta: bool = False,
     create_sdist: bool = True,
     create_wheel: bool = False,
-    require_parent: bool = False,
     force: bool = False,
     silent: bool = True,
 ) -> None:

@@ -84,17 +78,9 @@ def package(
     input_path = util.ensure_path(input_dir)
     output_path = util.ensure_path(output_dir)
     meta_path = util.ensure_path(meta_path)
-    if create_wheel and not has_wheel() and not has_build():
-        err = (
-            "Generating wheels requires 'build' or 'wheel' (deprecated) to be installed"
-        )
-        msg.fail(err, "pip install build", exits=1)
-    if not has_build():
-        msg.warn(
-            "Generating packages without the 'build' package is deprecated and "
-            "will not be supported in the future. To install 'build': pip "
-            "install build"
-        )
+    if create_wheel and not has_wheel():
+        err = "Generating a binary .whl file requires wheel to be installed"
+        msg.fail(err, "pip install wheel", exits=1)
     if not input_path or not input_path.exists():
         msg.fail("Can't locate pipeline data", input_path, exits=1)
     if not output_path or not output_path.exists():

@@ -116,7 +102,7 @@ def package(
     if not meta_path.exists() or not meta_path.is_file():
         msg.fail("Can't load pipeline meta.json", meta_path, exits=1)
     meta = srsly.read_json(meta_path)
-    meta = get_meta(input_dir, meta, require_parent=require_parent)
+    meta = get_meta(input_dir, meta)
     if meta["requirements"]:
         msg.good(
             f"Including {len(meta['requirements'])} package requirement(s) from "

@@ -189,7 +175,6 @@ def package(
         imports.append(code_path.stem)
         shutil.copy(str(code_path), str(package_path))
     create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
 
     create_file(main_path / "setup.py", TEMPLATE_SETUP)
     create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
     init_py = TEMPLATE_INIT.format(

@@ -199,37 +184,12 @@ def package(
     msg.good(f"Successfully created package directory '{model_name_v}'", main_path)
     if create_sdist:
         with util.working_dir(main_path):
-            # run directly, since util.run_command is not designed to continue
-            # after a command fails
-            ret = subprocess.run(
-                [sys.executable, "-m", "build", ".", "--sdist"],
-                env=os.environ.copy(),
-            )
-            if ret.returncode != 0:
-                msg.warn(
-                    "Creating sdist with 'python -m build' failed. Falling "
-                    "back to deprecated use of 'python setup.py sdist'"
-                )
-                util.run_command([sys.executable, "setup.py", "sdist"], capture=False)
+            util.run_command([sys.executable, "setup.py", "sdist"], capture=False)
         zip_file = main_path / "dist" / f"{model_name_v}{SDIST_SUFFIX}"
         msg.good(f"Successfully created zipped Python package", zip_file)
     if create_wheel:
         with util.working_dir(main_path):
-            # run directly, since util.run_command is not designed to continue
-            # after a command fails
-            ret = subprocess.run(
-                [sys.executable, "-m", "build", ".", "--wheel"],
-                env=os.environ.copy(),
-            )
-            if ret.returncode != 0:
-                msg.warn(
-                    "Creating wheel with 'python -m build' failed. Falling "
-                    "back to deprecated use of 'wheel' with "
-                    "'python setup.py bdist_wheel'"
-                )
-                util.run_command(
-                    [sys.executable, "setup.py", "bdist_wheel"], capture=False
-                )
+            util.run_command([sys.executable, "setup.py", "bdist_wheel"], capture=False)
         wheel_name_squashed = re.sub("_+", "_", model_name_v)
         wheel = main_path / "dist" / f"{wheel_name_squashed}{WHEEL_SUFFIX}"
         msg.good(f"Successfully created binary wheel", wheel)

@@ -249,17 +209,6 @@ def has_wheel() -> bool:
         return False
 
 
-def has_build() -> bool:
-    # it's very likely that there is a local directory named build/ (especially
-    # in an editable install), so an import check is not sufficient; instead
-    # check that there is a package version
-    try:
-        importlib_metadata.version("build")
-        return True
-    except importlib_metadata.PackageNotFoundError:  # type: ignore[attr-defined]
-        return False
-
-
 def get_third_party_dependencies(
     config: Config, exclude: List[str] = util.SimpleFrozenList()
 ) -> List[str]:
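Note on the hunks above: on the master side, sdist and wheel creation go through `python -m build` first and only fall back to the deprecated `setup.py` entry points when that fails. A minimal, self-contained sketch of the same fallback pattern, assuming it runs inside the package directory (the `build_artifact` helper name is made up for illustration, not part of the spaCy source):

    import os
    import subprocess
    import sys

    def build_artifact(kind: str = "--sdist") -> None:
        # Prefer the modern 'build' frontend (pip install build) ...
        ret = subprocess.run(
            [sys.executable, "-m", "build", ".", kind], env=os.environ.copy()
        )
        if ret.returncode != 0:
            # ... and fall back to the legacy setuptools invocation.
            legacy = "sdist" if kind == "--sdist" else "bdist_wheel"
            subprocess.run([sys.executable, "setup.py", legacy], check=True)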
@@ -306,8 +255,6 @@ def get_third_party_dependencies(
             modules.add(func_info["module"].split(".")[0])  # type: ignore[union-attr]
     dependencies = []
     for module_name in modules:
-        if module_name == about.__title__:
-            continue
         if module_name in distributions:
             dist = distributions.get(module_name)
             if dist:

@@ -338,9 +285,7 @@ def create_file(file_path: Path, contents: str) -> None:
 
 
 def get_meta(
-    model_path: Union[str, Path],
-    existing_meta: Dict[str, Any],
-    require_parent: bool = False,
+    model_path: Union[str, Path], existing_meta: Dict[str, Any]
 ) -> Dict[str, Any]:
     meta: Dict[str, Any] = {
         "lang": "en",

@@ -369,8 +314,6 @@ def get_meta(
     existing_reqs = [util.split_requirement(req)[0] for req in meta["requirements"]]
     reqs = get_third_party_dependencies(nlp.config, exclude=existing_reqs)
     meta["requirements"].extend(reqs)
-    if require_parent and about.__title__ not in meta["requirements"]:
-        meta["requirements"].append(about.__title__ + meta["spacy_version"])
     return meta
 
 

@@ -545,11 +488,8 @@ def list_files(data_dir):
 
 
 def list_requirements(meta):
-    # Up to version 3.7, we included the parent package
-    # in requirements by default. This behaviour is removed
-    # in 3.8, with a setting to include the parent package in
-    # the requirements list in the meta if desired.
-    requirements = []
+    parent_package = meta.get('parent_package', 'spacy')
+    requirements = [parent_package + meta['spacy_version']]
     if 'setup_requires' in meta:
         requirements += meta['setup_requires']
     if 'requirements' in meta:

@@ -1 +0,0 @@
-from weasel.cli.assets import *

@@ -1 +0,0 @@
-from weasel.cli.clone import *

@@ -1 +0,0 @@
-from weasel.cli.document import *

@@ -1 +0,0 @@
-from weasel.cli.dvc import *

@@ -1 +0,0 @@
-from weasel.cli.pull import *

@@ -1 +0,0 @@
-from weasel.cli.push import *

@@ -1 +0,0 @@
-from weasel.cli.remote_storage import *

@@ -1 +0,0 @@
-from weasel.cli.run import *

@@ -271,9 +271,8 @@ grad_factor = 1.0
 @layers = "reduce_mean.v1"
 
 [components.textcat.model.linear_model]
-@architectures = "spacy.TextCatBOW.v3"
+@architectures = "spacy.TextCatBOW.v2"
 exclusive_classes = true
-length = 262144
 ngram_size = 1
 no_output_layer = false
 

@@ -309,9 +308,8 @@ grad_factor = 1.0
 @layers = "reduce_mean.v1"
 
 [components.textcat_multilabel.model.linear_model]
-@architectures = "spacy.TextCatBOW.v3"
+@architectures = "spacy.TextCatBOW.v2"
 exclusive_classes = false
-length = 262144
 ngram_size = 1
 no_output_layer = false
 

@@ -544,15 +542,14 @@ nO = null
 width = ${components.tok2vec.model.encode.width}
 
 [components.textcat.model.linear_model]
-@architectures = "spacy.TextCatBOW.v3"
+@architectures = "spacy.TextCatBOW.v2"
 exclusive_classes = true
-length = 262144
 ngram_size = 1
 no_output_layer = false
 
 {% else -%}
 [components.textcat.model]
-@architectures = "spacy.TextCatBOW.v3"
+@architectures = "spacy.TextCatBOW.v2"
 exclusive_classes = true
 ngram_size = 1
 no_output_layer = false

@@ -573,17 +570,15 @@ nO = null
 width = ${components.tok2vec.model.encode.width}
 
 [components.textcat_multilabel.model.linear_model]
-@architectures = "spacy.TextCatBOW.v3"
+@architectures = "spacy.TextCatBOW.v2"
 exclusive_classes = false
-length = 262144
 ngram_size = 1
 no_output_layer = false
 
 {% else -%}
 [components.textcat_multilabel.model]
-@architectures = "spacy.TextCatBOW.v3"
+@architectures = "spacy.TextCatBOW.v2"
 exclusive_classes = false
-length = 262144
 ngram_size = 1
 no_output_layer = false
 {%- endif %}

@@ -142,25 +142,7 @@ class SpanRenderer:
         spans (list): Individual entity spans and their start, end, label, kb_id and kb_url.
         title (str / None): Document title set in Doc.user_data['title'].
         """
-        per_token_info = self._assemble_per_token_info(tokens, spans)
-        markup = self._render_markup(per_token_info)
-        markup = TPL_SPANS.format(content=markup, dir=self.direction)
-        if title:
-            markup = TPL_TITLE.format(title=title) + markup
-        return markup
-
-    @staticmethod
-    def _assemble_per_token_info(
-        tokens: List[str], spans: List[Dict[str, Any]]
-    ) -> List[Dict[str, List[Dict[str, Any]]]]:
-        """Assembles token info used to generate markup in render_spans().
-        tokens (List[str]): Tokens in text.
-        spans (List[Dict[str, Any]]): Spans in text.
-        RETURNS (List[Dict[str, List[Dict, str, Any]]]): Per token info needed to render HTML markup for given tokens
-        and spans.
-        """
-        per_token_info: List[Dict[str, List[Dict[str, Any]]]] = []
-
+        per_token_info = []
         # we must sort so that we can correctly describe when spans need to "stack"
         # which is determined by their start token, then span length (longer spans on top),
         # then break any remaining ties with the span label

@@ -172,22 +154,21 @@ class SpanRenderer:
                 s["label"],
             ),
         )
 
         for s in spans:
             # this is the vertical 'slot' that the span will be rendered in
             # vertical_position = span_label_offset + (offset_step * (slot - 1))
             s["render_slot"] = 0
 
         for idx, token in enumerate(tokens):
             # Identify if a token belongs to a Span (and which) and if it's a
             # start token of said Span. We'll use this for the final HTML render
             token_markup: Dict[str, Any] = {}
             token_markup["text"] = token
-            intersecting_spans: List[Dict[str, Any]] = []
+            concurrent_spans = 0
             entities = []
             for span in spans:
                 ent = {}
                 if span["start_token"] <= idx < span["end_token"]:
+                    concurrent_spans += 1
                     span_start = idx == span["start_token"]
                     ent["label"] = span["label"]
                     ent["is_start"] = span_start

@@ -195,12 +176,7 @@ class SpanRenderer:
                     # When the span starts, we need to know how many other
                    # spans are on the 'span stack' and will be rendered.
                    # This value becomes the vertical render slot for this entire span
-                    span["render_slot"] = (
-                        intersecting_spans[-1]["render_slot"]
-                        if len(intersecting_spans)
-                        else 0
-                    ) + 1
-                    intersecting_spans.append(span)
+                    span["render_slot"] = concurrent_spans
                     ent["render_slot"] = span["render_slot"]
                     kb_id = span.get("kb_id", "")
                     kb_url = span.get("kb_url", "#")

@@ -217,8 +193,11 @@ class SpanRenderer:
                         span["render_slot"] = 0
             token_markup["entities"] = entities
             per_token_info.append(token_markup)
-
-        return per_token_info
+        markup = self._render_markup(per_token_info)
+        markup = TPL_SPANS.format(content=markup, dir=self.direction)
+        if title:
+            markup = TPL_TITLE.format(title=title) + markup
+        return markup
 
     def _render_markup(self, per_token_info: List[Dict[str, Any]]) -> str:
         """Render the markup from per-token information"""
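For orientation, the renderer whose internals differ above backs displacy's "span" visualization style. A minimal usage sketch (the example text and the "ORG" label are illustrative, not taken from the diff):

    import spacy
    from spacy import displacy
    from spacy.tokens import Span

    nlp = spacy.blank("en")
    doc = nlp("Welcome to the Bank of China")
    # Put one span into the default "sc" span group used by the span renderer.
    doc.spans["sc"] = [Span(doc, 3, 6, "ORG")]
    # Returns an HTML string rendered by SpanRenderer.
    html = displacy.render(doc, style="span")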
@@ -220,7 +220,6 @@ class Warnings(metaclass=ErrorsWithCodes):
             "key attribute for vectors, configure it through Vectors(attr=) or "
             "'spacy init vectors --attr'")
     W126 = ("These keys are unsupported: {unsupported}")
-    W127 = ("Not all `Language.pipe` worker processes completed successfully")
 
 
 class Errors(metaclass=ErrorsWithCodes):

@@ -228,6 +227,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E002 = ("Can't find factory for '{name}' for language {lang} ({lang_code}). "
             "This usually happens when spaCy calls `nlp.{method}` with a custom "
             "component name that's not registered on the current language class. "
+            "If you're using a Transformer, make sure to install 'spacy-transformers'. "
             "If you're using a custom component, make sure you've added the "
             "decorator `@Language.component` (for function components) or "
             "`@Language.factory` (for class components).\n\nAvailable "

@@ -984,10 +984,6 @@ class Errors(metaclass=ErrorsWithCodes):
              "predicted docs when training {component}.")
     E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
              "but only callbacks with one or three parameters are supported")
-    E1056 = ("The `TextCatBOW` architecture expects a length of at least 1, was {length}.")
-    E1057 = ("The `TextCatReduce` architecture must be used with at least one "
-             "reduction. Please enable one of `use_reduce_first`, "
-             "`use_reduce_last`, `use_reduce_max` or `use_reduce_mean`.")
 
 
 # Deprecated model shortcuts, only used in errors and warnings

@@ -1,11 +1,3 @@
 from .candidate import Candidate, get_candidates, get_candidates_batch
 from .kb import KnowledgeBase
 from .kb_in_memory import InMemoryLookupKB
-
-__all__ = [
-    "Candidate",
-    "KnowledgeBase",
-    "InMemoryLookupKB",
-    "get_candidates",
-    "get_candidates_batch",
-]
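For reference, the names re-exported in the hunk above are part of spaCy's public knowledge-base API. A minimal sketch of constructing the in-memory KB (the entity/alias values and vector length are arbitrary illustration data, not from the diff):

    import spacy
    from spacy.kb import InMemoryLookupKB

    nlp = spacy.blank("en")
    kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=3)
    # Register one entity and one alias that can resolve to it.
    kb.add_entity(entity="Q42", freq=12, entity_vector=[1.0, 2.0, 3.0])
    kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[1.0])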
@ -1,16 +0,0 @@
|
||||||
from ...language import BaseDefaults, Language
|
|
||||||
from .lex_attrs import LEX_ATTRS
|
|
||||||
from .stop_words import STOP_WORDS
|
|
||||||
|
|
||||||
|
|
||||||
class TibetanDefaults(BaseDefaults):
|
|
||||||
lex_attr_getters = LEX_ATTRS
|
|
||||||
stop_words = STOP_WORDS
|
|
||||||
|
|
||||||
|
|
||||||
class Tibetan(Language):
|
|
||||||
lang = "bo"
|
|
||||||
Defaults = TibetanDefaults
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Tibetan"]
|
|
|
@ -1,16 +0,0 @@
|
||||||
"""
|
|
||||||
Example sentences to test spaCy and its language models.
|
|
||||||
|
|
||||||
>>> from spacy.lang.bo.examples import sentences
|
|
||||||
>>> docs = nlp.pipe(sentences)
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
sentences = [
|
|
||||||
"དོན་དུ་རྒྱ་མཚོ་བླ་མ་ཞེས་བྱ་ཞིང༌།",
|
|
||||||
"ཏཱ་ལའི་ཞེས་པ་ནི་སོག་སྐད་ཡིན་པ་དེ་བོད་སྐད་དུ་རྒྱ་མཚོའི་དོན་དུ་འཇུག",
|
|
||||||
"སོག་པོ་ཨལ་ཐན་རྒྱལ་པོས་རྒྱལ་དབང་བསོད་ནམས་རྒྱ་མཚོར་ཆེ་བསྟོད་ཀྱི་མཚན་གསོལ་བ་ཞིག་ཡིན་ཞིང༌།",
|
|
||||||
"རྗེས་སུ་རྒྱལ་བ་དགེ་འདུན་གྲུབ་དང༌། དགེ་འདུན་རྒྱ་མཚོ་སོ་སོར་ཡང་ཏཱ་ལའི་བླ་མའི་སྐུ་ཕྲེང་དང་པོ་དང༌།",
|
|
||||||
"གཉིས་པའི་མཚན་དེ་གསོལ་ཞིང༌།༸རྒྱལ་དབང་སྐུ་ཕྲེང་ལྔ་པས་དགའ་ལྡན་ཕོ་བྲང་གི་སྲིད་དབང་བཙུགས་པ་ནས་ཏཱ་ལའི་བླ་མ་ནི་བོད་ཀྱི་ཆོས་སྲིད་གཉིས་ཀྱི་དབུ་ཁྲིད་དུ་གྱུར་ཞིང་།",
|
|
||||||
"ད་ལྟའི་བར་ཏཱ་ལའི་བླ་མ་སྐུ་ཕྲེང་བཅུ་བཞི་བྱོན་ཡོད།",
|
|
||||||
]
|
|
|
@ -1,65 +0,0 @@
|
||||||
from ...attrs import LIKE_NUM
|
|
||||||
|
|
||||||
# reference 1: https://en.wikipedia.org/wiki/Tibetan_numerals
|
|
||||||
|
|
||||||
_num_words = [
|
|
||||||
"ཀླད་ཀོར་",
|
|
||||||
"གཅིག་",
|
|
||||||
"གཉིས་",
|
|
||||||
"གསུམ་",
|
|
||||||
"བཞི་",
|
|
||||||
"ལྔ་",
|
|
||||||
"དྲུག་",
|
|
||||||
"བདུན་",
|
|
||||||
"བརྒྱད་",
|
|
||||||
"དགུ་",
|
|
||||||
"བཅུ་",
|
|
||||||
"བཅུ་གཅིག་",
|
|
||||||
"བཅུ་གཉིས་",
|
|
||||||
"བཅུ་གསུམ་",
|
|
||||||
"བཅུ་བཞི་",
|
|
||||||
"བཅུ་ལྔ་",
|
|
||||||
"བཅུ་དྲུག་",
|
|
||||||
"བཅུ་བདུན་",
|
|
||||||
"བཅུ་པརྒྱད",
|
|
||||||
"བཅུ་དགུ་",
|
|
||||||
"ཉི་ཤུ་",
|
|
||||||
"སུམ་ཅུ",
|
|
||||||
"བཞི་བཅུ",
|
|
||||||
"ལྔ་བཅུ",
|
|
||||||
"དྲུག་ཅུ",
|
|
||||||
"བདུན་ཅུ",
|
|
||||||
"བརྒྱད་ཅུ",
|
|
||||||
"དགུ་བཅུ",
|
|
||||||
"བརྒྱ་",
|
|
||||||
"སྟོང་",
|
|
||||||
"ཁྲི་",
|
|
||||||
"ས་ཡ་",
|
|
||||||
" བྱེ་བ་",
|
|
||||||
"དུང་ཕྱུར་",
|
|
||||||
"ཐེར་འབུམ་",
|
|
||||||
"ཐེར་འབུམ་ཆེན་པོ་",
|
|
||||||
"ཁྲག་ཁྲིག་",
|
|
||||||
"ཁྲག་ཁྲིག་ཆེན་པོ་",
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def like_num(text):
|
|
||||||
"""
|
|
||||||
Check if text resembles a number
|
|
||||||
"""
|
|
||||||
if text.startswith(("+", "-", "±", "~")):
|
|
||||||
text = text[1:]
|
|
||||||
text = text.replace(",", "").replace(".", "")
|
|
||||||
if text.isdigit():
|
|
||||||
return True
|
|
||||||
if text.count("/") == 1:
|
|
||||||
num, denom = text.split("/")
|
|
||||||
if num.isdigit() and denom.isdigit():
|
|
||||||
return True
|
|
||||||
if text in _num_words:
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
LEX_ATTRS = {LIKE_NUM: like_num}
|
|
|
@ -1,198 +0,0 @@
|
||||||
# Source: https://zenodo.org/records/10148636
|
|
||||||
|
|
||||||
STOP_WORDS = set(
|
|
||||||
"""
|
|
||||||
འི་
|
|
||||||
།
|
|
||||||
དུ་
|
|
||||||
གིས་
|
|
||||||
སོགས་
|
|
||||||
ཏེ
|
|
||||||
གི་
|
|
||||||
རྣམས་
|
|
||||||
ནི
|
|
||||||
ཀུན་
|
|
||||||
ཡི་
|
|
||||||
འདི
|
|
||||||
ཀྱི་
|
|
||||||
སྙེད་
|
|
||||||
པས་
|
|
||||||
གཞན་
|
|
||||||
ཀྱིས་
|
|
||||||
ཡི
|
|
||||||
ལ
|
|
||||||
ནི་
|
|
||||||
དང་
|
|
||||||
སོགས
|
|
||||||
ཅིང་
|
|
||||||
ར
|
|
||||||
དུ
|
|
||||||
མི་
|
|
||||||
སུ་
|
|
||||||
བཅས་
|
|
||||||
ཡོངས་
|
|
||||||
ལས
|
|
||||||
ཙམ་
|
|
||||||
གྱིས་
|
|
||||||
དེ་
|
|
||||||
ཡང་
|
|
||||||
མཐའ་དག་
|
|
||||||
ཏུ་
|
|
||||||
ཉིད་
|
|
||||||
ས
|
|
||||||
ཏེ་
|
|
||||||
གྱི་
|
|
||||||
སྤྱི
|
|
||||||
དེ
|
|
||||||
ཀ་
|
|
||||||
ཡིན་
|
|
||||||
ཞིང་
|
|
||||||
འདི་
|
|
||||||
རུང་
|
|
||||||
རང་
|
|
||||||
ཞིག་
|
|
||||||
སྟེ
|
|
||||||
སྟེ་
|
|
||||||
ན་རེ
|
|
||||||
ངམ
|
|
||||||
ཤིང་
|
|
||||||
དག་
|
|
||||||
ཏོ
|
|
||||||
རེ་
|
|
||||||
འང་
|
|
||||||
ཀྱང་
|
|
||||||
ལགས་པ
|
|
||||||
ཚུ
|
|
||||||
དོ
|
|
||||||
ཡིན་པ
|
|
||||||
རེ
|
|
||||||
ན་རེ་
|
|
||||||
ཨེ་
|
|
||||||
ཚང་མ
|
|
||||||
ཐམས་ཅད་
|
|
||||||
དམ་
|
|
||||||
འོ་
|
|
||||||
ཅིག་
|
|
||||||
གྱིན་
|
|
||||||
ཡིན
|
|
||||||
ན
|
|
||||||
ཁོ་ན་
|
|
||||||
འམ་
|
|
||||||
ཀྱིན་
|
|
||||||
ལོ
|
|
||||||
ཀྱིས
|
|
||||||
བས་
|
|
||||||
ལགས་
|
|
||||||
ཤིག
|
|
||||||
གིས
|
|
||||||
ཀི་
|
|
||||||
སྣ་ཚོགས་
|
|
||||||
རྣམས
|
|
||||||
སྙེད་པ
|
|
||||||
ཡིས་
|
|
||||||
གྱི
|
|
||||||
གི
|
|
||||||
བམ་
|
|
||||||
ཤིག་
|
|
||||||
རེ་རེ་
|
|
||||||
ནམ
|
|
||||||
མིན་
|
|
||||||
ནམ་
|
|
||||||
ངམ་
|
|
||||||
རུ་
|
|
||||||
འགའ་
|
|
||||||
ཀུན
|
|
||||||
ཤས་
|
|
||||||
ཏུ
|
|
||||||
ཡིས
|
|
||||||
གིན་
|
|
||||||
གམ་
|
|
||||||
འོ
|
|
||||||
ཡིན་པ་
|
|
||||||
མིན
|
|
||||||
ལགས
|
|
||||||
གྱིས
|
|
||||||
ཅང་
|
|
||||||
འགའ
|
|
||||||
སམ་
|
|
||||||
ཞིག
|
|
||||||
འང
|
|
||||||
ལས་ཆེ་
|
|
||||||
འཕྲལ་
|
|
||||||
བར་
|
|
||||||
རུ
|
|
||||||
དང
|
|
||||||
ཡ
|
|
||||||
འག
|
|
||||||
སམ
|
|
||||||
ཀ
|
|
||||||
ཅུང་ཟད་
|
|
||||||
ཅིག
|
|
||||||
ཉིད
|
|
||||||
དུ་མ
|
|
||||||
མ
|
|
||||||
ཡིན་བ
|
|
||||||
འམ
|
|
||||||
མམ
|
|
||||||
དམ
|
|
||||||
དག
|
|
||||||
ཁོ་ན
|
|
||||||
ཀྱི
|
|
||||||
ལམ
|
|
||||||
ཕྱི་
|
|
||||||
ནང་
|
|
||||||
ཙམ
|
|
||||||
ནོ་
|
|
||||||
སོ་
|
|
||||||
རམ་
|
|
||||||
བོ་
|
|
||||||
ཨང་
|
|
||||||
ཕྱི
|
|
||||||
ཏོ་
|
|
||||||
ཚོ
|
|
||||||
ལ་ལ་
|
|
||||||
ཚོ་
|
|
||||||
ཅིང
|
|
||||||
མ་གི་
|
|
||||||
གེ
|
|
||||||
གོ
|
|
||||||
ཡིན་ལུགས་
|
|
||||||
རོ་
|
|
||||||
བོ
|
|
||||||
ལགས་པ་
|
|
||||||
པས
|
|
||||||
རབ་
|
|
||||||
འི
|
|
||||||
རམ
|
|
||||||
བས
|
|
||||||
གཞན
|
|
||||||
སྙེད་པ་
|
|
||||||
འབའ་
|
|
||||||
མཾ་
|
|
||||||
པོ
|
|
||||||
ག་
|
|
||||||
ག
|
|
||||||
གམ
|
|
||||||
སྤྱི་
|
|
||||||
བམ
|
|
||||||
མོ་
|
|
||||||
ཙམ་པ་
|
|
||||||
ཤ་སྟག་
|
|
||||||
མམ་
|
|
||||||
རེ་རེ
|
|
||||||
སྙེད
|
|
||||||
ཏམ་
|
|
||||||
ངོ
|
|
||||||
གྲང་
|
|
||||||
ཏ་རེ
|
|
||||||
ཏམ
|
|
||||||
ཁ་
|
|
||||||
ངེ་
|
|
||||||
ཅོག་
|
|
||||||
རིལ་
|
|
||||||
ཉུང་ཤས་
|
|
||||||
གིང་
|
|
||||||
ཚ་
|
|
||||||
ཀྱང
|
|
||||||
""".split()
|
|
||||||
)
|
|
|
@@ -6,8 +6,7 @@ _num_words = [
     "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
     "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "thirty", "forty",
     "fifty", "sixty", "seventy", "eighty", "ninety", "hundred", "thousand",
-    "million", "billion", "trillion", "quadrillion", "quintillion", "sextillion",
-    "septillion", "octillion", "nonillion", "decillion", "gajillion", "bazillion"
+    "million", "billion", "trillion", "quadrillion", "gajillion", "bazillion"
 ]
 _ordinal_words = [
     "first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth",

@@ -15,8 +14,7 @@ _ordinal_words = [
     "fifteenth", "sixteenth", "seventeenth", "eighteenth", "nineteenth",
     "twentieth", "thirtieth", "fortieth", "fiftieth", "sixtieth", "seventieth",
     "eightieth", "ninetieth", "hundredth", "thousandth", "millionth", "billionth",
-    "trillionth", "quadrillionth", "quintillionth", "sextillionth", "septillionth",
-    "octillionth", "nonillionth", "decillionth", "gajillionth", "bazillionth"
+    "trillionth", "quadrillionth", "gajillionth", "bazillionth"
 ]
 # fmt: on
 
@ -1,18 +0,0 @@
|
||||||
from ...language import BaseDefaults, Language
|
|
||||||
from ..punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|
||||||
|
|
||||||
|
|
||||||
class FaroeseDefaults(BaseDefaults):
|
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
|
||||||
infixes = TOKENIZER_INFIXES
|
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
|
||||||
prefixes = TOKENIZER_PREFIXES
|
|
||||||
|
|
||||||
|
|
||||||
class Faroese(Language):
|
|
||||||
lang = "fo"
|
|
||||||
Defaults = FaroeseDefaults
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Faroese"]
|
|
|
@ -1,90 +0,0 @@
|
||||||
from ...symbols import ORTH
|
|
||||||
from ...util import update_exc
|
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
|
||||||
|
|
||||||
_exc = {}
|
|
||||||
|
|
||||||
for orth in [
|
|
||||||
"apr.",
|
|
||||||
"aug.",
|
|
||||||
"avgr.",
|
|
||||||
"árg.",
|
|
||||||
"ávís.",
|
|
||||||
"beinl.",
|
|
||||||
"blkv.",
|
|
||||||
"blaðkv.",
|
|
||||||
"blm.",
|
|
||||||
"blaðm.",
|
|
||||||
"bls.",
|
|
||||||
"blstj.",
|
|
||||||
"blaðstj.",
|
|
||||||
"des.",
|
|
||||||
"eint.",
|
|
||||||
"febr.",
|
|
||||||
"fyrrv.",
|
|
||||||
"góðk.",
|
|
||||||
"h.m.",
|
|
||||||
"innt.",
|
|
||||||
"jan.",
|
|
||||||
"kl.",
|
|
||||||
"m.a.",
|
|
||||||
"mðr.",
|
|
||||||
"mió.",
|
|
||||||
"nr.",
|
|
||||||
"nto.",
|
|
||||||
"nov.",
|
|
||||||
"nút.",
|
|
||||||
"o.a.",
|
|
||||||
"o.a.m.",
|
|
||||||
"o.a.tíl.",
|
|
||||||
"o.fl.",
|
|
||||||
"ff.",
|
|
||||||
"o.m.a.",
|
|
||||||
"o.o.",
|
|
||||||
"o.s.fr.",
|
|
||||||
"o.tíl.",
|
|
||||||
"o.ø.",
|
|
||||||
"okt.",
|
|
||||||
"omf.",
|
|
||||||
"pst.",
|
|
||||||
"ritstj.",
|
|
||||||
"sbr.",
|
|
||||||
"sms.",
|
|
||||||
"smst.",
|
|
||||||
"smb.",
|
|
||||||
"sb.",
|
|
||||||
"sbrt.",
|
|
||||||
"sp.",
|
|
||||||
"sept.",
|
|
||||||
"spf.",
|
|
||||||
"spsk.",
|
|
||||||
"t.e.",
|
|
||||||
"t.s.",
|
|
||||||
"t.s.s.",
|
|
||||||
"tlf.",
|
|
||||||
"tel.",
|
|
||||||
"tsk.",
|
|
||||||
"t.o.v.",
|
|
||||||
"t.d.",
|
|
||||||
"uml.",
|
|
||||||
"ums.",
|
|
||||||
"uppl.",
|
|
||||||
"upprfr.",
|
|
||||||
"uppr.",
|
|
||||||
"útg.",
|
|
||||||
"útl.",
|
|
||||||
"útr.",
|
|
||||||
"vanl.",
|
|
||||||
"v.",
|
|
||||||
"v.h.",
|
|
||||||
"v.ø.o.",
|
|
||||||
"viðm.",
|
|
||||||
"viðv.",
|
|
||||||
"vm.",
|
|
||||||
"v.m.",
|
|
||||||
]:
|
|
||||||
_exc[orth] = [{ORTH: orth}]
|
|
||||||
capitalized = orth.capitalize()
|
|
||||||
_exc[capitalized] = [{ORTH: capitalized}]
|
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
|
|
@ -1,18 +0,0 @@
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
from ...language import BaseDefaults, Language
|
|
||||||
from .stop_words import STOP_WORDS
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|
||||||
|
|
||||||
|
|
||||||
class ScottishDefaults(BaseDefaults):
|
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
|
||||||
stop_words = STOP_WORDS
|
|
||||||
|
|
||||||
|
|
||||||
class Scottish(Language):
|
|
||||||
lang = "gd"
|
|
||||||
Defaults = ScottishDefaults
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Scottish"]
|
|
|
@ -1,388 +0,0 @@
|
||||||
STOP_WORDS = set(
|
|
||||||
"""
|
|
||||||
'ad
|
|
||||||
'ar
|
|
||||||
'd # iad
|
|
||||||
'g # ag
|
|
||||||
'ga
|
|
||||||
'gam
|
|
||||||
'gan
|
|
||||||
'gar
|
|
||||||
'gur
|
|
||||||
'm # am
|
|
||||||
'n # an
|
|
||||||
'n seo
|
|
||||||
'na
|
|
||||||
'nad
|
|
||||||
'nam
|
|
||||||
'nan
|
|
||||||
'nar
|
|
||||||
'nuair
|
|
||||||
'nur
|
|
||||||
's
|
|
||||||
'sa
|
|
||||||
'san
|
|
||||||
'sann
|
|
||||||
'se
|
|
||||||
'sna
|
|
||||||
a
|
|
||||||
a'
|
|
||||||
a'd # agad
|
|
||||||
a'm # agam
|
|
||||||
a-chèile
|
|
||||||
a-seo
|
|
||||||
a-sin
|
|
||||||
a-siud
|
|
||||||
a chionn
|
|
||||||
a chionn 's
|
|
||||||
a chèile
|
|
||||||
a chéile
|
|
||||||
a dh'
|
|
||||||
a h-uile
|
|
||||||
a seo
|
|
||||||
ac' # aca
|
|
||||||
aca
|
|
||||||
aca-san
|
|
||||||
acasan
|
|
||||||
ach
|
|
||||||
ag
|
|
||||||
agad
|
|
||||||
agad-sa
|
|
||||||
agads'
|
|
||||||
agadsa
|
|
||||||
agaibh
|
|
||||||
agaibhse
|
|
||||||
againn
|
|
||||||
againne
|
|
||||||
agam
|
|
||||||
agam-sa
|
|
||||||
agams'
|
|
||||||
agamsa
|
|
||||||
agus
|
|
||||||
aice
|
|
||||||
aice-se
|
|
||||||
aicese
|
|
||||||
aig
|
|
||||||
aig' # aige
|
|
||||||
aige
|
|
||||||
aige-san
|
|
||||||
aigesan
|
|
||||||
air
|
|
||||||
air-san
|
|
||||||
air neo
|
|
||||||
airsan
|
|
||||||
am
|
|
||||||
an
|
|
||||||
an seo
|
|
||||||
an sin
|
|
||||||
an siud
|
|
||||||
an uair
|
|
||||||
ann
|
|
||||||
ann a
|
|
||||||
ann a'
|
|
||||||
ann a shin
|
|
||||||
ann am
|
|
||||||
ann an
|
|
||||||
annad
|
|
||||||
annam
|
|
||||||
annam-s'
|
|
||||||
annamsa
|
|
||||||
anns
|
|
||||||
anns an
|
|
||||||
annta
|
|
||||||
aon
|
|
||||||
ar
|
|
||||||
as
|
|
||||||
asad
|
|
||||||
asda
|
|
||||||
asta
|
|
||||||
b'
|
|
||||||
bho
|
|
||||||
bhon
|
|
||||||
bhuaidhe # bhuaithe
|
|
||||||
bhuainn
|
|
||||||
bhuaipe
|
|
||||||
bhuaithe
|
|
||||||
bhuapa
|
|
||||||
bhur
|
|
||||||
brì
|
|
||||||
bu
|
|
||||||
c'à
|
|
||||||
car son
|
|
||||||
carson
|
|
||||||
cha
|
|
||||||
chan
|
|
||||||
chionn
|
|
||||||
choir
|
|
||||||
chon
|
|
||||||
chun
|
|
||||||
chèile
|
|
||||||
chéile
|
|
||||||
chòir
|
|
||||||
cia mheud
|
|
||||||
ciamar
|
|
||||||
co-dhiubh
|
|
||||||
cuide
|
|
||||||
cuin
|
|
||||||
cuin'
|
|
||||||
cuine
|
|
||||||
cà
|
|
||||||
cà'
|
|
||||||
càil
|
|
||||||
càit
|
|
||||||
càit'
|
|
||||||
càite
|
|
||||||
cò
|
|
||||||
cò mheud
|
|
||||||
có
|
|
||||||
d'
|
|
||||||
da
|
|
||||||
de
|
|
||||||
dh'
|
|
||||||
dha
|
|
||||||
dhaibh
|
|
||||||
dhaibh-san
|
|
||||||
dhaibhsan
|
|
||||||
dhan
|
|
||||||
dhasan
|
|
||||||
dhe
|
|
||||||
dhen
|
|
||||||
dheth
|
|
||||||
dhi
|
|
||||||
dhiom
|
|
||||||
dhiot
|
|
||||||
dhith
|
|
||||||
dhiubh
|
|
||||||
dhomh
|
|
||||||
dhomh-s'
|
|
||||||
dhomhsa
|
|
||||||
dhu'sa # dhut-sa
|
|
||||||
dhuibh
|
|
||||||
dhuibhse
|
|
||||||
dhuinn
|
|
||||||
dhuinne
|
|
||||||
dhuit
|
|
||||||
dhut
|
|
||||||
dhutsa
|
|
||||||
dhut-sa
|
|
||||||
dhà
|
|
||||||
dhà-san
|
|
||||||
dhàsan
|
|
||||||
dhòmhsa
|
|
||||||
diubh
|
|
||||||
do
|
|
||||||
docha
|
|
||||||
don
|
|
||||||
dà
|
|
||||||
dè
|
|
||||||
dè mar
|
|
||||||
dé
|
|
||||||
dé mar
|
|
||||||
dòch'
|
|
||||||
dòcha
|
|
||||||
e
|
|
||||||
eadar
|
|
||||||
eatarra
|
|
||||||
eatorra
|
|
||||||
eile
|
|
||||||
esan
|
|
||||||
fa
|
|
||||||
far
|
|
||||||
feud
|
|
||||||
fhad
|
|
||||||
fheudar
|
|
||||||
fhearr
|
|
||||||
fhein
|
|
||||||
fheudar
|
|
||||||
fheàrr
|
|
||||||
fhèin
|
|
||||||
fhéin
|
|
||||||
fhìn
|
|
||||||
fo
|
|
||||||
fodha
|
|
||||||
fodhainn
|
|
||||||
foipe
|
|
||||||
fon
|
|
||||||
fèin
|
|
||||||
ga
|
|
||||||
gach
|
|
||||||
gam
|
|
||||||
gan
|
|
||||||
ge brith
|
|
||||||
ged
|
|
||||||
gu
|
|
||||||
gu dè
|
|
||||||
gu ruige
|
|
||||||
gun
|
|
||||||
gur
|
|
||||||
gus
|
|
||||||
i
|
|
||||||
iad
|
|
||||||
iadsan
|
|
||||||
innte
|
|
||||||
is
|
|
||||||
ise
|
|
||||||
le
|
|
||||||
leam
|
|
||||||
leam-sa
|
|
||||||
leamsa
|
|
||||||
leat
|
|
||||||
leat-sa
|
|
||||||
leatha
|
|
||||||
leatsa
|
|
||||||
leibh
|
|
||||||
leis
|
|
||||||
leis-san
|
|
||||||
leoth'
|
|
||||||
leotha
|
|
||||||
leotha-san
|
|
||||||
linn
|
|
||||||
m'
|
|
||||||
m'a
|
|
||||||
ma
|
|
||||||
mac
|
|
||||||
man
|
|
||||||
mar
|
|
||||||
mas
|
|
||||||
mathaid
|
|
||||||
mi
|
|
||||||
mis'
|
|
||||||
mise
|
|
||||||
mo
|
|
||||||
mu
|
|
||||||
mu 'n
|
|
||||||
mun
|
|
||||||
mur
|
|
||||||
mura
|
|
||||||
mus
|
|
||||||
na
|
|
||||||
na b'
|
|
||||||
na bu
|
|
||||||
na iad
|
|
||||||
nach
|
|
||||||
nad
|
|
||||||
nam
|
|
||||||
nan
|
|
||||||
nar
|
|
||||||
nas
|
|
||||||
neo
|
|
||||||
no
|
|
||||||
nuair
|
|
||||||
o
|
|
||||||
o'n
|
|
||||||
oir
|
|
||||||
oirbh
|
|
||||||
oirbh-se
|
|
||||||
oirnn
|
|
||||||
oirnne
|
|
||||||
oirre
|
|
||||||
on
|
|
||||||
orm
|
|
||||||
orm-sa
|
|
||||||
ormsa
|
|
||||||
orra
|
|
||||||
orra-san
|
|
||||||
orrasan
|
|
||||||
ort
|
|
||||||
os
|
|
||||||
r'
|
|
||||||
ri
|
|
||||||
ribh
|
|
||||||
rinn
|
|
||||||
ris
|
|
||||||
rithe
|
|
||||||
rithe-se
|
|
||||||
rium
|
|
||||||
rium-sa
|
|
||||||
riums'
|
|
||||||
riumsa
|
|
||||||
riut
|
|
||||||
riuth'
|
|
||||||
riutha
|
|
||||||
riuthasan
|
|
||||||
ro
|
|
||||||
ro'n
|
|
||||||
roimh
|
|
||||||
roimhe
|
|
||||||
romhainn
|
|
||||||
romham
|
|
||||||
romhpa
|
|
||||||
ron
|
|
||||||
ruibh
|
|
||||||
ruinn
|
|
||||||
ruinne
|
|
||||||
sa
|
|
||||||
san
|
|
||||||
sann
|
|
||||||
se
|
|
||||||
seach
|
|
||||||
seo
|
|
||||||
seothach
|
|
||||||
shin
|
|
||||||
sibh
|
|
||||||
sibh-se
|
|
||||||
sibhse
|
|
||||||
sin
|
|
||||||
sineach
|
|
||||||
sinn
|
|
||||||
sinne
|
|
||||||
siod
|
|
||||||
siodach
|
|
||||||
siud
|
|
||||||
siudach
|
|
||||||
sna # ann an
|
|
||||||
sè
|
|
||||||
t'
|
|
||||||
tarsaing
|
|
||||||
tarsainn
|
|
||||||
tarsuinn
|
|
||||||
thar
|
|
||||||
thoigh
|
|
||||||
thro
|
|
||||||
thu
|
|
||||||
thuc'
|
|
||||||
thuca
|
|
||||||
thugad
|
|
||||||
thugaibh
|
|
||||||
thugainn
|
|
||||||
thugam
|
|
||||||
thugamsa
|
|
||||||
thuice
|
|
||||||
thuige
|
|
||||||
thus'
|
|
||||||
thusa
|
|
||||||
timcheall
|
|
||||||
toigh
|
|
||||||
toil
|
|
||||||
tro
|
|
||||||
tro' # troimh
|
|
||||||
troimh
|
|
||||||
troimhe
|
|
||||||
tron
|
|
||||||
tu
|
|
||||||
tusa
|
|
||||||
uair
|
|
||||||
ud
|
|
||||||
ugaibh
|
|
||||||
ugam-s'
|
|
||||||
ugam-sa
|
|
||||||
uice
|
|
||||||
uige
|
|
||||||
uige-san
|
|
||||||
umad
|
|
||||||
unnta # ann an
|
|
||||||
ur
|
|
||||||
urrainn
|
|
||||||
à
|
|
||||||
às
|
|
||||||
àsan
|
|
||||||
á
|
|
||||||
ás
|
|
||||||
è
|
|
||||||
ì
|
|
||||||
ò
|
|
||||||
ó
|
|
||||||
""".split(
|
|
||||||
"\n"
|
|
||||||
)
|
|
||||||
)
|
|
File diff suppressed because it is too large

@@ -1,5 +1,5 @@
 The list of Croatian lemmas was extracted from the reldi-tagger repository (https://github.com/clarinsi/reldi-tagger).
-Reldi-tagger is licensed under the Apache 2.0 licence.
+Reldi-tagger is licesned under the Apache 2.0 licence.
 
 @InProceedings{ljubesic16-new,
 author = {Nikola Ljubešić and Filip Klubička and Željko Agić and Ivo-Pavao Jazbec},
@ -1,52 +0,0 @@
|
||||||
from typing import Callable, Optional
|
|
||||||
|
|
||||||
from thinc.api import Model
|
|
||||||
|
|
||||||
from ...language import BaseDefaults, Language
|
|
||||||
from .lemmatizer import HaitianCreoleLemmatizer
|
|
||||||
from .lex_attrs import LEX_ATTRS
|
|
||||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
|
|
||||||
from .stop_words import STOP_WORDS
|
|
||||||
from .syntax_iterators import SYNTAX_ITERATORS
|
|
||||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|
||||||
from .tag_map import TAG_MAP
|
|
||||||
|
|
||||||
|
|
||||||
class HaitianCreoleDefaults(BaseDefaults):
|
|
||||||
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
|
|
||||||
prefixes = TOKENIZER_PREFIXES
|
|
||||||
infixes = TOKENIZER_INFIXES
|
|
||||||
suffixes = TOKENIZER_SUFFIXES
|
|
||||||
lex_attr_getters = LEX_ATTRS
|
|
||||||
syntax_iterators = SYNTAX_ITERATORS
|
|
||||||
stop_words = STOP_WORDS
|
|
||||||
tag_map = TAG_MAP
|
|
||||||
|
|
||||||
class HaitianCreole(Language):
|
|
||||||
lang = "ht"
|
|
||||||
Defaults = HaitianCreoleDefaults
|
|
||||||
|
|
||||||
@HaitianCreole.factory(
|
|
||||||
"lemmatizer",
|
|
||||||
assigns=["token.lemma"],
|
|
||||||
default_config={
|
|
||||||
"model": None,
|
|
||||||
"mode": "rule",
|
|
||||||
"overwrite": False,
|
|
||||||
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
|
||||||
},
|
|
||||||
default_score_weights={"lemma_acc": 1.0},
|
|
||||||
)
|
|
||||||
def make_lemmatizer(
|
|
||||||
nlp: Language,
|
|
||||||
model: Optional[Model],
|
|
||||||
name: str,
|
|
||||||
mode: str,
|
|
||||||
overwrite: bool,
|
|
||||||
scorer: Optional[Callable],
|
|
||||||
):
|
|
||||||
return HaitianCreoleLemmatizer(
|
|
||||||
nlp.vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
|
||||||
)
|
|
||||||
|
|
||||||
__all__ = ["HaitianCreole"]
|
|
|
@ -1,18 +0,0 @@
|
||||||
"""
|
|
||||||
Example sentences to test spaCy and its language models.
|
|
||||||
|
|
||||||
>>> from spacy.lang.ht.examples import sentences
|
|
||||||
>>> docs = nlp.pipe(sentences)
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
sentences = [
|
|
||||||
"Apple ap panse achte yon demaraj nan Wayòm Ini pou $1 milya dola",
|
|
||||||
"Machin otonòm fè responsablite asirans lan ale sou men fabrikan yo",
|
|
||||||
"San Francisco ap konsidere entèdi robo ki livre sou twotwa yo",
|
|
||||||
"Lond se yon gwo vil nan Wayòm Ini",
|
|
||||||
"Kote ou ye?",
|
|
||||||
"Kilès ki prezidan Lafrans?",
|
|
||||||
"Ki kapital Etazini?",
|
|
||||||
"Kile Barack Obama te fèt?",
|
|
||||||
]
|
|
|
@ -1,51 +0,0 @@
|
||||||
from typing import List, Tuple
|
|
||||||
|
|
||||||
from ...pipeline import Lemmatizer
|
|
||||||
from ...tokens import Token
|
|
||||||
from ...lookups import Lookups
|
|
||||||
|
|
||||||
|
|
||||||
class HaitianCreoleLemmatizer(Lemmatizer):
|
|
||||||
"""
|
|
||||||
Minimal Haitian Creole lemmatizer.
|
|
||||||
Returns a word's base form based on rules and lookup,
|
|
||||||
or defaults to the original form.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def is_base_form(self, token: Token) -> bool:
|
|
||||||
morph = token.morph.to_dict()
|
|
||||||
upos = token.pos_.lower()
|
|
||||||
|
|
||||||
# Consider unmarked forms to be base
|
|
||||||
if upos in {"noun", "verb", "adj", "adv"}:
|
|
||||||
if not morph:
|
|
||||||
return True
|
|
||||||
if upos == "noun" and morph.get("Number") == "Sing":
|
|
||||||
return True
|
|
||||||
if upos == "verb" and morph.get("VerbForm") == "Inf":
|
|
||||||
return True
|
|
||||||
if upos == "adj" and morph.get("Degree") == "Pos":
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
def rule_lemmatize(self, token: Token) -> List[str]:
|
|
||||||
string = token.text.lower()
|
|
||||||
pos = token.pos_.lower()
|
|
||||||
cache_key = (token.orth, token.pos)
|
|
||||||
if cache_key in self.cache:
|
|
||||||
return self.cache[cache_key]
|
|
||||||
|
|
||||||
forms = []
|
|
||||||
|
|
||||||
# fallback rule: just return lowercased form
|
|
||||||
forms.append(string)
|
|
||||||
|
|
||||||
self.cache[cache_key] = forms
|
|
||||||
return forms
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
|
|
||||||
if mode == "rule":
|
|
||||||
required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
|
|
||||||
return (required, [])
|
|
||||||
return super().get_lookups_config(mode)
|
|
|
@ -1,78 +0,0 @@
|
||||||
from ...attrs import LIKE_NUM, NORM
|
|
||||||
|
|
||||||
# Cardinal numbers in Creole
|
|
||||||
_num_words = set(
|
|
||||||
"""
|
|
||||||
zewo youn en de twa kat senk sis sèt uit nèf dis
|
|
||||||
onz douz trèz katoz kenz sèz disèt dizwit diznèf
|
|
||||||
vent trant karant sinkant swasant swasann-dis
|
|
||||||
san mil milyon milya
|
|
||||||
""".split()
|
|
||||||
)
|
|
||||||
|
|
||||||
# Ordinal numbers in Creole (some are French-influenced, some simplified)
|
|
||||||
_ordinal_words = set(
|
|
||||||
"""
|
|
||||||
premye dezyèm twazyèm katryèm senkyèm sizyèm sètvyèm uitvyèm nèvyèm dizyèm
|
|
||||||
onzèm douzyèm trèzyèm katozyèm kenzèm sèzyèm disetyèm dizwityèm diznèvyèm
|
|
||||||
ventyèm trantyèm karantyèm sinkantyèm swasantyèm
|
|
||||||
swasann-disyèm santyèm milyèm milyonnyèm milyadyèm
|
|
||||||
""".split()
|
|
||||||
)
|
|
||||||
|
|
||||||
NORM_MAP = {
|
|
||||||
"'m": "mwen",
|
|
||||||
"'w": "ou",
|
|
||||||
"'l": "li",
|
|
||||||
"'n": "nou",
|
|
||||||
"'y": "yo",
|
|
||||||
"’m": "mwen",
|
|
||||||
"’w": "ou",
|
|
||||||
"’l": "li",
|
|
||||||
"’n": "nou",
|
|
||||||
"’y": "yo",
|
|
||||||
"m": "mwen",
|
|
||||||
"n": "nou",
|
|
||||||
"l": "li",
|
|
||||||
"y": "yo",
|
|
||||||
"w": "ou",
|
|
||||||
"t": "te",
|
|
||||||
"k": "ki",
|
|
||||||
"p": "pa",
|
|
||||||
"M": "Mwen",
|
|
||||||
"N": "Nou",
|
|
||||||
"L": "Li",
|
|
||||||
"Y": "Yo",
|
|
||||||
"W": "Ou",
|
|
||||||
"T": "Te",
|
|
||||||
"K": "Ki",
|
|
||||||
"P": "Pa",
|
|
||||||
}
|
|
||||||
|
|
||||||
def like_num(text):
|
|
||||||
text = text.strip().lower()
|
|
||||||
if text.startswith(("+", "-", "±", "~")):
|
|
||||||
text = text[1:]
|
|
||||||
text = text.replace(",", "").replace(".", "")
|
|
||||||
if text.isdigit():
|
|
||||||
return True
|
|
||||||
if text.count("/") == 1:
|
|
||||||
num, denom = text.split("/")
|
|
||||||
if num.isdigit() and denom.isdigit():
|
|
||||||
return True
|
|
||||||
if text in _num_words:
|
|
||||||
return True
|
|
||||||
if text in _ordinal_words:
|
|
||||||
return True
|
|
||||||
# Handle things like "3yèm", "10yèm", "25yèm", etc.
|
|
||||||
if text.endswith("yèm") and text[:-3].isdigit():
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
def norm_custom(text):
|
|
||||||
return NORM_MAP.get(text, text.lower())
|
|
||||||
|
|
||||||
LEX_ATTRS = {
|
|
||||||
LIKE_NUM: like_num,
|
|
||||||
NORM: norm_custom,
|
|
||||||
}
|
|
|
@ -1,43 +0,0 @@
|
||||||
from ..char_classes import (
|
|
||||||
ALPHA,
|
|
||||||
ALPHA_LOWER,
|
|
||||||
ALPHA_UPPER,
|
|
||||||
CONCAT_QUOTES,
|
|
||||||
HYPHENS,
|
|
||||||
LIST_PUNCT,
|
|
||||||
LIST_QUOTES,
|
|
||||||
LIST_ELLIPSES,
|
|
||||||
LIST_ICONS,
|
|
||||||
merge_chars,
|
|
||||||
)
|
|
||||||
|
|
||||||
ELISION = "'’".replace(" ", "")
|
|
||||||
|
|
||||||
_prefixes_elision = "m n l y t k w"
|
|
||||||
_prefixes_elision += " " + _prefixes_elision.upper()
|
|
||||||
|
|
||||||
TOKENIZER_PREFIXES = LIST_PUNCT + LIST_QUOTES + [
|
|
||||||
r"(?:({pe})[{el}])(?=[{a}])".format(
|
|
||||||
a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
|
|
||||||
)
|
|
||||||
]
|
|
||||||
|
|
||||||
TOKENIZER_SUFFIXES = LIST_PUNCT + LIST_QUOTES + LIST_ELLIPSES + [
|
|
||||||
r"(?<=[0-9])%", # numbers like 10%
|
|
||||||
r"(?<=[0-9])(?:{h})".format(h=HYPHENS), # hyphens after numbers
|
|
||||||
r"(?<=[{a}])['’]".format(a=ALPHA), # apostrophes after letters
|
|
||||||
r"(?<=[{a}])['’][mwlnytk](?=\s|$)".format(a=ALPHA), # contractions
|
|
||||||
r"(?<=[{a}0-9])\)", # right parenthesis after letter/number
|
|
||||||
r"(?<=[{a}])\.(?=\s|$)".format(a=ALPHA), # period after letter if space or end of string
|
|
||||||
r"(?<=\))[\.\?!]", # punctuation immediately after right parenthesis
|
|
||||||
]
|
|
||||||
|
|
||||||
TOKENIZER_INFIXES = LIST_ELLIPSES + LIST_ICONS + [
|
|
||||||
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
|
|
||||||
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
|
|
||||||
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
|
|
||||||
),
|
|
||||||
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
|
|
||||||
r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
|
|
||||||
r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
|
|
||||||
]
|
|
|
@ -1,50 +0,0 @@
|
||||||
STOP_WORDS = set(
|
|
||||||
"""
|
|
||||||
a ak an ankò ant apre ap atò avan avanlè
|
|
||||||
byen bò byenke
|
|
||||||
|
|
||||||
chak
|
|
||||||
|
|
||||||
de depi deja deja
|
|
||||||
|
|
||||||
e en epi èske
|
|
||||||
|
|
||||||
fò fòk
|
|
||||||
|
|
||||||
gen genyen
|
|
||||||
|
|
||||||
ki kisa kilès kote koukou konsa konbyen konn konnen kounye kouman
|
|
||||||
|
|
||||||
la l laa le lè li lye lò
|
|
||||||
|
|
||||||
m m' mwen
|
|
||||||
|
|
||||||
nan nap nou n'
|
|
||||||
|
|
||||||
ou oumenm
|
|
||||||
|
|
||||||
pa paske pami pandan pito pou pral preske pwiske
|
|
||||||
|
|
||||||
se selman si sou sòt
|
|
||||||
|
|
||||||
ta tap tankou te toujou tou tan tout toutotan twòp tèl
|
|
||||||
|
|
||||||
w w' wi wè
|
|
||||||
|
|
||||||
y y' yo yon yonn
|
|
||||||
|
|
||||||
non o oh eh
|
|
||||||
|
|
||||||
sa san si swa si
|
|
||||||
|
|
||||||
men mèsi oswa osinon
|
|
||||||
|
|
||||||
"""
|
|
||||||
.split()
|
|
||||||
)
|
|
||||||
|
|
||||||
# Add common contractions, with and without apostrophe variants
|
|
||||||
contractions = ["m'", "n'", "w'", "y'", "l'", "t'", "k'"]
|
|
||||||
for apostrophe in ["'", "’", "‘"]:
|
|
||||||
for word in contractions:
|
|
||||||
STOP_WORDS.add(word.replace("'", apostrophe))
|
|
|
@ -1,74 +0,0 @@
|
||||||
from typing import Iterator, Tuple, Union
|
|
||||||
|
|
||||||
from ...errors import Errors
|
|
||||||
from ...symbols import NOUN, PRON, PROPN
|
|
||||||
from ...tokens import Doc, Span
|
|
||||||
|
|
||||||
|
|
||||||
def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]:
|
|
||||||
"""
|
|
||||||
Detect base noun phrases from a dependency parse for Haitian Creole.
|
|
||||||
Works on both Doc and Span objects.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Core nominal dependencies common in Haitian Creole
|
|
||||||
labels = [
|
|
||||||
"nsubj",
|
|
||||||
"obj",
|
|
||||||
"obl",
|
|
||||||
"nmod",
|
|
||||||
"appos",
|
|
||||||
"ROOT",
|
|
||||||
]
|
|
||||||
|
|
||||||
# Modifiers to optionally include in chunk (to the right)
|
|
||||||
post_modifiers = ["compound", "flat", "flat:name", "fixed"]
|
|
||||||
|
|
||||||
doc = doclike.doc
|
|
||||||
if not doc.has_annotation("DEP"):
|
|
||||||
raise ValueError(Errors.E029)
|
|
||||||
|
|
||||||
np_deps = {doc.vocab.strings.add(label) for label in labels}
|
|
||||||
np_mods = {doc.vocab.strings.add(mod) for mod in post_modifiers}
|
|
||||||
conj_label = doc.vocab.strings.add("conj")
|
|
||||||
np_label = doc.vocab.strings.add("NP")
|
|
||||||
adp_pos = doc.vocab.strings.add("ADP")
|
|
||||||
cc_pos = doc.vocab.strings.add("CCONJ")
|
|
||||||
|
|
||||||
prev_end = -1
|
|
||||||
for i, word in enumerate(doclike):
|
|
||||||
if word.pos not in (NOUN, PROPN, PRON):
|
|
||||||
continue
|
|
||||||
if word.left_edge.i <= prev_end:
|
|
||||||
continue
|
|
||||||
|
|
||||||
if word.dep in np_deps:
|
|
||||||
right_end = word
|
|
||||||
# expand to include known modifiers to the right
|
|
||||||
for child in word.rights:
|
|
||||||
if child.dep in np_mods:
|
|
||||||
right_end = child.right_edge
|
|
||||||
elif child.pos == NOUN:
|
|
||||||
right_end = child.right_edge
|
|
||||||
|
|
||||||
left_index = word.left_edge.i
|
|
||||||
# Skip prepositions at the start
|
|
||||||
if word.left_edge.pos == adp_pos:
|
|
||||||
left_index += 1
|
|
||||||
|
|
||||||
prev_end = right_end.i
|
|
||||||
yield left_index, right_end.i + 1, np_label
|
|
||||||
|
|
||||||
elif word.dep == conj_label:
|
|
||||||
head = word.head
|
|
||||||
while head.dep == conj_label and head.head.i < head.i:
|
|
||||||
head = head.head
|
|
||||||
if head.dep in np_deps:
|
|
||||||
left_index = word.left_edge.i
|
|
||||||
if word.left_edge.pos == cc_pos:
|
|
||||||
left_index += 1
|
|
||||||
prev_end = word.i
|
|
||||||
yield left_index, word.i + 1, np_label
|
|
||||||
|
|
||||||
|
|
||||||
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
|
|
|
@ -1,21 +0,0 @@
|
||||||
from spacy.symbols import NOUN, VERB, AUX, ADJ, ADV, PRON, DET, ADP, SCONJ, CCONJ, PART, INTJ, NUM, PROPN, PUNCT, SYM, X
|
|
||||||
|
|
||||||
TAG_MAP = {
|
|
||||||
"NOUN": {"pos": NOUN},
|
|
||||||
"VERB": {"pos": VERB},
|
|
||||||
"AUX": {"pos": AUX},
|
|
||||||
"ADJ": {"pos": ADJ},
|
|
||||||
"ADV": {"pos": ADV},
|
|
||||||
"PRON": {"pos": PRON},
|
|
||||||
"DET": {"pos": DET},
|
|
||||||
"ADP": {"pos": ADP},
|
|
||||||
"SCONJ": {"pos": SCONJ},
|
|
||||||
"CCONJ": {"pos": CCONJ},
|
|
||||||
"PART": {"pos": PART},
|
|
||||||
"INTJ": {"pos": INTJ},
|
|
||||||
"NUM": {"pos": NUM},
|
|
||||||
"PROPN": {"pos": PROPN},
|
|
||||||
"PUNCT": {"pos": PUNCT},
|
|
||||||
"SYM": {"pos": SYM},
|
|
||||||
"X": {"pos": X},
|
|
||||||
}
|
|
|
@ -1,121 +0,0 @@
|
||||||
from spacy.symbols import ORTH, NORM
|
|
||||||
|
|
||||||
def make_variants(base, first_norm, second_orth, second_norm):
|
|
||||||
return {
|
|
||||||
base: [
|
|
||||||
{ORTH: base.split("'")[0] + "'", NORM: first_norm},
|
|
||||||
{ORTH: second_orth, NORM: second_norm},
|
|
||||||
],
|
|
||||||
base.capitalize(): [
|
|
||||||
{ORTH: base.split("'")[0].capitalize() + "'", NORM: first_norm.capitalize()},
|
|
||||||
{ORTH: second_orth, NORM: second_norm},
|
|
||||||
]
|
|
||||||
}
|
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = {
|
|
||||||
"Dr.": [{ORTH: "Dr."}]
|
|
||||||
}
|
|
||||||
|
|
||||||
# Apostrophe forms
|
|
||||||
TOKENIZER_EXCEPTIONS.update(make_variants("m'ap", "mwen", "ap", "ap"))
|
|
||||||
TOKENIZER_EXCEPTIONS.update(make_variants("n'ap", "nou", "ap", "ap"))
|
|
||||||
TOKENIZER_EXCEPTIONS.update(make_variants("l'ap", "li", "ap", "ap"))
|
|
||||||
TOKENIZER_EXCEPTIONS.update(make_variants("y'ap", "yo", "ap", "ap"))
|
|
||||||
TOKENIZER_EXCEPTIONS.update(make_variants("m'te", "mwen", "te", "te"))
|
|
||||||
TOKENIZER_EXCEPTIONS.update(make_variants("m'pral", "mwen", "pral", "pral"))
|
|
||||||
TOKENIZER_EXCEPTIONS.update(make_variants("w'ap", "ou", "ap", "ap"))
|
|
||||||
TOKENIZER_EXCEPTIONS.update(make_variants("k'ap", "ki", "ap", "ap"))
|
|
||||||
TOKENIZER_EXCEPTIONS.update(make_variants("p'ap", "pa", "ap", "ap"))
|
|
||||||
TOKENIZER_EXCEPTIONS.update(make_variants("t'ap", "te", "ap", "ap"))
|
|
||||||
|
|
||||||
# Non-apostrophe contractions (with capitalized variants)
|
|
||||||
TOKENIZER_EXCEPTIONS.update({
|
|
||||||
"map": [
|
|
||||||
{ORTH: "m", NORM: "mwen"},
|
|
||||||
{ORTH: "ap", NORM: "ap"},
|
|
||||||
],
|
|
||||||
"Map": [
|
|
||||||
{ORTH: "M", NORM: "Mwen"},
|
|
||||||
{ORTH: "ap", NORM: "ap"},
|
|
||||||
],
|
|
||||||
"lem": [
|
|
||||||
{ORTH: "le", NORM: "le"},
|
|
||||||
{ORTH: "m", NORM: "mwen"},
|
|
||||||
],
|
|
||||||
"Lem": [
|
|
||||||
{ORTH: "Le", NORM: "Le"},
|
|
||||||
{ORTH: "m", NORM: "mwen"},
|
|
||||||
],
|
|
||||||
"lew": [
|
|
||||||
{ORTH: "le", NORM: "le"},
|
|
||||||
{ORTH: "w", NORM: "ou"},
|
|
||||||
],
|
|
||||||
"Lew": [
|
|
||||||
{ORTH: "Le", NORM: "Le"},
|
|
||||||
{ORTH: "w", NORM: "ou"},
|
|
||||||
],
|
|
||||||
"nap": [
|
|
||||||
{ORTH: "n", NORM: "nou"},
|
|
||||||
{ORTH: "ap", NORM: "ap"},
|
|
||||||
],
|
|
||||||
"Nap": [
|
|
||||||
{ORTH: "N", NORM: "Nou"},
|
|
||||||
{ORTH: "ap", NORM: "ap"},
|
|
||||||
],
|
|
||||||
"lap": [
|
|
||||||
{ORTH: "l", NORM: "li"},
|
|
||||||
{ORTH: "ap", NORM: "ap"},
|
|
||||||
],
|
|
||||||
"Lap": [
|
|
||||||
{ORTH: "L", NORM: "Li"},
|
|
||||||
{ORTH: "ap", NORM: "ap"},
|
|
||||||
],
|
|
||||||
"yap": [
|
|
||||||
{ORTH: "y", NORM: "yo"},
|
|
||||||
{ORTH: "ap", NORM: "ap"},
|
|
||||||
],
|
|
||||||
"Yap": [
|
|
||||||
{ORTH: "Y", NORM: "Yo"},
|
|
||||||
{ORTH: "ap", NORM: "ap"},
|
|
||||||
],
|
|
||||||
"mte": [
|
|
||||||
{ORTH: "m", NORM: "mwen"},
|
|
||||||
{ORTH: "te", NORM: "te"},
|
|
||||||
],
|
|
||||||
"Mte": [
|
|
||||||
{ORTH: "M", NORM: "Mwen"},
|
|
||||||
{ORTH: "te", NORM: "te"},
|
|
||||||
],
|
|
||||||
"mpral": [
|
|
||||||
{ORTH: "m", NORM: "mwen"},
|
|
||||||
{ORTH: "pral", NORM: "pral"},
|
|
||||||
],
|
|
||||||
"Mpral": [
|
|
||||||
{ORTH: "M", NORM: "Mwen"},
|
|
||||||
{ORTH: "pral", NORM: "pral"},
|
|
||||||
],
|
|
||||||
"wap": [
|
|
||||||
{ORTH: "w", NORM: "ou"},
|
|
||||||
{ORTH: "ap", NORM: "ap"},
|
|
||||||
],
|
|
||||||
"Wap": [
|
|
||||||
{ORTH: "W", NORM: "Ou"},
|
|
||||||
{ORTH: "ap", NORM: "ap"},
|
|
||||||
],
|
|
||||||
"kap": [
|
|
||||||
{ORTH: "k", NORM: "ki"},
|
|
||||||
{ORTH: "ap", NORM: "ap"},
|
|
||||||
],
|
|
||||||
"Kap": [
|
|
||||||
{ORTH: "K", NORM: "Ki"},
|
|
||||||
{ORTH: "ap", NORM: "ap"},
|
|
||||||
],
|
|
||||||
"tap": [
|
|
||||||
{ORTH: "t", NORM: "te"},
|
|
||||||
{ORTH: "ap", NORM: "ap"},
|
|
||||||
],
|
|
||||||
"Tap": [
|
|
||||||
{ORTH: "T", NORM: "Te"},
|
|
||||||
{ORTH: "ap", NORM: "ap"},
|
|
||||||
],
|
|
||||||
})
|
|
|
@@ -32,6 +32,7 @@ split_mode = null
 """
 
 
+@registry.tokenizers("spacy.ja.JapaneseTokenizer")
 def create_tokenizer(split_mode: Optional[str] = None):
     def japanese_tokenizer_factory(nlp):
         return JapaneseTokenizer(nlp.vocab, split_mode=split_mode)
@ -1,16 +0,0 @@
|
||||||
from ...language import BaseDefaults, Language
|
|
||||||
from .lex_attrs import LEX_ATTRS
|
|
||||||
from .stop_words import STOP_WORDS
|
|
||||||
|
|
||||||
|
|
||||||
class KurmanjiDefaults(BaseDefaults):
|
|
||||||
stop_words = STOP_WORDS
|
|
||||||
lex_attr_getters = LEX_ATTRS
|
|
||||||
|
|
||||||
|
|
||||||
class Kurmanji(Language):
|
|
||||||
lang = "kmr"
|
|
||||||
Defaults = KurmanjiDefaults
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["Kurmanji"]
|
|
|
@ -1,17 +0,0 @@
|
||||||
"""
|
|
||||||
Example sentences to test spaCy and its language models.
|
|
||||||
|
|
||||||
>>> from spacy.lang.kmr.examples import sentences
|
|
||||||
>>> docs = nlp.pipe(sentences)
|
|
||||||
"""
|
|
||||||
|
|
||||||
sentences = [
|
|
||||||
"Berê mirovan her tim li geşedana pêşerojê ye", # People's gaze is always on the development of the future
|
|
||||||
"Kawa Nemir di 14 salan de Ulysses wergerand Kurmancî.", # Kawa Nemir translated Ulysses into Kurmanji in 14 years.
|
|
||||||
"Mem Ararat hunermendekî Kurd yê bi nav û deng e.", # Mem Ararat is a famous Kurdish artist
|
|
||||||
"Firat Cewerî 40 sal e pirtûkên Kurdî dinivîsîne.", # Firat Ceweri has been writing Kurdish books for 40 years
|
|
||||||
"Rojnamegerê ciwan nûçeyeke balkêş li ser rewşa aborî nivîsand", # The young journalist wrote an interesting news article about the economic situation
|
|
||||||
"Sektora çandiniyê beşeke giring a belavkirina gaza serayê li seranserê cîhanê pêk tîne", # The agricultural sector constitutes an important part of greenhouse gas emissions worldwide
|
|
||||||
"Xwendekarên jêhatî di pêşbaziya matematîkê de serkeftî bûn", # Talented students succeeded in the mathematics competition
|
|
||||||
"Ji ber ji tunebûnê bavê min xwişkeke min nedan xwendin ew ji min re bû derd û kulek.", # Because of poverty, my father didn't send my sister to school, which became a pain and sorrow for me
|
|
||||||
]
|
|
|
spacy/lang/kmr/lex_attrs.py
@@ -1,138 +0,0 @@
-from ...attrs import LIKE_NUM
-
-_num_words = [
-    "sifir",
-    "yek",
-    "du",
-    "sê",
-    "çar",
-    "pênc",
-    "şeş",
-    "heft",
-    "heşt",
-    "neh",
-    "deh",
-    "yazde",
-    "dazde",
-    "sêzde",
-    "çarde",
-    "pazde",
-    "şazde",
-    "hevde",
-    "hejde",
-    "nozde",
-    "bîst",
-    "sî",
-    "çil",
-    "pêncî",
-    "şêst",
-    "heftê",
-    "heştê",
-    "nod",
-    "sed",
-    "hezar",
-    "milyon",
-    "milyar",
-]
-
-_ordinal_words = [
-    "yekem",
-    "yekemîn",
-    "duyem",
-    "duyemîn",
-    "sêyem",
-    "sêyemîn",
-    "çarem",
-    "çaremîn",
-    "pêncem",
-    "pêncemîn",
-    "şeşem",
-    "şeşemîn",
-    "heftem",
-    "heftemîn",
-    "heştem",
-    "heştemîn",
-    "nehem",
-    "nehemîn",
-    "dehem",
-    "dehemîn",
-    "yazdehem",
-    "yazdehemîn",
-    "dazdehem",
-    "dazdehemîn",
-    "sêzdehem",
-    "sêzdehemîn",
-    "çardehem",
-    "çardehemîn",
-    "pazdehem",
-    "pazdehemîn",
-    "şanzdehem",
-    "şanzdehemîn",
-    "hevdehem",
-    "hevdehemîn",
-    "hejdehem",
-    "hejdehemîn",
-    "nozdehem",
-    "nozdehemîn",
-    "bîstem",
-    "bîstemîn",
-    "sîyem",
-    "sîyemîn",
-    "çilem",
-    "çilemîn",
-    "pêncîyem",
-    "pênciyemîn",
-    "şêstem",
-    "şêstemîn",
-    "heftêyem",
-    "heftêyemîn",
-    "heştêyem",
-    "heştêyemîn",
-    "notem",
-    "notemîn",
-    "sedem",
-    "sedemîn",
-    "hezarem",
-    "hezaremîn",
-    "milyonem",
-    "milyonemîn",
-    "milyarem",
-    "milyaremîn",
-]
-
-
-def like_num(text):
-    if text.startswith(("+", "-", "±", "~")):
-        text = text[1:]
-    text = text.replace(",", "").replace(".", "")
-    if text.isdigit():
-        return True
-    if text.count("/") == 1:
-        num, denom = text.split("/")
-        if num.isdigit() and denom.isdigit():
-            return True
-    text_lower = text.lower()
-    if text_lower in _num_words:
-        return True
-
-    # Check ordinal number
-    if text_lower in _ordinal_words:
-        return True
-
-    if is_digit(text_lower):
-        return True
-
-    return False
-
-
-def is_digit(text):
-    endings = ("em", "yem", "emîn", "yemîn")
-    for ending in endings:
-        to = len(ending)
-        if text.endswith(ending) and text[:-to].isdigit():
-            return True
-
-    return False
-
-
-LEX_ATTRS = {LIKE_NUM: like_num}
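A short sketch of how the LIKE_NUM getter above behaves (illustrative only; it assumes the master-side module is importable):

```python
# Sketch: like_num() accepts digits, simple fractions, Kurmanji number words,
# ordinals, and digit+ordinal-suffix forms handled by is_digit().
from spacy.lang.kmr.lex_attrs import like_num

print(like_num("deh"))        # True  - "ten"
print(like_num("sêzdehem"))   # True  - ordinal "thirteenth"
print(like_num("3/4"))        # True  - fraction with digit numerator/denominator
print(like_num("12em"))       # True  - digit plus ordinal ending, via is_digit()
print(like_num("pirtûk"))     # False - ordinary word ("book")
```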
spacy/lang/kmr/stop_words.py
@@ -1,44 +0,0 @@
-STOP_WORDS = set(
-    """
-û
-li
-bi
-di
-da
-de
-ji
-ku
-ew
-ez
-tu
-em
-hûn
-ew
-ev
-min
-te
-wî
-wê
-me
-we
-wan
-vê
-vî
-va
-çi
-kî
-kê
-çawa
-çima
-kengî
-li ku
-çend
-çiqas
-her
-hin
-gelek
-hemû
-kes
-tişt
-""".split()
-)
spacy/lang/ko/__init__.py
@@ -20,6 +20,7 @@ DEFAULT_CONFIG = """
 """
 
 
+@registry.tokenizers("spacy.ko.KoreanTokenizer")
 def create_tokenizer():
     def korean_tokenizer_factory(nlp):
         return KoreanTokenizer(nlp.vocab)
spacy/lang/mk/__init__.py
@@ -24,6 +24,12 @@ class MacedonianDefaults(BaseDefaults):
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
 
+    @classmethod
+    def create_lemmatizer(cls, nlp=None, lookups=None):
+        if lookups is None:
+            lookups = Lookups()
+        return MacedonianLemmatizer(lookups)
+
 
 class Macedonian(Language):
     lang = "mk"
spacy/lang/nn/__init__.py
@@ -1,20 +0,0 @@
-from ...language import BaseDefaults, Language
-from ..nb import SYNTAX_ITERATORS
-from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-
-
-class NorwegianNynorskDefaults(BaseDefaults):
-    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
-    prefixes = TOKENIZER_PREFIXES
-    infixes = TOKENIZER_INFIXES
-    suffixes = TOKENIZER_SUFFIXES
-    syntax_iterators = SYNTAX_ITERATORS
-
-
-class NorwegianNynorsk(Language):
-    lang = "nn"
-    Defaults = NorwegianNynorskDefaults
-
-
-__all__ = ["NorwegianNynorsk"]
spacy/lang/nn/examples.py
@@ -1,15 +0,0 @@
-"""
-Example sentences to test spaCy and its language models.
-
->>> from spacy.lang.nn.examples import sentences
->>> docs = nlp.pipe(sentences)
-"""
-
-
-# sentences taken from Omsetjingsminne frå Nynorsk pressekontor 2022 (https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-80/)
-sentences = [
-    "Konseptet går ut på at alle tre omgangar tel, alle hopparar må stille i kvalifiseringa og poengsummen skal telje.",
-    "Det er ein meir enn i same periode i fjor.",
-    "Det har lava ned enorme snømengder i store delar av Europa den siste tida.",
-    "Akhtar Chaudhry er ikkje innstilt på Oslo-lista til SV, men utfordrar Heikki Holmås om førsteplassen.",
-]
spacy/lang/nn/punctuation.py
@@ -1,74 +0,0 @@
-from ..char_classes import (
-    ALPHA,
-    ALPHA_LOWER,
-    ALPHA_UPPER,
-    CONCAT_QUOTES,
-    CURRENCY,
-    LIST_CURRENCY,
-    LIST_ELLIPSES,
-    LIST_ICONS,
-    LIST_PUNCT,
-    LIST_QUOTES,
-    PUNCT,
-    UNITS,
-)
-from ..punctuation import TOKENIZER_SUFFIXES
-
-_quotes = CONCAT_QUOTES.replace("'", "")
-_list_punct = [x for x in LIST_PUNCT if x != "#"]
-_list_icons = [x for x in LIST_ICONS if x != "°"]
-_list_icons = [x.replace("\\u00B0", "") for x in _list_icons]
-_list_quotes = [x for x in LIST_QUOTES if x != "\\'"]
-
-
-_prefixes = (
-    ["§", "%", "=", "—", "–", r"\+(?![0-9])"]
-    + _list_punct
-    + LIST_ELLIPSES
-    + LIST_QUOTES
-    + LIST_CURRENCY
-    + LIST_ICONS
-)
-
-
-_infixes = (
-    LIST_ELLIPSES
-    + _list_icons
-    + [
-        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
-        r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
-        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
-    ]
-)
-
-_suffixes = (
-    LIST_PUNCT
-    + LIST_ELLIPSES
-    + _list_quotes
-    + _list_icons
-    + ["—", "–"]
-    + [
-        r"(?<=[0-9])\+",
-        r"(?<=°[FfCcKk])\.",
-        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
-        r"(?<=[0-9])(?:{u})".format(u=UNITS),
-        r"(?<=[{al}{e}{p}(?:{q})])\.".format(
-            al=ALPHA_LOWER, e=r"%²\-\+", q=_quotes, p=PUNCT
-        ),
-        r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
-    ]
-    + [r"(?<=[^sSxXzZ])'"]
-)
-_suffixes += [
-    suffix
-    for suffix in TOKENIZER_SUFFIXES
-    if suffix not in ["'s", "'S", "’s", "’S", r"\'"]
-]
-
-
-TOKENIZER_PREFIXES = _prefixes
-TOKENIZER_INFIXES = _infixes
-TOKENIZER_SUFFIXES = _suffixes
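A usage sketch for the rule lists above (illustrative only, master side): spaCy's util helpers compile prefix/suffix/infix lists like these into the regexes a Tokenizer consumes.

```python
# Sketch: compiling the Nynorsk punctuation rules the way spaCy's tokenizer does.
from spacy.util import compile_infix_regex, compile_prefix_regex, compile_suffix_regex
from spacy.lang.nn.punctuation import (
    TOKENIZER_INFIXES,
    TOKENIZER_PREFIXES,
    TOKENIZER_SUFFIXES,
)

prefix_re = compile_prefix_regex(TOKENIZER_PREFIXES)
suffix_re = compile_suffix_regex(TOKENIZER_SUFFIXES)
infix_re = compile_infix_regex(TOKENIZER_INFIXES)
print(bool(suffix_re.search("Oslo.")))  # True: the trailing period is split off as a suffix
```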
spacy/lang/nn/tokenizer_exceptions.py
@@ -1,228 +0,0 @@
-from ...symbols import NORM, ORTH
-from ...util import update_exc
-from ..tokenizer_exceptions import BASE_EXCEPTIONS
-
-_exc = {}
-
-
-for exc_data in [
-    {ORTH: "jan.", NORM: "januar"},
-    {ORTH: "feb.", NORM: "februar"},
-    {ORTH: "mar.", NORM: "mars"},
-    {ORTH: "apr.", NORM: "april"},
-    {ORTH: "jun.", NORM: "juni"},
-    # note: "jul." is in the simple list below without a NORM exception
-    {ORTH: "aug.", NORM: "august"},
-    {ORTH: "sep.", NORM: "september"},
-    {ORTH: "okt.", NORM: "oktober"},
-    {ORTH: "nov.", NORM: "november"},
-    {ORTH: "des.", NORM: "desember"},
-]:
-    _exc[exc_data[ORTH]] = [exc_data]
-
-
-for orth in [
-    "Ap.",
-    "Aq.",
-    "Ca.",
-    "Chr.",
-    "Co.",
-    "Dr.",
-    "F.eks.",
-    "Fr.p.",
-    "Frp.",
-    "Grl.",
-    "Kr.",
-    "Kr.F.",
-    "Kr.F.s",
-    "Mr.",
-    "Mrs.",
-    "Pb.",
-    "Pr.",
-    "Sp.",
-    "St.",
-    "a.m.",
-    "ad.",
-    "adm.dir.",
-    "adr.",
-    "b.c.",
-    "bl.a.",
-    "bla.",
-    "bm.",
-    "bnr.",
-    "bto.",
-    "c.c.",
-    "ca.",
-    "cand.mag.",
-    "co.",
-    "d.d.",
-    "d.m.",
-    "d.y.",
-    "dept.",
-    "dr.",
-    "dr.med.",
-    "dr.philos.",
-    "dr.psychol.",
-    "dss.",
-    "dvs.",
-    "e.Kr.",
-    "e.l.",
-    "eg.",
-    "eig.",
-    "ekskl.",
-    "el.",
-    "et.",
-    "etc.",
-    "etg.",
-    "ev.",
-    "evt.",
-    "f.",
-    "f.Kr.",
-    "f.eks.",
-    "f.o.m.",
-    "fhv.",
-    "fk.",
-    "foreg.",
-    "fork.",
-    "fv.",
-    "fvt.",
-    "g.",
-    "gl.",
-    "gno.",
-    "gnr.",
-    "grl.",
-    "gt.",
-    "h.r.adv.",
-    "hhv.",
-    "hoh.",
-    "hr.",
-    "ifb.",
-    "ifm.",
-    "iht.",
-    "inkl.",
-    "istf.",
-    "jf.",
-    "jr.",
-    "jul.",
-    "juris.",
-    "kfr.",
-    "kgl.",
-    "kgl.res.",
-    "kl.",
-    "komm.",
-    "kr.",
-    "kst.",
-    "lat.",
-    "lø.",
-    "m.a.",
-    "m.a.o.",
-    "m.fl.",
-    "m.m.",
-    "m.v.",
-    "ma.",
-    "mag.art.",
-    "md.",
-    "mfl.",
-    "mht.",
-    "mill.",
-    "min.",
-    "mnd.",
-    "moh.",
-    "mrd.",
-    "muh.",
-    "mv.",
-    "mva.",
-    "n.å.",
-    "ndf.",
-    "nr.",
-    "nto.",
-    "nyno.",
-    "o.a.",
-    "o.l.",
-    "obl.",
-    "off.",
-    "ofl.",
-    "on.",
-    "op.",
-    "org.",
-    "osv.",
-    "ovf.",
-    "p.",
-    "p.a.",
-    "p.g.a.",
-    "p.m.",
-    "p.t.",
-    "pga.",
-    "ph.d.",
-    "pkt.",
-    "pr.",
-    "pst.",
-    "pt.",
-    "red.anm.",
-    "ref.",
-    "res.",
-    "res.kap.",
-    "resp.",
-    "rv.",
-    "s.",
-    "s.d.",
-    "s.k.",
-    "s.u.",
-    "s.å.",
-    "sen.",
-    "sep.",
-    "siviling.",
-    "sms.",
-    "snr.",
-    "spm.",
-    "sr.",
-    "sst.",
-    "st.",
-    "st.meld.",
-    "st.prp.",
-    "stip.",
-    "stk.",
-    "stud.",
-    "sv.",
-    "såk.",
-    "sø.",
-    "t.d.",
-    "t.h.",
-    "t.o.m.",
-    "t.v.",
-    "temp.",
-    "ti.",
-    "tils.",
-    "tilsv.",
-    "tl;dr",
-    "tlf.",
-    "to.",
-    "ult.",
-    "utg.",
-    "v.",
-    "vedk.",
-    "vedr.",
-    "vg.",
-    "vgs.",
-    "vha.",
-    "vit.ass.",
-    "vn.",
-    "vol.",
-    "vs.",
-    "vsa.",
-    "§§",
-    "©NTB",
-    "årg.",
-    "årh.",
-]:
-    _exc[orth] = [{ORTH: orth}]
-
-# Dates
-for h in range(1, 31 + 1):
-    for period in ["."]:
-        _exc[f"{h}{period}"] = [{ORTH: f"{h}."}]
-
-_custom_base_exc = {"i.": [{ORTH: "i", NORM: "i"}, {ORTH: "."}]}
-_exc.update(_custom_base_exc)
-
-TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
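A small sketch of what these exceptions do at runtime (illustrative only, master side):

```python
# Sketch: abbreviations such as "jan." and date forms such as "3." stay single
# tokens, and "jan." carries the norm "januar" defined above.
import spacy

nlp = spacy.blank("nn")
doc = nlp("Møtet er 3. jan. kl. 10.")
print([(t.text, t.norm_) for t in doc])
```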
spacy/lang/th/__init__.py
@@ -13,6 +13,7 @@ DEFAULT_CONFIG = """
 """
 
 
+@registry.tokenizers("spacy.th.ThaiTokenizer")
 def create_thai_tokenizer():
     def thai_tokenizer_factory(nlp):
         return ThaiTokenizer(nlp.vocab)
spacy/lang/vi/__init__.py
@@ -22,6 +22,7 @@ use_pyvi = true
 """
 
 
+@registry.tokenizers("spacy.vi.VietnameseTokenizer")
 def create_vietnamese_tokenizer(use_pyvi: bool = True):
     def vietnamese_tokenizer_factory(nlp):
         return VietnameseTokenizer(nlp.vocab, use_pyvi=use_pyvi)
spacy/lang/zh/__init__.py
@@ -46,6 +46,7 @@ class Segmenter(str, Enum):
         return list(cls.__members__.keys())
 
 
+@registry.tokenizers("spacy.zh.ChineseTokenizer")
 def create_chinese_tokenizer(segmenter: Segmenter = Segmenter.char):
     def chinese_tokenizer_factory(nlp):
         return ChineseTokenizer(nlp.vocab, segmenter=segmenter)
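A configuration sketch (illustrative only): the factory registered above is what `[nlp.tokenizer]` resolves to, and the segmenter can be overridden when the pipeline is created.

```python
# Sketch: selecting the Chinese tokenizer's character segmenter explicitly.
import spacy

config = {
    "nlp": {
        "tokenizer": {
            "@tokenizers": "spacy.zh.ChineseTokenizer",
            "segmenter": "char",  # "jieba" or "pkuseg" would need extra packages
        }
    }
}
nlp = spacy.blank("zh", config=config)
print([t.text for t in nlp("这是一个句子")])
```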
spacy/language.py
@@ -5,7 +5,7 @@ import multiprocessing as mp
 import random
 import traceback
 import warnings
-from contextlib import ExitStack, contextmanager
+from contextlib import contextmanager
 from copy import deepcopy
 from dataclasses import dataclass
 from itertools import chain, cycle
@@ -30,11 +30,8 @@ from typing import (
     overload,
 )
 
-import numpy
 import srsly
-from cymem.cymem import Pool
 from thinc.api import Config, CupyOps, Optimizer, get_current_ops
-from thinc.util import convert_recursive
 
 from . import about, ty, util
 from .compat import Literal
@@ -104,6 +101,7 @@ class BaseDefaults:
     writing_system = {"direction": "ltr", "has_case": True, "has_letters": True}
 
 
+@registry.tokenizers("spacy.Tokenizer.v1")
 def create_tokenizer() -> Callable[["Language"], Tokenizer]:
     """Registered function to create a tokenizer. Returns a factory that takes
     the nlp object and returns a Tokenizer instance using the language detaults.
@@ -129,6 +127,7 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
     return tokenizer_factory
 
 
+@registry.misc("spacy.LookupsDataLoader.v1")
 def load_lookups_data(lang, tables):
     util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables)
     lookups = load_lookups(lang=lang, tables=tables)
@@ -141,7 +140,7 @@ class Language:
 
     Defaults (class): Settings, data and factory methods for creating the `nlp`
         object and processing pipeline.
-    lang (str): Two-letter ISO 639-1 or three-letter ISO 639-3 language codes, such as 'en' and 'eng'.
+    lang (str): IETF language code, such as 'en'.
 
     DOCS: https://spacy.io/api/language
     """
@@ -183,9 +182,6 @@ class Language:
 
         DOCS: https://spacy.io/api/language#init
         """
-        from .pipeline.factories import register_factories
-
-        register_factories()
         # We're only calling this to import all factories provided via entry
         # points. The factory decorator applied to these functions takes care
         # of the rest.
@@ -1215,7 +1211,7 @@ class Language:
             examples,
         ):
             eg.predicted = doc
-        return _replace_numpy_floats(losses)
+        return losses
 
     def rehearse(
         self,
@@ -1466,7 +1462,7 @@ class Language:
         results = scorer.score(examples, per_component=per_component)
         n_words = sum(len(eg.predicted) for eg in examples)
         results["speed"] = n_words / (end_time - start_time)
-        return _replace_numpy_floats(results)
+        return results
 
     def create_optimizer(self):
         """Create an optimizer, usually using the [training.optimizer] config."""
@@ -1687,12 +1683,6 @@ class Language:
         for proc in procs:
            proc.start()
 
-        # Close writing-end of channels. This is needed to avoid that reading
-        # from the channel blocks indefinitely when the worker closes the
-        # channel.
-        for tx in bytedocs_send_ch:
-            tx.close()
-
         # Cycle channels not to break the order of docs.
         # The received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable.
         byte_tuples = chain.from_iterable(
@@ -1715,27 +1705,8 @@ class Language:
                 # tell `sender` that one batch was consumed.
                 sender.step()
         finally:
-            # If we are stopping in an orderly fashion, the workers' queues
-            # are empty. Put the sentinel in their queues to signal that work
-            # is done, so that they can exit gracefully.
-            for q in texts_q:
-                q.put(_WORK_DONE_SENTINEL)
-                q.close()
-
-            # Otherwise, we are stopping because the error handler raised an
-            # exception. The sentinel will be last to go out of the queue.
-            # To avoid doing unnecessary work or hanging on platforms that
-            # block on sending (Windows), we'll close our end of the channel.
-            # This signals to the worker that it can exit the next time it
-            # attempts to send data down the channel.
-            for r in bytedocs_recv_ch:
-                r.close()
-
             for proc in procs:
-                proc.join()
-
-            if not all(proc.exitcode == 0 for proc in procs):
-                warnings.warn(Warnings.W127)
+                proc.terminate()
 
     def _link_components(self) -> None:
         """Register 'listeners' within pipeline components, to allow them to
@@ -2095,38 +2066,6 @@ class Language:
             util.replace_model_node(pipe.model, listener, new_model)  # type: ignore[attr-defined]
             tok2vec.remove_listener(listener, pipe_name)
 
-    @contextmanager
-    def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]:
-        """Begin a block where all resources allocated during the block will
-        be freed at the end of it. If a resources was created within the
-        memory zone block, accessing it outside the block is invalid.
-        Behaviour of this invalid access is undefined. Memory zones should
-        not be nested.
-
-        The memory zone is helpful for services that need to process large
-        volumes of text with a defined memory budget.
-
-        Example
-        -------
-        >>> with nlp.memory_zone():
-        ...     for doc in nlp.pipe(texts):
-        ...         process_my_doc(doc)
-        >>> # use_doc(doc) <-- Invalid: doc was allocated in the memory zone
-        """
-        if mem is None:
-            mem = Pool()
-        # The ExitStack allows programmatic nested context managers.
-        # We don't know how many we need, so it would be awkward to have
-        # them as nested blocks.
-        with ExitStack() as stack:
-            contexts = [stack.enter_context(self.vocab.memory_zone(mem))]
-            if hasattr(self.tokenizer, "memory_zone"):
-                contexts.append(stack.enter_context(self.tokenizer.memory_zone(mem)))
-            for _, pipe in self.pipeline:
-                if hasattr(pipe, "memory_zone"):
-                    contexts.append(stack.enter_context(pipe.memory_zone(mem)))
-            yield mem
-
     def to_disk(
         self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
     ) -> None:
@@ -2144,9 +2083,7 @@ class Language:
         serializers["tokenizer"] = lambda p: self.tokenizer.to_disk(  # type: ignore[union-attr]
             p, exclude=["vocab"]
         )
-        serializers["meta.json"] = lambda p: srsly.write_json(
-            p, _replace_numpy_floats(self.meta)
-        )
+        serializers["meta.json"] = lambda p: srsly.write_json(p, self.meta)
         serializers["config.cfg"] = lambda p: self.config.to_disk(p)
         for name, proc in self._components:
             if name in exclude:
@@ -2260,9 +2197,7 @@ class Language:
         serializers: Dict[str, Callable[[], bytes]] = {}
         serializers["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude)
         serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"])  # type: ignore[union-attr]
-        serializers["meta.json"] = lambda: srsly.json_dumps(
-            _replace_numpy_floats(self.meta)
-        )
+        serializers["meta.json"] = lambda: srsly.json_dumps(self.meta)
         serializers["config.cfg"] = lambda: self.config.to_bytes()
         for name, proc in self._components:
             if name in exclude:
@@ -2313,12 +2248,6 @@ class Language:
         return self
 
 
-def _replace_numpy_floats(meta_dict: dict) -> dict:
-    return convert_recursive(
-        lambda v: isinstance(v, numpy.floating), lambda v: float(v), dict(meta_dict)
-    )
-
-
 @dataclass
 class FactoryMeta:
     """Dataclass containing information about a component and its defaults
@@ -2394,13 +2323,6 @@ def _apply_pipes(
     while True:
        try:
            texts_with_ctx = receiver.get()
-
-            # Stop working if we encounter the end-of-work sentinel.
-            if isinstance(texts_with_ctx, _WorkDoneSentinel):
-                sender.close()
-                receiver.close()
-                return
-
            docs = (
                ensure_doc(doc_like, context) for doc_like, context in texts_with_ctx
            )
@@ -2409,23 +2331,11 @@ def _apply_pipes(
             # Connection does not accept unpickable objects, so send list.
             byte_docs = [(doc.to_bytes(), doc._context, None) for doc in docs]
             padding = [(None, None, None)] * (len(texts_with_ctx) - len(byte_docs))
-            data: Sequence[Tuple[Optional[bytes], Optional[Any], Optional[bytes]]] = (
-                byte_docs + padding  # type: ignore[operator]
-            )
+            sender.send(byte_docs + padding)  # type: ignore[operator]
         except Exception:
             error_msg = [(None, None, srsly.msgpack_dumps(traceback.format_exc()))]
             padding = [(None, None, None)] * (len(texts_with_ctx) - 1)
-            data = error_msg + padding
-
-        try:
-            sender.send(data)
-        except BrokenPipeError:
-            # Parent has closed the pipe prematurely. This happens when a
-            # worker encounters an error and the error handler is set to
-            # stop processing.
-            sender.close()
-            receiver.close()
-            return
+            sender.send(error_msg + padding)
 
 
 class _Sender:
@@ -2455,10 +2365,3 @@ class _Sender:
         if self.count >= self.chunk_size:
             self.count = 0
             self.send()
-
-
-class _WorkDoneSentinel:
-    pass
-
-
-_WORK_DONE_SENTINEL = _WorkDoneSentinel()
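The master-side shutdown logic above relies on an end-of-work sentinel rather than terminating workers. A minimal, self-contained sketch of that pattern (illustrative only, not spaCy's actual helper code):

```python
# Sketch of the sentinel-based shutdown used above: the parent enqueues a
# sentinel after the real work, and workers exit their loop when they see it,
# so processes can be join()ed instead of terminate()d.
import multiprocessing as mp


class WorkDone:
    pass


def worker(queue: "mp.Queue") -> None:
    while True:
        item = queue.get()
        if isinstance(item, WorkDone):  # end-of-work sentinel
            return
        print("processed", item)


if __name__ == "__main__":
    q: "mp.Queue" = mp.Queue()
    proc = mp.Process(target=worker, args=(q,))
    proc.start()
    for text in ("one text", "another text"):
        q.put(text)
    q.put(WorkDone())  # signal orderly shutdown
    proc.join()        # workers exit cleanly; no terminate() needed
```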
|
spacy/lexeme.pxd
@@ -35,7 +35,7 @@ cdef class Lexeme:
         return self
 
     @staticmethod
-    cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) noexcept nogil:
+    cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil:
         if name < (sizeof(flags_t) * 8):
             Lexeme.c_set_flag(lex, name, value)
         elif name == ID:
@@ -54,7 +54,7 @@ cdef class Lexeme:
             lex.lang = value
 
     @staticmethod
-    cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) noexcept nogil:
+    cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
         if feat_name < (sizeof(flags_t) * 8):
             if Lexeme.c_check_flag(lex, feat_name):
                 return 1
@@ -82,7 +82,7 @@ cdef class Lexeme:
             return 0
 
     @staticmethod
-    cdef inline bint c_check_flag(const LexemeC* lexeme, attr_id_t flag_id) noexcept nogil:
+    cdef inline bint c_check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
         cdef flags_t one = 1
         if lexeme.flags & (one << flag_id):
             return True
@@ -90,7 +90,7 @@ cdef class Lexeme:
         return False
 
     @staticmethod
-    cdef inline bint c_set_flag(LexemeC* lex, attr_id_t flag_id, bint value) noexcept nogil:
+    cdef inline bint c_set_flag(LexemeC* lex, attr_id_t flag_id, bint value) nogil:
         cdef flags_t one = 1
         if value:
             lex.flags |= one << flag_id
429 spacy/lexeme.pyx
@@ -70,7 +70,7 @@ cdef class Lexeme:
         if isinstance(other, Lexeme):
             a = self.orth
             b = other.orth
-        elif isinstance(other, int):
+        elif isinstance(other, long):
             a = self.orth
             b = other
         elif isinstance(other, str):
@@ -104,7 +104,7 @@ cdef class Lexeme:
             # skip PROB, e.g. from lexemes.jsonl
             if isinstance(value, float):
                 continue
-            elif isinstance(value, int):
+            elif isinstance(value, (int, long)):
                 Lexeme.set_struct_attr(self.c, attr, value)
             else:
                 Lexeme.set_struct_attr(self.c, attr, self.vocab.strings.add(value))
|
@ -164,48 +164,45 @@ cdef class Lexeme:
|
||||||
vector = self.vector
|
vector = self.vector
|
||||||
return numpy.sqrt((vector**2).sum())
|
return numpy.sqrt((vector**2).sum())
|
||||||
|
|
||||||
@property
|
property vector:
|
||||||
def vector(self):
|
|
||||||
"""A real-valued meaning representation.
|
"""A real-valued meaning representation.
|
||||||
|
|
||||||
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
|
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
|
||||||
representing the lexeme's semantics.
|
representing the lexeme's semantics.
|
||||||
"""
|
"""
|
||||||
cdef int length = self.vocab.vectors_length
|
def __get__(self):
|
||||||
if length == 0:
|
cdef int length = self.vocab.vectors_length
|
||||||
raise ValueError(Errors.E010)
|
if length == 0:
|
||||||
return self.vocab.get_vector(self.c.orth)
|
raise ValueError(Errors.E010)
|
||||||
|
return self.vocab.get_vector(self.c.orth)
|
||||||
|
|
||||||
@vector.setter
|
def __set__(self, vector):
|
||||||
def vector(self, vector):
|
if len(vector) != self.vocab.vectors_length:
|
||||||
if len(vector) != self.vocab.vectors_length:
|
raise ValueError(Errors.E073.format(new_length=len(vector),
|
||||||
raise ValueError(Errors.E073.format(new_length=len(vector),
|
length=self.vocab.vectors_length))
|
||||||
length=self.vocab.vectors_length))
|
self.vocab.set_vector(self.c.orth, vector)
|
||||||
self.vocab.set_vector(self.c.orth, vector)
|
|
||||||
|
|
||||||
@property
|
property rank:
|
||||||
def rank(self):
|
|
||||||
"""RETURNS (str): Sequential ID of the lexeme's lexical type, used
|
"""RETURNS (str): Sequential ID of the lexeme's lexical type, used
|
||||||
to index into tables, e.g. for word vectors."""
|
to index into tables, e.g. for word vectors."""
|
||||||
return self.c.id
|
def __get__(self):
|
||||||
|
return self.c.id
|
||||||
|
|
||||||
@rank.setter
|
def __set__(self, value):
|
||||||
def rank(self, value):
|
self.c.id = value
|
||||||
self.c.id = value
|
|
||||||
|
|
||||||
@property
|
property sentiment:
|
||||||
def sentiment(self):
|
|
||||||
"""RETURNS (float): A scalar value indicating the positivity or
|
"""RETURNS (float): A scalar value indicating the positivity or
|
||||||
negativity of the lexeme."""
|
negativity of the lexeme."""
|
||||||
sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {})
|
def __get__(self):
|
||||||
return sentiment_table.get(self.c.orth, 0.0)
|
sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {})
|
||||||
|
return sentiment_table.get(self.c.orth, 0.0)
|
||||||
|
|
||||||
@sentiment.setter
|
def __set__(self, float x):
|
||||||
def sentiment(self, float x):
|
if "lexeme_sentiment" not in self.vocab.lookups:
|
||||||
if "lexeme_sentiment" not in self.vocab.lookups:
|
self.vocab.lookups.add_table("lexeme_sentiment")
|
||||||
self.vocab.lookups.add_table("lexeme_sentiment")
|
sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment")
|
||||||
sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment")
|
sentiment_table[self.c.orth] = x
|
||||||
sentiment_table[self.c.orth] = x
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def orth_(self):
|
def orth_(self):
|
||||||
|
@ -219,338 +216,306 @@ cdef class Lexeme:
|
||||||
"""RETURNS (str): The original verbatim text of the lexeme."""
|
"""RETURNS (str): The original verbatim text of the lexeme."""
|
||||||
return self.orth_
|
return self.orth_
|
||||||
|
|
||||||
@property
|
property lower:
|
||||||
def lower(self):
|
|
||||||
"""RETURNS (uint64): Lowercase form of the lexeme."""
|
"""RETURNS (uint64): Lowercase form of the lexeme."""
|
||||||
return self.c.lower
|
def __get__(self):
|
||||||
|
return self.c.lower
|
||||||
|
|
||||||
@lower.setter
|
def __set__(self, attr_t x):
|
||||||
def lower(self, attr_t x):
|
self.c.lower = x
|
||||||
self.c.lower = x
|
|
||||||
|
|
||||||
@property
|
property norm:
|
||||||
def norm(self):
|
|
||||||
"""RETURNS (uint64): The lexeme's norm, i.e. a normalised form of the
|
"""RETURNS (uint64): The lexeme's norm, i.e. a normalised form of the
|
||||||
lexeme text.
|
lexeme text.
|
||||||
"""
|
"""
|
||||||
return self.c.norm
|
def __get__(self):
|
||||||
|
return self.c.norm
|
||||||
|
|
||||||
@norm.setter
|
def __set__(self, attr_t x):
|
||||||
def norm(self, attr_t x):
|
if "lexeme_norm" not in self.vocab.lookups:
|
||||||
if "lexeme_norm" not in self.vocab.lookups:
|
self.vocab.lookups.add_table("lexeme_norm")
|
||||||
self.vocab.lookups.add_table("lexeme_norm")
|
norm_table = self.vocab.lookups.get_table("lexeme_norm")
|
||||||
norm_table = self.vocab.lookups.get_table("lexeme_norm")
|
norm_table[self.c.orth] = self.vocab.strings[x]
|
||||||
norm_table[self.c.orth] = self.vocab.strings[x]
|
self.c.norm = x
|
||||||
self.c.norm = x
|
|
||||||
|
|
||||||
@property
|
property shape:
|
||||||
def shape(self):
|
|
||||||
"""RETURNS (uint64): Transform of the word's string, to show
|
"""RETURNS (uint64): Transform of the word's string, to show
|
||||||
orthographic features.
|
orthographic features.
|
||||||
"""
|
"""
|
||||||
return self.c.shape
|
def __get__(self):
|
||||||
|
return self.c.shape
|
||||||
|
|
||||||
@shape.setter
|
def __set__(self, attr_t x):
|
||||||
def shape(self, attr_t x):
|
self.c.shape = x
|
||||||
self.c.shape = x
|
|
||||||
|
|
||||||
@property
|
property prefix:
|
||||||
def prefix(self):
|
|
||||||
"""RETURNS (uint64): Length-N substring from the start of the word.
|
"""RETURNS (uint64): Length-N substring from the start of the word.
|
||||||
Defaults to `N=1`.
|
Defaults to `N=1`.
|
||||||
"""
|
"""
|
||||||
return self.c.prefix
|
def __get__(self):
|
||||||
|
return self.c.prefix
|
||||||
|
|
||||||
@prefix.setter
|
def __set__(self, attr_t x):
|
||||||
def prefix(self, attr_t x):
|
self.c.prefix = x
|
||||||
self.c.prefix = x
|
|
||||||
|
|
||||||
@property
|
property suffix:
|
||||||
def suffix(self):
|
|
||||||
"""RETURNS (uint64): Length-N substring from the end of the word.
|
"""RETURNS (uint64): Length-N substring from the end of the word.
|
||||||
Defaults to `N=3`.
|
Defaults to `N=3`.
|
||||||
"""
|
"""
|
||||||
return self.c.suffix
|
def __get__(self):
|
||||||
|
return self.c.suffix
|
||||||
|
|
||||||
@suffix.setter
|
def __set__(self, attr_t x):
|
||||||
def suffix(self, attr_t x):
|
self.c.suffix = x
|
||||||
self.c.suffix = x
|
|
||||||
|
|
||||||
@property
|
property cluster:
|
||||||
def cluster(self):
|
|
||||||
"""RETURNS (int): Brown cluster ID."""
|
"""RETURNS (int): Brown cluster ID."""
|
||||||
cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
|
def __get__(self):
|
||||||
return cluster_table.get(self.c.orth, 0)
|
cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
|
||||||
|
return cluster_table.get(self.c.orth, 0)
|
||||||
|
|
||||||
@cluster.setter
|
def __set__(self, int x):
|
||||||
def cluster(self, int x):
|
cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
|
||||||
cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
|
cluster_table[self.c.orth] = x
|
||||||
cluster_table[self.c.orth] = x
|
|
||||||
|
|
||||||
@property
|
property lang:
|
||||||
def lang(self):
|
|
||||||
"""RETURNS (uint64): Language of the parent vocabulary."""
|
"""RETURNS (uint64): Language of the parent vocabulary."""
|
||||||
return self.c.lang
|
def __get__(self):
|
||||||
|
return self.c.lang
|
||||||
|
|
||||||
@lang.setter
|
def __set__(self, attr_t x):
|
||||||
def lang(self, attr_t x):
|
self.c.lang = x
|
||||||
self.c.lang = x
|
|
||||||
|
|
||||||
@property
|
property prob:
|
||||||
def prob(self):
|
|
||||||
"""RETURNS (float): Smoothed log probability estimate of the lexeme's
|
"""RETURNS (float): Smoothed log probability estimate of the lexeme's
|
||||||
type."""
|
type."""
|
||||||
prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
|
def __get__(self):
|
||||||
settings_table = self.vocab.lookups.get_table("lexeme_settings", {})
|
prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
|
||||||
default_oov_prob = settings_table.get("oov_prob", -20.0)
|
settings_table = self.vocab.lookups.get_table("lexeme_settings", {})
|
||||||
return prob_table.get(self.c.orth, default_oov_prob)
|
default_oov_prob = settings_table.get("oov_prob", -20.0)
|
||||||
|
return prob_table.get(self.c.orth, default_oov_prob)
|
||||||
|
|
||||||
@prob.setter
|
def __set__(self, float x):
|
||||||
def prob(self, float x):
|
prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
|
||||||
prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
|
prob_table[self.c.orth] = x
|
||||||
prob_table[self.c.orth] = x
|
|
||||||
|
|
||||||
@property
|
property lower_:
|
||||||
def lower_(self):
|
|
||||||
"""RETURNS (str): Lowercase form of the word."""
|
"""RETURNS (str): Lowercase form of the word."""
|
||||||
return self.vocab.strings[self.c.lower]
|
def __get__(self):
|
||||||
|
return self.vocab.strings[self.c.lower]
|
||||||
|
|
||||||
@lower_.setter
|
def __set__(self, str x):
|
||||||
def lower_(self, str x):
|
self.c.lower = self.vocab.strings.add(x)
|
||||||
self.c.lower = self.vocab.strings.add(x)
|
|
||||||
|
|
||||||
@property
|
property norm_:
|
||||||
def norm_(self):
|
|
||||||
"""RETURNS (str): The lexeme's norm, i.e. a normalised form of the
|
"""RETURNS (str): The lexeme's norm, i.e. a normalised form of the
|
||||||
lexeme text.
|
lexeme text.
|
||||||
"""
|
"""
|
||||||
return self.vocab.strings[self.c.norm]
|
def __get__(self):
|
||||||
|
return self.vocab.strings[self.c.norm]
|
||||||
|
|
||||||
@norm_.setter
|
def __set__(self, str x):
|
||||||
def norm_(self, str x):
|
self.norm = self.vocab.strings.add(x)
|
||||||
self.norm = self.vocab.strings.add(x)
|
|
||||||
|
|
||||||
@property
|
property shape_:
|
||||||
def shape_(self):
|
|
||||||
"""RETURNS (str): Transform of the word's string, to show
|
"""RETURNS (str): Transform of the word's string, to show
|
||||||
orthographic features.
|
orthographic features.
|
||||||
"""
|
"""
|
||||||
return self.vocab.strings[self.c.shape]
|
def __get__(self):
|
||||||
|
return self.vocab.strings[self.c.shape]
|
||||||
|
|
||||||
@shape_.setter
|
def __set__(self, str x):
|
||||||
def shape_(self, str x):
|
self.c.shape = self.vocab.strings.add(x)
|
||||||
self.c.shape = self.vocab.strings.add(x)
|
|
||||||
|
|
||||||
@property
|
property prefix_:
|
||||||
def prefix_(self):
|
|
||||||
"""RETURNS (str): Length-N substring from the start of the word.
|
"""RETURNS (str): Length-N substring from the start of the word.
|
||||||
Defaults to `N=1`.
|
Defaults to `N=1`.
|
||||||
"""
|
"""
|
||||||
return self.vocab.strings[self.c.prefix]
|
def __get__(self):
|
||||||
|
return self.vocab.strings[self.c.prefix]
|
||||||
|
|
||||||
@prefix_.setter
|
def __set__(self, str x):
|
||||||
def prefix_(self, str x):
|
self.c.prefix = self.vocab.strings.add(x)
|
||||||
self.c.prefix = self.vocab.strings.add(x)
|
|
||||||
|
|
||||||
@property
|
property suffix_:
|
||||||
def suffix_(self):
|
|
||||||
"""RETURNS (str): Length-N substring from the end of the word.
|
"""RETURNS (str): Length-N substring from the end of the word.
|
||||||
Defaults to `N=3`.
|
Defaults to `N=3`.
|
||||||
"""
|
"""
|
||||||
return self.vocab.strings[self.c.suffix]
|
def __get__(self):
|
||||||
|
return self.vocab.strings[self.c.suffix]
|
||||||
|
|
||||||
@suffix_.setter
|
def __set__(self, str x):
|
||||||
def suffix_(self, str x):
|
self.c.suffix = self.vocab.strings.add(x)
|
||||||
self.c.suffix = self.vocab.strings.add(x)
|
|
||||||
|
|
||||||
@property
|
property lang_:
|
||||||
def lang_(self):
|
|
||||||
"""RETURNS (str): Language of the parent vocabulary."""
|
"""RETURNS (str): Language of the parent vocabulary."""
|
||||||
return self.vocab.strings[self.c.lang]
|
def __get__(self):
|
||||||
|
return self.vocab.strings[self.c.lang]
|
||||||
|
|
||||||
@lang_.setter
|
def __set__(self, str x):
|
||||||
def lang_(self, str x):
|
self.c.lang = self.vocab.strings.add(x)
|
||||||
self.c.lang = self.vocab.strings.add(x)
|
|
||||||
|
|
||||||
@property
|
property flags:
|
||||||
def flags(self):
|
|
||||||
"""RETURNS (uint64): Container of the lexeme's binary flags."""
|
"""RETURNS (uint64): Container of the lexeme's binary flags."""
|
||||||
return self.c.flags
|
def __get__(self):
|
||||||
|
return self.c.flags
|
||||||
|
|
||||||
@flags.setter
|
def __set__(self, flags_t x):
|
||||||
def flags(self, flags_t x):
|
self.c.flags = x
|
||||||
self.c.flags = x
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def is_oov(self):
|
def is_oov(self):
|
||||||
"""RETURNS (bool): Whether the lexeme is out-of-vocabulary."""
|
"""RETURNS (bool): Whether the lexeme is out-of-vocabulary."""
|
||||||
return self.orth not in self.vocab.vectors
|
return self.orth not in self.vocab.vectors
|
||||||
|
|
||||||
@property
|
property is_stop:
|
||||||
def is_stop(self):
|
|
||||||
"""RETURNS (bool): Whether the lexeme is a stop word."""
|
"""RETURNS (bool): Whether the lexeme is a stop word."""
|
||||||
return Lexeme.c_check_flag(self.c, IS_STOP)
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, IS_STOP)
|
||||||
|
|
||||||
@is_stop.setter
|
def __set__(self, bint x):
|
||||||
def is_stop(self, bint x):
|
Lexeme.c_set_flag(self.c, IS_STOP, x)
|
||||||
Lexeme.c_set_flag(self.c, IS_STOP, x)
|
|
||||||
|
|
||||||
@property
|
property is_alpha:
|
||||||
def is_alpha(self):
|
|
||||||
"""RETURNS (bool): Whether the lexeme consists of alphabetic
|
"""RETURNS (bool): Whether the lexeme consists of alphabetic
|
||||||
characters. Equivalent to `lexeme.text.isalpha()`.
|
characters. Equivalent to `lexeme.text.isalpha()`.
|
||||||
"""
|
"""
|
||||||
return Lexeme.c_check_flag(self.c, IS_ALPHA)
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, IS_ALPHA)
|
||||||
|
|
||||||
@is_alpha.setter
|
def __set__(self, bint x):
|
||||||
def is_alpha(self, bint x):
|
Lexeme.c_set_flag(self.c, IS_ALPHA, x)
|
||||||
Lexeme.c_set_flag(self.c, IS_ALPHA, x)
|
|
||||||
|
|
||||||
@property
|
property is_ascii:
|
||||||
def is_ascii(self):
|
|
||||||
"""RETURNS (bool): Whether the lexeme consists of ASCII characters.
|
"""RETURNS (bool): Whether the lexeme consists of ASCII characters.
|
||||||
Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`.
|
Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`.
|
||||||
"""
|
"""
|
||||||
return Lexeme.c_check_flag(self.c, IS_ASCII)
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, IS_ASCII)
|
||||||
|
|
||||||
@is_ascii.setter
|
def __set__(self, bint x):
|
||||||
def is_ascii(self, bint x):
|
Lexeme.c_set_flag(self.c, IS_ASCII, x)
|
||||||
Lexeme.c_set_flag(self.c, IS_ASCII, x)
|
|
||||||
|
|
||||||
@property
|
property is_digit:
|
||||||
def is_digit(self):
|
|
||||||
"""RETURNS (bool): Whether the lexeme consists of digits. Equivalent
|
"""RETURNS (bool): Whether the lexeme consists of digits. Equivalent
|
||||||
to `lexeme.text.isdigit()`.
|
to `lexeme.text.isdigit()`.
|
||||||
"""
|
"""
|
||||||
return Lexeme.c_check_flag(self.c, IS_DIGIT)
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, IS_DIGIT)
|
||||||
|
|
||||||
@is_digit.setter
|
def __set__(self, bint x):
|
||||||
def is_digit(self, bint x):
|
Lexeme.c_set_flag(self.c, IS_DIGIT, x)
|
||||||
Lexeme.c_set_flag(self.c, IS_DIGIT, x)
|
|
||||||
|
|
||||||
@property
|
property is_lower:
|
||||||
def is_lower(self):
|
|
||||||
"""RETURNS (bool): Whether the lexeme is in lowercase. Equivalent to
|
"""RETURNS (bool): Whether the lexeme is in lowercase. Equivalent to
|
||||||
`lexeme.text.islower()`.
|
`lexeme.text.islower()`.
|
||||||
"""
|
"""
|
||||||
return Lexeme.c_check_flag(self.c, IS_LOWER)
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, IS_LOWER)
|
||||||
|
|
||||||
@is_lower.setter
|
def __set__(self, bint x):
|
||||||
def is_lower(self, bint x):
|
Lexeme.c_set_flag(self.c, IS_LOWER, x)
|
||||||
Lexeme.c_set_flag(self.c, IS_LOWER, x)
|
|
||||||
|
|
||||||
@property
|
property is_upper:
|
||||||
def is_upper(self):
|
|
||||||
"""RETURNS (bool): Whether the lexeme is in uppercase. Equivalent to
|
"""RETURNS (bool): Whether the lexeme is in uppercase. Equivalent to
|
||||||
`lexeme.text.isupper()`.
|
`lexeme.text.isupper()`.
|
||||||
"""
|
"""
|
||||||
return Lexeme.c_check_flag(self.c, IS_UPPER)
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, IS_UPPER)
|
||||||
|
|
||||||
@is_upper.setter
|
def __set__(self, bint x):
|
||||||
def is_upper(self, bint x):
|
Lexeme.c_set_flag(self.c, IS_UPPER, x)
|
||||||
Lexeme.c_set_flag(self.c, IS_UPPER, x)
|
|
||||||
|
|
||||||
@property
|
property is_title:
|
||||||
def is_title(self):
|
|
||||||
"""RETURNS (bool): Whether the lexeme is in titlecase. Equivalent to
|
"""RETURNS (bool): Whether the lexeme is in titlecase. Equivalent to
|
||||||
`lexeme.text.istitle()`.
|
`lexeme.text.istitle()`.
|
||||||
"""
|
"""
|
||||||
return Lexeme.c_check_flag(self.c, IS_TITLE)
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, IS_TITLE)
|
||||||
|
|
||||||
@is_title.setter
|
def __set__(self, bint x):
|
||||||
def is_title(self, bint x):
|
Lexeme.c_set_flag(self.c, IS_TITLE, x)
|
||||||
Lexeme.c_set_flag(self.c, IS_TITLE, x)
|
|
||||||
|
|
||||||
@property
|
property is_punct:
|
||||||
def is_punct(self):
|
|
||||||
"""RETURNS (bool): Whether the lexeme is punctuation."""
|
"""RETURNS (bool): Whether the lexeme is punctuation."""
|
||||||
return Lexeme.c_check_flag(self.c, IS_PUNCT)
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, IS_PUNCT)
|
||||||
|
|
||||||
@is_punct.setter
|
def __set__(self, bint x):
|
||||||
def is_punct(self, bint x):
|
Lexeme.c_set_flag(self.c, IS_PUNCT, x)
|
||||||
Lexeme.c_set_flag(self.c, IS_PUNCT, x)
|
|
||||||
|
|
||||||
@property
|
property is_space:
|
||||||
def is_space(self):
|
|
||||||
"""RETURNS (bool): Whether the lexeme consist of whitespace characters.
|
"""RETURNS (bool): Whether the lexeme consist of whitespace characters.
|
||||||
Equivalent to `lexeme.text.isspace()`.
|
Equivalent to `lexeme.text.isspace()`.
|
||||||
"""
|
"""
|
||||||
return Lexeme.c_check_flag(self.c, IS_SPACE)
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, IS_SPACE)
|
||||||
|
|
||||||
@is_space.setter
|
def __set__(self, bint x):
|
||||||
def is_space(self, bint x):
|
Lexeme.c_set_flag(self.c, IS_SPACE, x)
|
||||||
Lexeme.c_set_flag(self.c, IS_SPACE, x)
|
|
||||||
|
|
||||||
@property
|
property is_bracket:
|
||||||
def is_bracket(self):
|
|
||||||
"""RETURNS (bool): Whether the lexeme is a bracket."""
|
"""RETURNS (bool): Whether the lexeme is a bracket."""
|
||||||
return Lexeme.c_check_flag(self.c, IS_BRACKET)
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, IS_BRACKET)
|
||||||
|
|
||||||
@is_bracket.setter
|
def __set__(self, bint x):
|
||||||
def is_bracket(self, bint x):
|
Lexeme.c_set_flag(self.c, IS_BRACKET, x)
|
||||||
Lexeme.c_set_flag(self.c, IS_BRACKET, x)
|
|
||||||
|
|
||||||
@property
|
property is_quote:
|
||||||
def is_quote(self):
|
|
||||||
"""RETURNS (bool): Whether the lexeme is a quotation mark."""
|
"""RETURNS (bool): Whether the lexeme is a quotation mark."""
|
||||||
return Lexeme.c_check_flag(self.c, IS_QUOTE)
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, IS_QUOTE)
|
||||||
|
|
||||||
@is_quote.setter
|
def __set__(self, bint x):
|
||||||
def is_quote(self, bint x):
|
Lexeme.c_set_flag(self.c, IS_QUOTE, x)
|
||||||
Lexeme.c_set_flag(self.c, IS_QUOTE, x)
|
|
||||||
|
|
||||||
@property
|
property is_left_punct:
|
||||||
def is_left_punct(self):
|
|
||||||
"""RETURNS (bool): Whether the lexeme is left punctuation, e.g. (."""
|
"""RETURNS (bool): Whether the lexeme is left punctuation, e.g. (."""
|
||||||
return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
|
||||||
|
|
||||||
@is_left_punct.setter
|
def __set__(self, bint x):
|
||||||
def is_left_punct(self, bint x):
|
Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)
|
||||||
Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)
|
|
||||||
|
|
||||||
@property
|
property is_right_punct:
|
||||||
def is_right_punct(self):
|
|
||||||
"""RETURNS (bool): Whether the lexeme is right punctuation, e.g. )."""
|
"""RETURNS (bool): Whether the lexeme is right punctuation, e.g. )."""
|
||||||
return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
|
||||||
|
|
||||||
@is_right_punct.setter
|
def __set__(self, bint x):
|
||||||
def is_right_punct(self, bint x):
|
Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
|
||||||
Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
|
|
||||||
|
|
||||||
@property
|
property is_currency:
|
||||||
def is_currency(self):
|
|
||||||
"""RETURNS (bool): Whether the lexeme is a currency symbol, e.g. $, €."""
|
"""RETURNS (bool): Whether the lexeme is a currency symbol, e.g. $, €."""
|
||||||
return Lexeme.c_check_flag(self.c, IS_CURRENCY)
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, IS_CURRENCY)
|
||||||
|
|
||||||
@is_currency.setter
|
def __set__(self, bint x):
|
||||||
def is_currency(self, bint x):
|
Lexeme.c_set_flag(self.c, IS_CURRENCY, x)
|
||||||
Lexeme.c_set_flag(self.c, IS_CURRENCY, x)
|
|
||||||
|
|
||||||
@property
|
property like_url:
|
||||||
def like_url(self):
|
|
||||||
"""RETURNS (bool): Whether the lexeme resembles a URL."""
|
"""RETURNS (bool): Whether the lexeme resembles a URL."""
|
||||||
return Lexeme.c_check_flag(self.c, LIKE_URL)
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, LIKE_URL)
|
||||||
|
|
||||||
@like_url.setter
|
def __set__(self, bint x):
|
||||||
def like_url(self, bint x):
|
Lexeme.c_set_flag(self.c, LIKE_URL, x)
|
||||||
Lexeme.c_set_flag(self.c, LIKE_URL, x)
|
|
||||||
|
|
||||||
@property
|
property like_num:
|
||||||
def like_num(self):
|
|
||||||
"""RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9",
|
"""RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9",
|
||||||
"10", "ten", etc.
|
"10", "ten", etc.
|
||||||
"""
|
"""
|
||||||
return Lexeme.c_check_flag(self.c, LIKE_NUM)
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, LIKE_NUM)
|
||||||
|
|
||||||
@like_num.setter
|
def __set__(self, bint x):
|
||||||
def like_num(self, bint x):
|
Lexeme.c_set_flag(self.c, LIKE_NUM, x)
|
||||||
Lexeme.c_set_flag(self.c, LIKE_NUM, x)
|
|
||||||
|
|
||||||
@property
|
property like_email:
|
||||||
def like_email(self):
|
|
||||||
"""RETURNS (bool): Whether the lexeme resembles an email address."""
|
"""RETURNS (bool): Whether the lexeme resembles an email address."""
|
||||||
return Lexeme.c_check_flag(self.c, LIKE_EMAIL)
|
def __get__(self):
|
||||||
|
return Lexeme.c_check_flag(self.c, LIKE_EMAIL)
|
||||||
|
|
||||||
@like_email.setter
|
def __set__(self, bint x):
|
||||||
def like_email(self, bint x):
|
Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)
|
||||||
Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)
|
|
||||||
|
|
|
spacy/matcher/__init__.py
@@ -3,4 +3,4 @@ from .levenshtein import levenshtein
 from .matcher import Matcher
 from .phrasematcher import PhraseMatcher
 
-__all__ = ["DependencyMatcher", "Matcher", "PhraseMatcher", "levenshtein"]
+__all__ = ["Matcher", "PhraseMatcher", "DependencyMatcher", "levenshtein"]
spacy/matcher/levenshtein.pyx
@@ -1,4 +1,4 @@
-# cython: binding=True, infer_types=True, language_level=3
+# cython: binding=True, infer_types=True
 from cpython.object cimport PyObject
 from libc.stdint cimport int64_t
 
@@ -27,5 +27,6 @@ cpdef bint levenshtein_compare(input_text: str, pattern_text: str, fuzzy: int =
     return levenshtein(input_text, pattern_text, max_edits) <= max_edits
 
 
+@registry.misc("spacy.levenshtein_compare.v1")
 def make_levenshtein_compare():
     return levenshtein_compare
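A usage sketch tied to the comparator above (illustrative only): the Matcher's fuzzy operators are backed by `levenshtein_compare`.

```python
# Sketch: FUZZY1 allows one edit, so "helo" matches the pattern word "hello".
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
matcher.add("GREETING", [[{"LOWER": {"FUZZY1": "hello"}}]])
doc = nlp("helo world")
print([doc[start:end].text for _, start, end in matcher(doc)])  # ['helo']
```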
spacy/matcher/matcher.pyx
@@ -625,7 +625,7 @@ cdef action_t get_action(
     const TokenC * token,
     const attr_t * extra_attrs,
     const int8_t * predicate_matches
-) noexcept nogil:
+) nogil:
     """We need to consider:
     a) Does the token match the specification? [Yes, No]
     b) What's the quantifier? [1, 0+, ?]
@@ -740,7 +740,7 @@ cdef int8_t get_is_match(
     const TokenC* token,
     const attr_t* extra_attrs,
     const int8_t* predicate_matches
-) noexcept nogil:
+) nogil:
     for i in range(state.pattern.nr_py):
         if predicate_matches[state.pattern.py_predicates[i]] == -1:
             return 0
@@ -755,14 +755,14 @@ cdef int8_t get_is_match(
     return True
 
 
-cdef inline int8_t get_is_final(PatternStateC state) noexcept nogil:
+cdef inline int8_t get_is_final(PatternStateC state) nogil:
     if state.pattern[1].quantifier == FINAL_ID:
         return 1
     else:
         return 0
 
 
-cdef inline int8_t get_quantifier(PatternStateC state) noexcept nogil:
+cdef inline int8_t get_quantifier(PatternStateC state) nogil:
     return state.pattern.quantifier
 
 
@@ -805,7 +805,7 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs)
     return pattern
 
 
-cdef attr_t get_ent_id(const TokenPatternC* pattern) noexcept nogil:
+cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
     while pattern.quantifier != FINAL_ID:
         pattern += 1
     id_attr = pattern[0].attrs[0]
@@ -47,7 +47,7 @@ cdef class PhraseMatcher:
         self._terminal_hash = 826361138722620965
         map_init(self.mem, self.c_map, 8)

-        if isinstance(attr, int):
+        if isinstance(attr, (int, long)):
             self.attr = attr
         else:
             if attr is None:
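Note: `attr` accepts either a string name or an integer attribute ID, which is why both sides of this hunk accept plain ints. A small sketch of typical PhraseMatcher usage (text and labels are illustrative):

    import spacy
    from spacy.matcher import PhraseMatcher

    nlp = spacy.blank("en")
    # Match on the LOWER attribute so patterns are case-insensitive
    matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    matcher.add("PRODUCT", [nlp.make_doc("Machine Learning")])
    doc = nlp("We use machine learning at work.")
    print(matcher(doc))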
@@ -7,6 +7,7 @@ from ..tokens import Doc
 from ..util import registry


+@registry.layers("spacy.CharEmbed.v1")
 def CharacterEmbed(nM: int, nC: int) -> Model[List[Doc], List[Floats2d]]:
     # nM: Number of dimensions per character. nC: Number of characters.
     return Model(
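Note: on the v3.7.0 side these layers register themselves at import time; the decorated factory can then be resolved by name. A sketch, assuming the registered name from this hunk (nM/nC values are illustrative):

    from spacy.util import registry

    # Look up the factory registered by the decorator above and build the layer
    CharacterEmbed = registry.layers.get("spacy.CharEmbed.v1")
    layer = CharacterEmbed(nM=64, nC=8)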
@@ -3,6 +3,7 @@ from thinc.api import Model, normal_init
 from ..util import registry


+@registry.layers("spacy.PrecomputableAffine.v1")
 def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
     model = Model(
         "precomputable_affine",
@@ -50,6 +50,7 @@ def models_with_nvtx_range(nlp, forward_color: int, backprop_color: int):
     return nlp


+@registry.callbacks("spacy.models_with_nvtx_range.v1")
 def create_models_with_nvtx_range(
     forward_color: int = -1, backprop_color: int = -1
 ) -> Callable[["Language"], "Language"]:
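Note: this callback is meant to be wired into the pipeline config rather than called directly. A hedged sketch of enabling it via a config override for GPU profiling (the model name is a placeholder; verify the override key against your spaCy version):

    import spacy

    nlp = spacy.load(
        "en_core_web_sm",
        config={
            "nlp": {
                "after_pipeline_creation": {"@callbacks": "spacy.models_with_nvtx_range.v1"}
            }
        },
    )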
@@ -109,6 +110,7 @@ def pipes_with_nvtx_range(
     return nlp


+@registry.callbacks("spacy.models_and_pipes_with_nvtx_range.v1")
 def create_models_and_pipes_with_nvtx_range(
     forward_color: int = -1,
     backprop_color: int = -1,
@@ -4,6 +4,7 @@ from ..attrs import LOWER
 from ..util import registry


+@registry.layers("spacy.extract_ngrams.v1")
 def extract_ngrams(ngram_size: int, attr: int = LOWER) -> Model:
     model: Model = Model("extract_ngrams", forward)
     model.attrs["ngram_size"] = ngram_size
@@ -6,6 +6,7 @@ from thinc.types import Ints1d, Ragged
 from ..util import registry


+@registry.layers("spacy.extract_spans.v1")
 def extract_spans() -> Model[Tuple[Ragged, Ragged], Ragged]:
     """Extract spans from a sequence of source arrays, as specified by an array
     of (start, end) indices. The output is a ragged array of the
@@ -6,9 +6,8 @@ from thinc.types import Ints2d
 from ..tokens import Doc


-def FeatureExtractor(
-    columns: Union[List[str], List[int], List[Union[int, str]]]
-) -> Model[List[Doc], List[Ints2d]]:
+@registry.layers("spacy.FeatureExtractor.v1")
+def FeatureExtractor(columns: List[Union[int, str]]) -> Model[List[Doc], List[Ints2d]]:
     return Model("extract_features", forward, attrs={"columns": columns})

@@ -28,6 +28,7 @@ from ...vocab import Vocab
 from ..extract_spans import extract_spans


+@registry.architectures("spacy.EntityLinker.v2")
 def build_nel_encoder(
     tok2vec: Model, nO: Optional[int] = None
 ) -> Model[List[Doc], Floats2d]:
@@ -91,6 +92,7 @@ def span_maker_forward(model, docs: List[Doc], is_train) -> Tuple[Ragged, Callab
     return out, lambda x: []


+@registry.misc("spacy.KBFromFile.v1")
 def load_kb(
     kb_path: Path,
 ) -> Callable[[Vocab], KnowledgeBase]:
@@ -102,6 +104,7 @@ def load_kb(
     return kb_from_file


+@registry.misc("spacy.EmptyKB.v2")
 def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]:
     def empty_kb_factory(vocab: Vocab, entity_vector_length: int):
         return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length)
@@ -109,6 +112,7 @@ def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]:
     return empty_kb_factory


+@registry.misc("spacy.EmptyKB.v1")
 def empty_kb(
     entity_vector_length: int,
 ) -> Callable[[Vocab], KnowledgeBase]:
@@ -118,10 +122,12 @@ def empty_kb(
     return empty_kb_factory


+@registry.misc("spacy.CandidateGenerator.v1")
 def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
     return get_candidates


+@registry.misc("spacy.CandidateBatchGenerator.v1")
 def create_candidates_batch() -> Callable[
     [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
 ]:
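Note: these misc functions are normally referenced by name from the entity_linker component config rather than imported directly. A sketch of passing the candidate generator when adding the component (pipeline setup is illustrative):

    import spacy

    nlp = spacy.blank("en")
    entity_linker = nlp.add_pipe(
        "entity_linker",
        config={"get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}},
    )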
@@ -30,6 +30,7 @@ if TYPE_CHECKING:
     from ...vocab import Vocab  # noqa: F401


+@registry.architectures("spacy.PretrainVectors.v1")
 def create_pretrain_vectors(
     maxout_pieces: int, hidden_size: int, loss: str
 ) -> Callable[["Vocab", Model], Model]:
@@ -56,6 +57,7 @@ def create_pretrain_vectors(
     return create_vectors_objective


+@registry.architectures("spacy.PretrainCharacters.v1")
 def create_pretrain_characters(
     maxout_pieces: int, hidden_size: int, n_characters: int
 ) -> Callable[["Vocab", Model], Model]:
@@ -11,6 +11,7 @@ from .._precomputable_affine import PrecomputableAffine
 from ..tb_framework import TransitionModel


+@registry.architectures("spacy.TransitionBasedParser.v2")
 def build_tb_parser_model(
     tok2vec: Model[List[Doc], List[Floats2d]],
     state_type: Literal["parser", "ner"],
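Note: the registered architecture is resolved from the component config. A sketch of adding a parser with this model; the hyperparameter values are illustrative, not defaults taken from this diff:

    import spacy

    nlp = spacy.blank("en")
    parser_model = {
        "@architectures": "spacy.TransitionBasedParser.v2",
        "state_type": "parser",
        "extra_state_tokens": False,
        "hidden_width": 128,
        "maxout_pieces": 3,
        "use_upper": True,
        "tok2vec": {
            "@architectures": "spacy.HashEmbedCNN.v2",
            "pretrained_vectors": None,
            "width": 96,
            "depth": 4,
            "embed_size": 2000,
            "window_size": 1,
            "maxout_pieces": 3,
            "subword_features": True,
        },
    }
    nlp.add_pipe("parser", config={"model": parser_model})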
@@ -10,6 +10,7 @@ InT = List[Doc]
 OutT = Floats2d


+@registry.architectures("spacy.SpanFinder.v1")
 def build_finder_model(
     tok2vec: Model[InT, List[Floats2d]], scorer: Model[OutT, OutT]
 ) -> Model[InT, OutT]:
@@ -22,6 +22,7 @@ from ...util import registry
 from ..extract_spans import extract_spans


+@registry.layers("spacy.LinearLogistic.v1")
 def build_linear_logistic(nO=None, nI=None) -> Model[Floats2d, Floats2d]:
     """An output layer for multi-label classification. It uses a linear layer
     followed by a logistic activation.
@@ -29,6 +30,7 @@ def build_linear_logistic(nO=None, nI=None) -> Model[Floats2d, Floats2d]:
     return chain(Linear(nO=nO, nI=nI, init_W=glorot_uniform_init), Logistic())


+@registry.layers("spacy.mean_max_reducer.v1")
 def build_mean_max_reducer(hidden_size: int) -> Model[Ragged, Floats2d]:
     """Reduce sequences by concatenating their mean and max pooled vectors,
     and then combine the concatenated vectors with a hidden layer.
@@ -44,6 +46,7 @@ def build_mean_max_reducer(hidden_size: int) -> Model[Ragged, Floats2d]:
     )


+@registry.architectures("spacy.SpanCategorizer.v1")
 def build_spancat_model(
     tok2vec: Model[List[Doc], List[Floats2d]],
     reducer: Model[Ragged, Floats2d],
@@ -7,6 +7,7 @@ from ...tokens import Doc
 from ...util import registry


+@registry.architectures("spacy.Tagger.v2")
 def build_tagger_model(
     tok2vec: Model[List[Doc], List[Floats2d]], nO: Optional[int] = None, normalize=False
 ) -> Model[List[Doc], List[Floats2d]]:
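Note: the same builders can also be composed directly in Python. A sketch with illustrative hyperparameters, assuming the builder names shown in these hunks:

    from spacy.ml.models.tagger import build_tagger_model
    from spacy.ml.models.tok2vec import build_hash_embed_cnn_tok2vec

    tok2vec = build_hash_embed_cnn_tok2vec(
        width=96,
        depth=4,
        embed_size=2000,
        window_size=1,
        maxout_pieces=3,
        subword_features=True,
        pretrained_vectors=None,
    )
    tagger_model = build_tagger_model(tok2vec)  # nO is resolved later from the labels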
@@ -1,27 +1,21 @@
 from functools import partial
-from typing import List, Optional, Tuple, cast
+from typing import List, Optional, cast

 from thinc.api import (
     Dropout,
-    Gelu,
     LayerNorm,
     Linear,
     Logistic,
     Maxout,
     Model,
     ParametricAttention,
-    ParametricAttention_v2,
     Relu,
     Softmax,
     SparseLinear,
-    SparseLinear_v2,
     chain,
     clone,
     concatenate,
     list2ragged,
-    reduce_first,
-    reduce_last,
-    reduce_max,
     reduce_mean,
     reduce_sum,
     residual,
@@ -31,10 +25,9 @@ from thinc.api import (
 )
 from thinc.layers.chain import init as init_chain
 from thinc.layers.resizable import resize_linear_weighted, resize_model
-from thinc.types import ArrayXd, Floats2d
+from thinc.types import Floats2d

 from ...attrs import ORTH
-from ...errors import Errors
 from ...tokens import Doc
 from ...util import registry
 from ..extract_ngrams import extract_ngrams
@@ -44,6 +37,7 @@ from .tok2vec import get_tok2vec_width
 NEG_VALUE = -5000


+@registry.architectures("spacy.TextCatCNN.v2")
 def build_simple_cnn_text_classifier(
     tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None
 ) -> Model[List[Doc], Floats2d]:
@@ -53,15 +47,39 @@ def build_simple_cnn_text_classifier(
     outputs sum to 1. If exclusive_classes=False, a logistic non-linearity
     is applied instead, so that outputs are in the range [0, 1].
     """
-    return build_reduce_text_classifier(
-        tok2vec=tok2vec,
-        exclusive_classes=exclusive_classes,
-        use_reduce_first=False,
-        use_reduce_last=False,
-        use_reduce_max=False,
-        use_reduce_mean=True,
-        nO=nO,
-    )
+    fill_defaults = {"b": 0, "W": 0}
+    with Model.define_operators({">>": chain}):
+        cnn = tok2vec >> list2ragged() >> reduce_mean()
+        nI = tok2vec.maybe_get_dim("nO")
+        if exclusive_classes:
+            output_layer = Softmax(nO=nO, nI=nI)
+            fill_defaults["b"] = NEG_VALUE
+            resizable_layer: Model = resizable(
+                output_layer,
+                resize_layer=partial(
+                    resize_linear_weighted, fill_defaults=fill_defaults
+                ),
+            )
+            model = cnn >> resizable_layer
+        else:
+            output_layer = Linear(nO=nO, nI=nI)
+            resizable_layer = resizable(
+                output_layer,
+                resize_layer=partial(
+                    resize_linear_weighted, fill_defaults=fill_defaults
+                ),
+            )
+            model = cnn >> resizable_layer >> Logistic()
+        model.set_ref("output_layer", output_layer)
+        model.attrs["resize_output"] = partial(
+            resize_and_set_ref,
+            resizable_layer=resizable_layer,
+        )
+    model.set_ref("tok2vec", tok2vec)
+    if nO is not None:
+        model.set_dim("nO", cast(int, nO))
+    model.attrs["multi_label"] = not exclusive_classes
+    return model


 def resize_and_set_ref(model, new_nO, resizable_layer):
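Note: both sides ultimately build "tok2vec, mean-pool, output layer"; only the plumbing differs. A sketch of requesting this architecture when adding a textcat component (values are illustrative, not defaults from this diff):

    import spacy

    nlp = spacy.blank("en")
    textcat_model = {
        "@architectures": "spacy.TextCatCNN.v2",
        "exclusive_classes": True,
        "nO": None,
        "tok2vec": {
            "@architectures": "spacy.HashEmbedCNN.v2",
            "pretrained_vectors": None,
            "width": 96,
            "depth": 4,
            "embed_size": 2000,
            "window_size": 1,
            "maxout_pieces": 3,
            "subword_features": True,
        },
    }
    nlp.add_pipe("textcat", config={"model": textcat_model})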
@@ -71,52 +89,16 @@ def resize_and_set_ref(model, new_nO, resizable_layer):
     return model


+@registry.architectures("spacy.TextCatBOW.v2")
 def build_bow_text_classifier(
     exclusive_classes: bool,
     ngram_size: int,
     no_output_layer: bool,
     nO: Optional[int] = None,
-) -> Model[List[Doc], Floats2d]:
-    return _build_bow_text_classifier(
-        exclusive_classes=exclusive_classes,
-        ngram_size=ngram_size,
-        no_output_layer=no_output_layer,
-        nO=nO,
-        sparse_linear=SparseLinear(nO=nO),
-    )
-
-
-def build_bow_text_classifier_v3(
-    exclusive_classes: bool,
-    ngram_size: int,
-    no_output_layer: bool,
-    length: int = 262144,
-    nO: Optional[int] = None,
-) -> Model[List[Doc], Floats2d]:
-    if length < 1:
-        raise ValueError(Errors.E1056.format(length=length))
-
-    # Find k such that 2**(k-1) < length <= 2**k.
-    length = 2 ** (length - 1).bit_length()
-
-    return _build_bow_text_classifier(
-        exclusive_classes=exclusive_classes,
-        ngram_size=ngram_size,
-        no_output_layer=no_output_layer,
-        nO=nO,
-        sparse_linear=SparseLinear_v2(nO=nO, length=length),
-    )
-
-
-def _build_bow_text_classifier(
-    exclusive_classes: bool,
-    ngram_size: int,
-    no_output_layer: bool,
-    sparse_linear: Model[Tuple[ArrayXd, ArrayXd, ArrayXd], ArrayXd],
-    nO: Optional[int] = None,
 ) -> Model[List[Doc], Floats2d]:
     fill_defaults = {"b": 0, "W": 0}
     with Model.define_operators({">>": chain}):
+        sparse_linear = SparseLinear(nO=nO)
         output_layer = None
         if not no_output_layer:
             fill_defaults["b"] = NEG_VALUE
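Note: the public entry point keeps the same signature on both sides, so it can still be called directly. A minimal sketch (argument values are illustrative):

    from spacy.ml.models.textcat import build_bow_text_classifier

    bow_model = build_bow_text_classifier(
        exclusive_classes=True, ngram_size=1, no_output_layer=False, nO=3
    )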
@@ -139,14 +121,12 @@ def _build_bow_text_classifier(
     return model


+@registry.architectures("spacy.TextCatEnsemble.v2")
 def build_text_classifier_v2(
     tok2vec: Model[List[Doc], List[Floats2d]],
     linear_model: Model[List[Doc], Floats2d],
     nO: Optional[int] = None,
 ) -> Model[List[Doc], Floats2d]:
-    # TODO: build the model with _build_parametric_attention_with_residual_nonlinear
-    # in spaCy v4. We don't do this in spaCy v3 to preserve model
-    # compatibility.
     exclusive_classes = not linear_model.attrs["multi_label"]
     with Model.define_operators({">>": chain, "|": concatenate}):
         width = tok2vec.maybe_get_dim("nO")
@@ -181,11 +161,6 @@ def build_text_classifier_v2(


 def init_ensemble_textcat(model, X, Y) -> Model:
-    # When tok2vec is lazily initialized, we need to initialize it before
-    # the rest of the chain to ensure that we can get its width.
-    tok2vec = model.get_ref("tok2vec")
-    tok2vec.initialize(X)
-
     tok2vec_width = get_tok2vec_width(model)
     model.get_ref("attention_layer").set_dim("nO", tok2vec_width)
     model.get_ref("maxout_layer").set_dim("nO", tok2vec_width)
@@ -196,6 +171,7 @@ def init_ensemble_textcat(model, X, Y) -> Model:
     return model


+@registry.architectures("spacy.TextCatLowData.v1")
 def build_text_classifier_lowdata(
     width: int, dropout: Optional[float], nO: Optional[int] = None
 ) -> Model[List[Doc], Floats2d]:
@@ -214,151 +190,3 @@ def build_text_classifier_lowdata(
         model = model >> Dropout(dropout)
     model = model >> Logistic()
     return model
-
-
-def build_textcat_parametric_attention_v1(
-    tok2vec: Model[List[Doc], List[Floats2d]],
-    exclusive_classes: bool,
-    nO: Optional[int] = None,
-) -> Model[List[Doc], Floats2d]:
-    width = tok2vec.maybe_get_dim("nO")
-    parametric_attention = _build_parametric_attention_with_residual_nonlinear(
-        tok2vec=tok2vec,
-        nonlinear_layer=Maxout(nI=width, nO=width),
-        key_transform=Gelu(nI=width, nO=width),
-    )
-    with Model.define_operators({">>": chain}):
-        if exclusive_classes:
-            output_layer = Softmax(nO=nO)
-        else:
-            output_layer = Linear(nO=nO) >> Logistic()
-        model = parametric_attention >> output_layer
-    if model.has_dim("nO") is not False and nO is not None:
-        model.set_dim("nO", cast(int, nO))
-    model.set_ref("output_layer", output_layer)
-    model.attrs["multi_label"] = not exclusive_classes
-
-    return model
-
-
-def _build_parametric_attention_with_residual_nonlinear(
-    *,
-    tok2vec: Model[List[Doc], List[Floats2d]],
-    nonlinear_layer: Model[Floats2d, Floats2d],
-    key_transform: Optional[Model[Floats2d, Floats2d]] = None,
-) -> Model[List[Doc], Floats2d]:
-    with Model.define_operators({">>": chain, "|": concatenate}):
-        width = tok2vec.maybe_get_dim("nO")
-        attention_layer = ParametricAttention_v2(nO=width, key_transform=key_transform)
-        norm_layer = LayerNorm(nI=width)
-        parametric_attention = (
-            tok2vec
-            >> list2ragged()
-            >> attention_layer
-            >> reduce_sum()
-            >> residual(nonlinear_layer >> norm_layer >> Dropout(0.0))
-        )
-
-        parametric_attention.init = _init_parametric_attention_with_residual_nonlinear
-
-        parametric_attention.set_ref("tok2vec", tok2vec)
-        parametric_attention.set_ref("attention_layer", attention_layer)
-        parametric_attention.set_ref("key_transform", key_transform)
-        parametric_attention.set_ref("nonlinear_layer", nonlinear_layer)
-        parametric_attention.set_ref("norm_layer", norm_layer)
-
-    return parametric_attention
-
-
-def _init_parametric_attention_with_residual_nonlinear(model, X, Y) -> Model:
-    # When tok2vec is lazily initialized, we need to initialize it before
-    # the rest of the chain to ensure that we can get its width.
-    tok2vec = model.get_ref("tok2vec")
-    tok2vec.initialize(X)
-
-    tok2vec_width = get_tok2vec_width(model)
-    model.get_ref("attention_layer").set_dim("nO", tok2vec_width)
-    model.get_ref("key_transform").set_dim("nI", tok2vec_width)
-    model.get_ref("key_transform").set_dim("nO", tok2vec_width)
-    model.get_ref("nonlinear_layer").set_dim("nI", tok2vec_width)
-    model.get_ref("nonlinear_layer").set_dim("nO", tok2vec_width)
-    model.get_ref("norm_layer").set_dim("nI", tok2vec_width)
-    model.get_ref("norm_layer").set_dim("nO", tok2vec_width)
-    init_chain(model, X, Y)
-    return model
-
-
-def build_reduce_text_classifier(
-    tok2vec: Model,
-    exclusive_classes: bool,
-    use_reduce_first: bool,
-    use_reduce_last: bool,
-    use_reduce_max: bool,
-    use_reduce_mean: bool,
-    nO: Optional[int] = None,
-) -> Model[List[Doc], Floats2d]:
-    """Build a model that classifies pooled `Doc` representations.
-
-    Pooling is performed using reductions. Reductions are concatenated when
-    multiple reductions are used.
-
-    tok2vec (Model): the tok2vec layer to pool over.
-    exclusive_classes (bool): Whether or not classes are mutually exclusive.
-    use_reduce_first (bool): Pool by using the hidden representation of the
-        first token of a `Doc`.
-    use_reduce_last (bool): Pool by using the hidden representation of the
-        last token of a `Doc`.
-    use_reduce_max (bool): Pool by taking the maximum values of the hidden
-        representations of a `Doc`.
-    use_reduce_mean (bool): Pool by taking the mean of all hidden
-        representations of a `Doc`.
-    nO (Optional[int]): Number of classes.
-    """
-
-    fill_defaults = {"b": 0, "W": 0}
-    reductions = []
-    if use_reduce_first:
-        reductions.append(reduce_first())
-    if use_reduce_last:
-        reductions.append(reduce_last())
-    if use_reduce_max:
-        reductions.append(reduce_max())
-    if use_reduce_mean:
-        reductions.append(reduce_mean())
-
-    if not len(reductions):
-        raise ValueError(Errors.E1057)
-
-    with Model.define_operators({">>": chain}):
-        cnn = tok2vec >> list2ragged() >> concatenate(*reductions)
-        nO_tok2vec = tok2vec.maybe_get_dim("nO")
-        nI = nO_tok2vec * len(reductions) if nO_tok2vec is not None else None
-        if exclusive_classes:
-            output_layer = Softmax(nO=nO, nI=nI)
-            fill_defaults["b"] = NEG_VALUE
-            resizable_layer: Model = resizable(
-                output_layer,
-                resize_layer=partial(
-                    resize_linear_weighted, fill_defaults=fill_defaults
-                ),
-            )
-            model = cnn >> resizable_layer
-        else:
-            output_layer = Linear(nO=nO, nI=nI)
-            resizable_layer = resizable(
-                output_layer,
-                resize_layer=partial(
-                    resize_linear_weighted, fill_defaults=fill_defaults
-                ),
-            )
-            model = cnn >> resizable_layer >> Logistic()
-        model.set_ref("output_layer", output_layer)
-        model.attrs["resize_output"] = partial(
-            resize_and_set_ref,
-            resizable_layer=resizable_layer,
-        )
-    model.set_ref("tok2vec", tok2vec)
-    if nO is not None:
-        model.set_dim("nO", cast(int, nO))
-    model.attrs["multi_label"] = not exclusive_classes
-    return model
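Note: the pooling behind the removed `build_reduce_text_classifier` is just a concatenation of per-Doc reductions over ragged token vectors. A rough thinc sketch of that idea (not a drop-in replacement for the code above):

    from thinc.api import chain, concatenate, list2ragged, reduce_max, reduce_mean

    # Token vectors become a ragged array, then each Doc is reduced to a single
    # feature vector by concatenating its mean- and max-pooled representations.
    pooling = chain(list2ragged(), concatenate(reduce_mean(), reduce_max()))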
@@ -29,6 +29,7 @@ from ..featureextractor import FeatureExtractor
 from ..staticvectors import StaticVectors


+@registry.architectures("spacy.Tok2VecListener.v1")
 def tok2vec_listener_v1(width: int, upstream: str = "*"):
     tok2vec = Tok2VecListener(upstream_name=upstream, width=width)
     return tok2vec
@@ -45,6 +46,7 @@ def get_tok2vec_width(model: Model):
     return nO


+@registry.architectures("spacy.HashEmbedCNN.v2")
 def build_hash_embed_cnn_tok2vec(
     *,
     width: int,
@@ -100,6 +102,7 @@ def build_hash_embed_cnn_tok2vec(
     )


+@registry.architectures("spacy.Tok2Vec.v2")
 def build_Tok2Vec_model(
     embed: Model[List[Doc], List[Floats2d]],
     encode: Model[List[Floats2d], List[Floats2d]],
@@ -120,9 +123,10 @@ def build_Tok2Vec_model(
     return tok2vec


+@registry.architectures("spacy.MultiHashEmbed.v2")
 def MultiHashEmbed(
     width: int,
-    attrs: Union[List[str], List[int], List[Union[str, int]]],
+    attrs: List[Union[str, int]],
     rows: List[int],
     include_static_vectors: bool,
 ) -> Model[List[Doc], List[Floats2d]]:
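Note: a small sketch of building this embedding layer directly; the widths, attrs and row counts mirror commonly documented values and are only illustrative here. Each entry in rows corresponds to the attr at the same position:

    from spacy.ml.models.tok2vec import MultiHashEmbed

    embed = MultiHashEmbed(
        width=96,
        attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"],
        rows=[5000, 1000, 2500, 2500],
        include_static_vectors=False,
    )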
@@ -188,7 +192,7 @@ def MultiHashEmbed(
         )
     else:
         model = chain(
-            FeatureExtractor(attrs),
+            FeatureExtractor(list(attrs)),
             cast(Model[List[Ints2d], Ragged], list2ragged()),
             with_array(concatenate(*embeddings)),
             max_out,
@@ -197,6 +201,7 @@ def MultiHashEmbed(
     return model


+@registry.architectures("spacy.CharacterEmbed.v2")
 def CharacterEmbed(
     width: int,
     rows: int,
@@ -273,6 +278,7 @@ def CharacterEmbed(
     return model


+@registry.architectures("spacy.MaxoutWindowEncoder.v2")
 def MaxoutWindowEncoder(
     width: int, window_size: int, maxout_pieces: int, depth: int
 ) -> Model[List[Floats2d], List[Floats2d]]:
@@ -304,6 +310,7 @@ def MaxoutWindowEncoder(
     return with_array(model, pad=receptive_field)


+@registry.architectures("spacy.MishWindowEncoder.v2")
 def MishWindowEncoder(
     width: int, window_size: int, depth: int
 ) -> Model[List[Floats2d], List[Floats2d]]:
@@ -326,6 +333,7 @@ def MishWindowEncoder(
     return with_array(model)


+@registry.architectures("spacy.TorchBiLSTMEncoder.v1")
 def BiLSTMEncoder(
     width: int, depth: int, dropout: float
 ) -> Model[List[Floats2d], List[Floats2d]]:
@@ -52,14 +52,14 @@ cdef SizesC get_c_sizes(model, int batch_size) except *:
     return output


-cdef ActivationsC alloc_activations(SizesC n) noexcept nogil:
+cdef ActivationsC alloc_activations(SizesC n) nogil:
     cdef ActivationsC A
     memset(&A, 0, sizeof(A))
     resize_activations(&A, n)
     return A


-cdef void free_activations(const ActivationsC* A) noexcept nogil:
+cdef void free_activations(const ActivationsC* A) nogil:
     free(A.token_ids)
     free(A.scores)
     free(A.unmaxed)
@@ -67,7 +67,7 @@ cdef void free_activations(const ActivationsC* A) noexcept nogil:
     free(A.is_valid)


-cdef void resize_activations(ActivationsC* A, SizesC n) noexcept nogil:
+cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
     if n.states <= A._max_size:
         A._curr_size = n.states
         return
@@ -100,7 +100,7 @@ cdef void resize_activations(ActivationsC* A, SizesC n) noexcept nogil:

 cdef void predict_states(
     CBlas cblas, ActivationsC* A, StateC** states, const WeightsC* W, SizesC n
-) noexcept nogil:
+) nogil:
     resize_activations(A, n)
     for i in range(n.states):
         states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
@@ -159,7 +159,7 @@ cdef void sum_state_features(
     int B,
     int F,
     int O
-) noexcept nogil:
+) nogil:
     cdef int idx, b, f
     cdef const float* feature
     padding = cached
@@ -183,7 +183,7 @@ cdef void cpu_log_loss(
     const int* is_valid,
     const float* scores,
     int O
-) noexcept nogil:
+) nogil:
     """Do multi-label log loss"""
     cdef double max_, gmax, Z, gZ
     best = arg_max_if_gold(scores, costs, is_valid, O)
@@ -209,7 +209,7 @@ cdef void cpu_log_loss(

 cdef int arg_max_if_gold(
     const weight_t* scores, const weight_t* costs, const int* is_valid, int n
-) noexcept nogil:
+) nogil:
     # Find minimum cost
     cdef float cost = 1
     for i in range(n):
@@ -224,7 +224,7 @@ cdef int arg_max_if_gold(
     return best


-cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) noexcept nogil:
+cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil:
     cdef int best = -1
     for i in range(n):
         if is_valid[i] >= 1:
@@ -13,6 +13,7 @@ from ..vectors import Mode, Vectors
 from ..vocab import Vocab


+@registry.layers("spacy.StaticVectors.v2")
 def StaticVectors(
     nO: Optional[int] = None,
     nM: Optional[int] = None,
@@ -4,6 +4,7 @@ from ..util import registry
 from .parser_model import ParserStepModel


+@registry.layers("spacy.TransitionModel.v1")
 def TransitionModel(
     tok2vec, lower, upper, resize_output, dropout=0.2, unseen_classes=set()
 ):
Some files were not shown because too many files have changed in this diff.