Mirror of https://github.com/explosion/spaCy.git, synced 2025-07-11 00:32:40 +03:00

Merge branch 'master' into pr/13418

This commit is contained in:
commit 37dd13a96b
92  .github/workflows/cibuildwheel.yml  (vendored, new file)
@@ -0,0 +1,92 @@
name: Build

on:
  push:
    tags:
      # ytf did they invent their own syntax that's almost regex?
      # ** matches 'zero or more of any character'
      - 'release-v[0-9]+.[0-9]+.[0-9]+**'
      - 'prerelease-v[0-9]+.[0-9]+.[0-9]+**'

jobs:
  build_wheels:
    name: Build wheels on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        # macos-13 is an intel runner, macos-14 is apple silicon
        os: [ubuntu-latest, windows-latest, macos-13]

    steps:
      - uses: actions/checkout@v4
      - name: Build wheels
        uses: pypa/cibuildwheel@v2.19.1
        env:
          CIBW_SOME_OPTION: value
        with:
          package-dir: .
          output-dir: wheelhouse
          config-file: "{package}/pyproject.toml"
      - uses: actions/upload-artifact@v4
        with:
          name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }}
          path: ./wheelhouse/*.whl

  build_sdist:
    name: Build source distribution
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Build sdist
        run: pipx run build --sdist
      - uses: actions/upload-artifact@v4
        with:
          name: cibw-sdist
          path: dist/*.tar.gz

  create_release:
    needs: [build_wheels, build_sdist]
    runs-on: ubuntu-latest
    permissions:
      contents: write
      checks: write
      actions: read
      issues: read
      packages: write
      pull-requests: read
      repository-projects: read
      statuses: read
    steps:
      - name: Get the tag name and determine if it's a prerelease
        id: get_tag_info
        run: |
          FULL_TAG=${GITHUB_REF#refs/tags/}
          if [[ $FULL_TAG == release-* ]]; then
            TAG_NAME=${FULL_TAG#release-}
            IS_PRERELEASE=false
          elif [[ $FULL_TAG == prerelease-* ]]; then
            TAG_NAME=${FULL_TAG#prerelease-}
            IS_PRERELEASE=true
          else
            echo "Tag does not match expected patterns" >&2
            exit 1
          fi
          echo "FULL_TAG=$TAG_NAME" >> $GITHUB_ENV
          echo "TAG_NAME=$TAG_NAME" >> $GITHUB_ENV
          echo "IS_PRERELEASE=$IS_PRERELEASE" >> $GITHUB_ENV
      - uses: actions/download-artifact@v4
        with:
          # unpacks all CIBW artifacts into dist/
          pattern: cibw-*
          path: dist
          merge-multiple: true
      - name: Create Draft Release
        id: create_release
        uses: softprops/action-gh-release@v2
        if: startsWith(github.ref, 'refs/tags/')
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          name: ${{ env.TAG_NAME }}
          draft: true
          prerelease: ${{ env.IS_PRERELEASE }}
          files: "./dist/*"
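For reference, a rough Python approximation of the two tag filters above. GitHub's filter syntax is its own glob dialect, not regex, so this translation is only illustrative and slightly over-permissive:

import re

# '[0-9]+' = one or more digits, '**' = zero or more of any character.
RELEASE = re.compile(r"^release-v[0-9]+\.[0-9]+\.[0-9]+.*$")
PRERELEASE = re.compile(r"^prerelease-v[0-9]+\.[0-9]+\.[0-9]+.*$")

for tag in ["release-v3.8.0", "prerelease-v3.8.0.dev0", "v3.8.0"]:
    kind = "release" if RELEASE.match(tag) else "prerelease" if PRERELEASE.match(tag) else "ignored"
    print(tag, "->", kind)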
2  .github/workflows/explosionbot.yml  (vendored)
@@ -15,7 +15,7 @@ jobs:
         env:
           GITHUB_CONTEXT: ${{ toJson(github) }}
         run: echo "$GITHUB_CONTEXT"
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - uses: actions/setup-python@v4
       - name: Install and run explosion-bot
         run: |
2  .github/workflows/lock.yml  (vendored)
@@ -16,7 +16,7 @@ jobs:
     if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
-      - uses: dessant/lock-threads@v4
+      - uses: dessant/lock-threads@v5
         with:
          process-only: 'issues'
          issue-inactive-days: '30'
29  .github/workflows/publish_pypi.yml  (vendored, new file)
@@ -0,0 +1,29 @@
# The cibuildwheel action triggers on creation of a release, this
# triggers on publication.
# The expected workflow is to create a draft release and let the wheels
# upload, and then hit 'publish', which uploads to PyPi.

on:
  release:
    types:
      - published

jobs:
  upload_pypi:
    runs-on: ubuntu-latest
    environment:
      name: pypi
      url: https://pypi.org/p/spacy
    permissions:
      id-token: write
      contents: read
    if: github.event_name == 'release' && github.event.action == 'published'
    # or, alternatively, upload to PyPI on every tag starting with 'v' (remove on: release above to use this)
    # if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
    steps:
      - uses: robinraju/release-downloader@v1
        with:
          tag: ${{ github.event.release.tag_name }}
          fileName: '*'
          out-file-path: 'dist'
      - uses: pypa/gh-action-pypi-publish@release/v1
@@ -14,7 +14,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           ref: ${{ matrix.branch }}
       - name: Get commits from past 24 hours
2  .github/workflows/spacy_universe_alert.yml  (vendored)
@@ -18,7 +18,7 @@ jobs:
         run: |
           echo "$GITHUB_CONTEXT"

-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - uses: actions/setup-python@v4
         with:
           python-version: '3.10'
6  .github/workflows/tests.yml  (vendored)
@@ -25,13 +25,12 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Check out repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Configure Python version
         uses: actions/setup-python@v4
         with:
           python-version: "3.7"
-          architecture: x64

       - name: black
         run: |
@@ -75,13 +74,12 @@ jobs:

     steps:
       - name: Check out repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Configure Python version
         uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python_version }}
-          architecture: x64

       - name: Install dependencies
         run: |
3  .github/workflows/universe_validation.yml  (vendored)
@@ -20,13 +20,12 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Check out repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

      - name: Configure Python version
        uses: actions/setup-python@v4
        with:
          python-version: "3.7"
-          architecture: x64

      - name: Validate website/meta/universe.json
        run: |
2  LICENSE
@@ -1,6 +1,6 @@
 The MIT License (MIT)

-Copyright (C) 2016-2023 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
+Copyright (C) 2016-2024 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -11,5 +11,58 @@ requires = [
 ]
 build-backend = "setuptools.build_meta"

+[tool.cibuildwheel]
+build = "*"
+skip = "pp* cp36* cp37* cp38* *-win32"
+test-skip = ""
+free-threaded-support = false
+
+archs = ["native"]
+
+build-frontend = "default"
+config-settings = {}
+dependency-versions = "pinned"
+environment = { PIP_CONSTRAINT = "build-constraints.txt" }
+
+environment-pass = []
+build-verbosity = 0
+
+before-all = "curl https://sh.rustup.rs -sSf | sh -s -- -y --profile minimal --default-toolchain stable"
+before-build = "pip install -r requirements.txt && python setup.py clean"
+repair-wheel-command = ""
+
+test-command = ""
+before-test = ""
+test-requires = []
+test-extras = []
+
+container-engine = "docker"
+
+manylinux-x86_64-image = "manylinux2014"
+manylinux-i686-image = "manylinux2014"
+manylinux-aarch64-image = "manylinux2014"
+manylinux-ppc64le-image = "manylinux2014"
+manylinux-s390x-image = "manylinux2014"
+manylinux-pypy_x86_64-image = "manylinux2014"
+manylinux-pypy_i686-image = "manylinux2014"
+manylinux-pypy_aarch64-image = "manylinux2014"
+
+musllinux-x86_64-image = "musllinux_1_2"
+musllinux-i686-image = "musllinux_1_2"
+musllinux-aarch64-image = "musllinux_1_2"
+musllinux-ppc64le-image = "musllinux_1_2"
+musllinux-s390x-image = "musllinux_1_2"
+
+[tool.cibuildwheel.linux]
+repair-wheel-command = "auditwheel repair -w {dest_dir} {wheel}"
+
+[tool.cibuildwheel.macos]
+repair-wheel-command = "delocate-wheel --require-archs {delocate_archs} -w {dest_dir} -v {wheel}"
+
+[tool.cibuildwheel.windows]
+
+[tool.cibuildwheel.pyodide]
+
+
 [tool.isort]
 profile = "black"
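To sanity-check this configuration locally, the table can be read back with the standard library. A minimal sketch, assuming Python 3.11+ and that the file above is saved as pyproject.toml in the current directory:

import tomllib

# Load the [tool.cibuildwheel] table and print a couple of the settings.
with open("pyproject.toml", "rb") as f:
    cfg = tomllib.load(f)["tool"]["cibuildwheel"]
print(cfg["skip"])
print(cfg["manylinux-x86_64-image"])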
@@ -9,7 +9,7 @@ murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
 srsly>=2.4.3,<3.0.0
 catalogue>=2.0.6,<2.1.0
-typer>=0.3.0,<0.10.0
+typer>=0.3.0,<1.0.0
 weasel>=0.1.0,<0.5.0
 # Third party dependencies
 numpy>=1.15.0; python_version < "3.9"
@@ -22,7 +22,6 @@ langcodes>=3.2.0,<4.0.0
 # Official Python utilities
 setuptools
 packaging>=20.0
-typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
 # Development dependencies
 pre-commit>=2.13.0
 cython>=0.25,<3.0
@@ -22,6 +22,7 @@ classifiers =
     Programming Language :: Python :: 3.9
     Programming Language :: Python :: 3.10
     Programming Language :: Python :: 3.11
+    Programming Language :: Python :: 3.12
     Topic :: Scientific/Engineering
 project_urls =
     Release notes = https://github.com/explosion/spaCy/releases
@@ -55,7 +56,7 @@ install_requires =
     catalogue>=2.0.6,<2.1.0
     weasel>=0.1.0,<0.5.0
     # Third-party dependencies
-    typer>=0.3.0,<0.10.0
+    typer>=0.3.0,<1.0.0
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0; python_version < "3.9"
     numpy>=1.19.0; python_version >= "3.9"
@@ -65,7 +66,6 @@ install_requires =
     # Official Python utilities
     setuptools
     packaging>=20.0
-    typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
     langcodes>=3.2.0,<4.0.0

 [options.entry_points]
@@ -1,5 +1,5 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.7.4"
+__version__ = "3.8.0.dev0"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
@@ -39,7 +39,7 @@ def find_threshold_cli(
     # fmt: on
 ):
     """
-    Runs prediction trials for a trained model with varying tresholds to maximize
+    Runs prediction trials for a trained model with varying thresholds to maximize
     the specified metric. The search space for the threshold is traversed linearly
     from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout`
     (the corresponding API call to `spacy.cli.find_threshold.find_threshold()`
@@ -81,7 +81,7 @@ def find_threshold(
     silent: bool = True,
 ) -> Tuple[float, float, Dict[float, float]]:
     """
-    Runs prediction trials for models with varying tresholds to maximize the specified metric.
+    Runs prediction trials for models with varying thresholds to maximize the specified metric.
     model (Union[str, Path]): Pipeline to evaluate. Can be a package or a path to a data directory.
     data_path (Path): Path to file with DocBin with docs to use for threshold search.
     pipe_name (str): Name of pipe to examine thresholds for.
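The linear threshold search described in the docstring can be sketched in plain Python. This is only an illustrative stand-alone loop with the same return shape, not the actual spaCy implementation; score_for_threshold is a hypothetical scoring callback:

def linear_threshold_search(score_for_threshold, n_trials=11):
    # Evaluate the metric at evenly spaced thresholds in [0, 1] and keep the best.
    scores = {}
    for i in range(n_trials):
        threshold = i / (n_trials - 1)
        scores[threshold] = score_for_threshold(threshold)
    best_threshold = max(scores, key=scores.get)
    return best_threshold, scores[best_threshold], scores

# Toy metric that peaks around 0.6:
best, best_score, all_scores = linear_threshold_search(lambda t: 1 - abs(t - 0.6))
print(best, round(best_score, 2))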
16  spacy/lang/bo/__init__.py  (new file)
@@ -0,0 +1,16 @@
from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS


class TibetanDefaults(BaseDefaults):
    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS


class Tibetan(Language):
    lang = "bo"
    Defaults = TibetanDefaults


__all__ = ["Tibetan"]
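Once a language subclass like this is registered, a blank pipeline can be created the usual way. A minimal sketch, assuming a spaCy build that includes this commit:

import spacy

# Create a blank Tibetan pipeline and tokenize a short text.
nlp = spacy.blank("bo")
doc = nlp("ཏཱ་ལའི་བླ་མ་")
print([token.text for token in doc])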
16  spacy/lang/bo/examples.py  (new file)
@@ -0,0 +1,16 @@
"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.bo.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "དོན་དུ་རྒྱ་མཚོ་བླ་མ་ཞེས་བྱ་ཞིང༌།",
    "ཏཱ་ལའི་ཞེས་པ་ནི་སོག་སྐད་ཡིན་པ་དེ་བོད་སྐད་དུ་རྒྱ་མཚོའི་དོན་དུ་འཇུག",
    "སོག་པོ་ཨལ་ཐན་རྒྱལ་པོས་རྒྱལ་དབང་བསོད་ནམས་རྒྱ་མཚོར་ཆེ་བསྟོད་ཀྱི་མཚན་གསོལ་བ་ཞིག་ཡིན་ཞིང༌།",
    "རྗེས་སུ་རྒྱལ་བ་དགེ་འདུན་གྲུབ་དང༌། དགེ་འདུན་རྒྱ་མཚོ་སོ་སོར་ཡང་ཏཱ་ལའི་བླ་མའི་སྐུ་ཕྲེང་དང་པོ་དང༌།",
    "གཉིས་པའི་མཚན་དེ་གསོལ་ཞིང༌།༸རྒྱལ་དབང་སྐུ་ཕྲེང་ལྔ་པས་དགའ་ལྡན་ཕོ་བྲང་གི་སྲིད་དབང་བཙུགས་པ་ནས་ཏཱ་ལའི་བླ་མ་ནི་བོད་ཀྱི་ཆོས་སྲིད་གཉིས་ཀྱི་དབུ་ཁྲིད་དུ་གྱུར་ཞིང་།",
    "ད་ལྟའི་བར་ཏཱ་ལའི་བླ་མ་སྐུ་ཕྲེང་བཅུ་བཞི་བྱོན་ཡོད།",
]
65  spacy/lang/bo/lex_attrs.py  (new file)
@@ -0,0 +1,65 @@
from ...attrs import LIKE_NUM

# reference 1: https://en.wikipedia.org/wiki/Tibetan_numerals

_num_words = [
    "ཀླད་ཀོར་", "གཅིག་", "གཉིས་", "གསུམ་", "བཞི་", "ལྔ་", "དྲུག་", "བདུན་",
    "བརྒྱད་", "དགུ་", "བཅུ་", "བཅུ་གཅིག་", "བཅུ་གཉིས་", "བཅུ་གསུམ་", "བཅུ་བཞི་",
    "བཅུ་ལྔ་", "བཅུ་དྲུག་", "བཅུ་བདུན་", "བཅུ་པརྒྱད", "བཅུ་དགུ་", "ཉི་ཤུ་",
    "སུམ་ཅུ", "བཞི་བཅུ", "ལྔ་བཅུ", "དྲུག་ཅུ", "བདུན་ཅུ", "བརྒྱད་ཅུ", "དགུ་བཅུ",
    "བརྒྱ་", "སྟོང་", "ཁྲི་", "ས་ཡ་", " བྱེ་བ་", "དུང་ཕྱུར་", "ཐེར་འབུམ་",
    "ཐེར་འབུམ་ཆེན་པོ་", "ཁྲག་ཁྲིག་", "ཁྲག་ཁྲིག་ཆེན་པོ་",
]


def like_num(text):
    """
    Check if text resembles a number
    """
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    if text in _num_words:
        return True
    return False


LEX_ATTRS = {LIKE_NUM: like_num}
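A quick illustration of what like_num above accepts; a hypothetical REPL-style check, assuming the module is importable from a build containing this commit:

from spacy.lang.bo.lex_attrs import like_num

print(like_num("༢༠༢༤"))   # True: Tibetan digits satisfy str.isdigit()
print(like_num("གཉིས་"))  # True: listed in _num_words
print(like_num("3/4"))     # True: simple fraction of digits
print(like_num("བོད་"))   # False: not a number word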
198  spacy/lang/bo/stop_words.py  (new file)
@@ -0,0 +1,198 @@
# Source: https://zenodo.org/records/10148636

STOP_WORDS = set(
    """
འི་ ། དུ་ གིས་ སོགས་ ཏེ གི་ རྣམས་ ནི ཀུན་ ཡི་ འདི ཀྱི་ སྙེད་ པས་ གཞན་
ཀྱིས་ ཡི ལ ནི་ དང་ སོགས ཅིང་ ར དུ མི་ སུ་ བཅས་ ཡོངས་ ལས ཙམ་ གྱིས་
དེ་ ཡང་ མཐའ་དག་ ཏུ་ ཉིད་ ས ཏེ་ གྱི་ སྤྱི དེ ཀ་ ཡིན་ ཞིང་ འདི་ རུང་ རང་
ཞིག་ སྟེ སྟེ་ ན་རེ ངམ ཤིང་ དག་ ཏོ རེ་ འང་ ཀྱང་ ལགས་པ ཚུ དོ ཡིན་པ རེ
ན་རེ་ ཨེ་ ཚང་མ ཐམས་ཅད་ དམ་ འོ་ ཅིག་ གྱིན་ ཡིན ན ཁོ་ན་ འམ་ ཀྱིན་ ལོ
ཀྱིས བས་ ལགས་ ཤིག གིས ཀི་ སྣ་ཚོགས་ རྣམས སྙེད་པ ཡིས་ གྱི གི བམ་ ཤིག་
རེ་རེ་ ནམ མིན་ ནམ་ ངམ་ རུ་ འགའ་ ཀུན ཤས་ ཏུ ཡིས གིན་ གམ་ འོ ཡིན་པ་
མིན ལགས གྱིས ཅང་ འགའ སམ་ ཞིག འང ལས་ཆེ་ འཕྲལ་ བར་ རུ དང ཡ འག
སམ ཀ ཅུང་ཟད་ ཅིག ཉིད དུ་མ མ ཡིན་བ འམ མམ དམ དག ཁོ་ན ཀྱི ལམ ཕྱི་
ནང་ ཙམ ནོ་ སོ་ རམ་ བོ་ ཨང་ ཕྱི ཏོ་ ཚོ ལ་ལ་ ཚོ་ ཅིང མ་གི་ གེ གོ
ཡིན་ལུགས་ རོ་ བོ ལགས་པ་ པས རབ་ འི རམ བས གཞན སྙེད་པ་ འབའ་ མཾ་ པོ
ག་ ག གམ སྤྱི་ བམ མོ་ ཙམ་པ་ ཤ་སྟག་ མམ་ རེ་རེ སྙེད ཏམ་ ངོ གྲང་ ཏ་རེ ཏམ
ཁ་ ངེ་ ཅོག་ རིལ་ ཉུང་ཤས་ གིང་ ཚ་ ཀྱང
""".split()
)
18  spacy/lang/gd/__init__.py  (new file)
@@ -0,0 +1,18 @@
from typing import Optional

from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


class ScottishDefaults(BaseDefaults):
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    stop_words = STOP_WORDS


class Scottish(Language):
    lang = "gd"
    Defaults = ScottishDefaults


__all__ = ["Scottish"]
388  spacy/lang/gd/stop_words.py  (new file)
@@ -0,0 +1,388 @@
STOP_WORDS = set(
    """
'ad
'ar
'd # iad
'g # ag
'ga
'gam
'gan
'gar
'gur
'm # am
'n # an
'n seo
'na
'nad
'nam
'nan
'nar
'nuair
'nur
's
'sa
'san
'sann
'se
'sna
a
a'
a'd # agad
a'm # agam
a-chèile
a-seo
a-sin
a-siud
a chionn
a chionn 's
a chèile
a chéile
a dh'
a h-uile
a seo
ac' # aca
aca
aca-san
acasan
ach
ag
agad
agad-sa
agads'
agadsa
agaibh
agaibhse
againn
againne
agam
agam-sa
agams'
agamsa
agus
aice
aice-se
aicese
aig
aig' # aige
aige
aige-san
aigesan
air
air-san
air neo
airsan
am
an
an seo
an sin
an siud
an uair
ann
ann a
ann a'
ann a shin
ann am
ann an
annad
annam
annam-s'
annamsa
anns
anns an
annta
aon
ar
as
asad
asda
asta
b'
bho
bhon
bhuaidhe # bhuaithe
bhuainn
bhuaipe
bhuaithe
bhuapa
bhur
brì
bu
c'à
car son
carson
cha
chan
chionn
choir
chon
chun
chèile
chéile
chòir
cia mheud
ciamar
co-dhiubh
cuide
cuin
cuin'
cuine
cà
cà'
càil
càit
càit'
càite
cò
cò mheud
có
d'
da
de
dh'
dha
dhaibh
dhaibh-san
dhaibhsan
dhan
dhasan
dhe
dhen
dheth
dhi
dhiom
dhiot
dhith
dhiubh
dhomh
dhomh-s'
dhomhsa
dhu'sa # dhut-sa
dhuibh
dhuibhse
dhuinn
dhuinne
dhuit
dhut
dhutsa
dhut-sa
dhà
dhà-san
dhàsan
dhòmhsa
diubh
do
docha
don
dà
dè
dè mar
dé
dé mar
dòch'
dòcha
e
eadar
eatarra
eatorra
eile
esan
fa
far
feud
fhad
fheudar
fhearr
fhein
fheudar
fheàrr
fhèin
fhéin
fhìn
fo
fodha
fodhainn
foipe
fon
fèin
ga
gach
gam
gan
ge brith
ged
gu
gu dè
gu ruige
gun
gur
gus
i
iad
iadsan
innte
is
ise
le
leam
leam-sa
leamsa
leat
leat-sa
leatha
leatsa
leibh
leis
leis-san
leoth'
leotha
leotha-san
linn
m'
m'a
ma
mac
man
mar
mas
mathaid
mi
mis'
mise
mo
mu
mu 'n
mun
mur
mura
mus
na
na b'
na bu
na iad
nach
nad
nam
nan
nar
nas
neo
no
nuair
o
o'n
oir
oirbh
oirbh-se
oirnn
oirnne
oirre
on
orm
orm-sa
ormsa
orra
orra-san
orrasan
ort
os
r'
ri
ribh
rinn
ris
rithe
rithe-se
rium
rium-sa
riums'
riumsa
riut
riuth'
riutha
riuthasan
ro
ro'n
roimh
roimhe
romhainn
romham
romhpa
ron
ruibh
ruinn
ruinne
sa
san
sann
se
seach
seo
seothach
shin
sibh
sibh-se
sibhse
sin
sineach
sinn
sinne
siod
siodach
siud
siudach
sna # ann an
sè
t'
tarsaing
tarsainn
tarsuinn
thar
thoigh
thro
thu
thuc'
thuca
thugad
thugaibh
thugainn
thugam
thugamsa
thuice
thuige
thus'
thusa
timcheall
toigh
toil
tro
tro' # troimh
troimh
troimhe
tron
tu
tusa
uair
ud
ugaibh
ugam-s'
ugam-sa
uice
uige
uige-san
umad
unnta # ann an
ur
urrainn
à
às
àsan
á
ás
è
ì
ò
ó
""".split(
        "\n"
    )
)
1983  spacy/lang/gd/tokenizer_exceptions.py  (new file)
File diff suppressed because it is too large.
16  spacy/lang/kmr/__init__.py  (new file)
@@ -0,0 +1,16 @@
from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS


class KurmanjiDefaults(BaseDefaults):
    stop_words = STOP_WORDS
    lex_attr_getters = LEX_ATTRS


class Kurmanji(Language):
    lang = "kmr"
    Defaults = KurmanjiDefaults


__all__ = ["Kurmanji"]
17  spacy/lang/kmr/examples.py  (new file)
@@ -0,0 +1,17 @@
"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.kmr.examples import sentences
>>> docs = nlp.pipe(sentences)
"""

sentences = [
    "Berê mirovan her tim li geşedana pêşerojê ye",  # People's gaze is always on the development of the future
    "Kawa Nemir di 14 salan de Ulysses wergerand Kurmancî.",  # Kawa Nemir translated Ulysses into Kurmanji in 14 years.
    "Mem Ararat hunermendekî Kurd yê bi nav û deng e.",  # Mem Ararat is a famous Kurdish artist
    "Firat Cewerî 40 sal e pirtûkên Kurdî dinivîsîne.",  # Firat Ceweri has been writing Kurdish books for 40 years
    "Rojnamegerê ciwan nûçeyeke balkêş li ser rewşa aborî nivîsand",  # The young journalist wrote an interesting news article about the economic situation
    "Sektora çandiniyê beşeke giring a belavkirina gaza serayê li seranserê cîhanê pêk tîne",  # The agricultural sector constitutes an important part of greenhouse gas emissions worldwide
    "Xwendekarên jêhatî di pêşbaziya matematîkê de serkeftî bûn",  # Talented students succeeded in the mathematics competition
    "Ji ber ji tunebûnê bavê min xwişkeke min nedan xwendin ew ji min re bû derd û kulek.",  # Because of poverty, my father didn't send my sister to school, which became a pain and sorrow for me
]
138  spacy/lang/kmr/lex_attrs.py  (new file)
@@ -0,0 +1,138 @@
from ...attrs import LIKE_NUM

_num_words = [
    "sifir", "yek", "du", "sê", "çar", "pênc", "şeş", "heft", "heşt", "neh",
    "deh", "yazde", "dazde", "sêzde", "çarde", "pazde", "şazde", "hevde",
    "hejde", "nozde", "bîst", "sî", "çil", "pêncî", "şêst", "heftê", "heştê",
    "nod", "sed", "hezar", "milyon", "milyar",
]

_ordinal_words = [
    "yekem", "yekemîn", "duyem", "duyemîn", "sêyem", "sêyemîn", "çarem",
    "çaremîn", "pêncem", "pêncemîn", "şeşem", "şeşemîn", "heftem", "heftemîn",
    "heştem", "heştemîn", "nehem", "nehemîn", "dehem", "dehemîn",
    "yazdehem", "yazdehemîn", "dazdehem", "dazdehemîn", "sêzdehem",
    "sêzdehemîn", "çardehem", "çardehemîn", "pazdehem", "pazdehemîn",
    "şanzdehem", "şanzdehemîn", "hevdehem", "hevdehemîn", "hejdehem",
    "hejdehemîn", "nozdehem", "nozdehemîn", "bîstem", "bîstemîn", "sîyem",
    "sîyemîn", "çilem", "çilemîn", "pêncîyem", "pênciyemîn", "şêstem",
    "şêstemîn", "heftêyem", "heftêyemîn", "heştêyem", "heştêyemîn", "notem",
    "notemîn", "sedem", "sedemîn", "hezarem", "hezaremîn", "milyonem",
    "milyonemîn", "milyarem", "milyaremîn",
]


def like_num(text):
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    text_lower = text.lower()
    if text_lower in _num_words:
        return True

    # Check ordinal number
    if text_lower in _ordinal_words:
        return True

    if is_digit(text_lower):
        return True

    return False


def is_digit(text):
    endings = ("em", "yem", "emîn", "yemîn")
    for ending in endings:
        to = len(ending)
        if text.endswith(ending) and text[:-to].isdigit():
            return True

    return False


LEX_ATTRS = {LIKE_NUM: like_num}
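A quick illustration of the Kurmanji like_num logic above; a hypothetical check, assuming the module is importable from a build containing this commit:

from spacy.lang.kmr.lex_attrs import like_num

print(like_num("heştemîn"))  # True: listed ordinal word
print(like_num("21em"))      # True: digits plus an ordinal ending, handled by is_digit()
print(like_num("pirtûk"))    # False: not number-like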
44  spacy/lang/kmr/stop_words.py  (new file)
@@ -0,0 +1,44 @@
STOP_WORDS = set(
    """
û li bi di da de ji ku ew ez
tu em hûn ew ev min te wî wê me
we wan vê vî va çi kî kê çawa çima
kengî li ku çend çiqas her hin gelek hemû kes tişt
""".split()
)
@@ -24,13 +24,6 @@ class MacedonianDefaults(BaseDefaults):
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS

-    @classmethod
-    def create_lemmatizer(cls, nlp=None, lookups=None):
-        if lookups is None:
-            lookups = Lookups()
-        return MacedonianLemmatizer(lookups)
-

 class Macedonian(Language):
     lang = "mk"
     Defaults = MacedonianDefaults
@@ -5,7 +5,7 @@ import multiprocessing as mp
 import random
 import traceback
 import warnings
-from contextlib import contextmanager
+from contextlib import ExitStack, contextmanager
 from copy import deepcopy
 from dataclasses import dataclass
 from itertools import chain, cycle
@@ -31,6 +31,7 @@ from typing import (
 )

 import srsly
+from cymem.cymem import Pool
 from thinc.api import Config, CupyOps, Optimizer, get_current_ops

 from . import about, ty, util
@@ -2091,6 +2092,38 @@ class Language:
                 util.replace_model_node(pipe.model, listener, new_model)  # type: ignore[attr-defined]
             tok2vec.remove_listener(listener, pipe_name)

+    @contextmanager
+    def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]:
+        """Begin a block where all resources allocated during the block will
+        be freed at the end of it. If a resources was created within the
+        memory zone block, accessing it outside the block is invalid.
+        Behaviour of this invalid access is undefined. Memory zones should
+        not be nested.
+
+        The memory zone is helpful for services that need to process large
+        volumes of text with a defined memory budget.
+
+        Example
+        -------
+        >>> with nlp.memory_zone():
+        ...     for doc in nlp.pipe(texts):
+        ...         process_my_doc(doc)
+        >>> # use_doc(doc) <-- Invalid: doc was allocated in the memory zone
+        """
+        if mem is None:
+            mem = Pool()
+        # The ExitStack allows programmatic nested context managers.
+        # We don't know how many we need, so it would be awkward to have
+        # them as nested blocks.
+        with ExitStack() as stack:
+            contexts = [stack.enter_context(self.vocab.memory_zone(mem))]
+            if hasattr(self.tokenizer, "memory_zone"):
+                contexts.append(stack.enter_context(self.tokenizer.memory_zone(mem)))
+            for _, pipe in self.pipeline:
+                if hasattr(pipe, "memory_zone"):
+                    contexts.append(stack.enter_context(pipe.memory_zone(mem)))
+            yield mem
+
     def to_disk(
         self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
     ) -> None:
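A minimal usage sketch for the new memory_zone context manager, assuming a spaCy build that includes this commit (docs must not be used after the block exits):

import spacy
from cymem.cymem import Pool

nlp = spacy.blank("en")
texts = ["One document.", "Another document."]

# Allocations made while processing inside the zone are freed when it exits.
with nlp.memory_zone(Pool()) as mem:
    n_tokens = sum(len(doc) for doc in nlp.pipe(texts))
print(n_tokens)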
245  spacy/lexeme.pyx
@@ -164,41 +164,44 @@ cdef class Lexeme:
         vector = self.vector
         return numpy.sqrt((vector**2).sum())

-    property vector:
+    @property
+    def vector(self):
         """A real-valued meaning representation.

         RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
             representing the lexeme's semantics.
         """
-        def __get__(self):
         cdef int length = self.vocab.vectors_length
         if length == 0:
             raise ValueError(Errors.E010)
         return self.vocab.get_vector(self.c.orth)

-        def __set__(self, vector):
+    @vector.setter
+    def vector(self, vector):
         if len(vector) != self.vocab.vectors_length:
             raise ValueError(Errors.E073.format(new_length=len(vector),
                                                 length=self.vocab.vectors_length))
         self.vocab.set_vector(self.c.orth, vector)

-    property rank:
+    @property
+    def rank(self):
         """RETURNS (str): Sequential ID of the lexeme's lexical type, used
             to index into tables, e.g. for word vectors."""
-        def __get__(self):
         return self.c.id

-        def __set__(self, value):
+    @rank.setter
+    def rank(self, value):
         self.c.id = value

-    property sentiment:
+    @property
+    def sentiment(self):
         """RETURNS (float): A scalar value indicating the positivity or
             negativity of the lexeme."""
-        def __get__(self):
         sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {})
         return sentiment_table.get(self.c.orth, 0.0)

-        def __set__(self, float x):
+    @sentiment.setter
+    def sentiment(self, float x):
         if "lexeme_sentiment" not in self.vocab.lookups:
             self.vocab.lookups.add_table("lexeme_sentiment")
         sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment")
@@ -216,151 +219,166 @@ cdef class Lexeme:
         """RETURNS (str): The original verbatim text of the lexeme."""
         return self.orth_

-    property lower:
+    @property
+    def lower(self):
         """RETURNS (uint64): Lowercase form of the lexeme."""
-        def __get__(self):
         return self.c.lower

-        def __set__(self, attr_t x):
+    @lower.setter
+    def lower(self, attr_t x):
         self.c.lower = x

-    property norm:
+    @property
+    def norm(self):
         """RETURNS (uint64): The lexeme's norm, i.e. a normalised form of the
             lexeme text.
         """
-        def __get__(self):
         return self.c.norm

-        def __set__(self, attr_t x):
+    @norm.setter
+    def norm(self, attr_t x):
         if "lexeme_norm" not in self.vocab.lookups:
             self.vocab.lookups.add_table("lexeme_norm")
         norm_table = self.vocab.lookups.get_table("lexeme_norm")
         norm_table[self.c.orth] = self.vocab.strings[x]
         self.c.norm = x

-    property shape:
+    @property
+    def shape(self):
         """RETURNS (uint64): Transform of the word's string, to show
             orthographic features.
         """
-        def __get__(self):
         return self.c.shape

-        def __set__(self, attr_t x):
+    @shape.setter
+    def shape(self, attr_t x):
         self.c.shape = x

-    property prefix:
+    @property
+    def prefix(self):
         """RETURNS (uint64): Length-N substring from the start of the word.
             Defaults to `N=1`.
         """
-        def __get__(self):
         return self.c.prefix

-        def __set__(self, attr_t x):
+    @prefix.setter
+    def prefix(self, attr_t x):
         self.c.prefix = x

-    property suffix:
+    @property
+    def suffix(self):
         """RETURNS (uint64): Length-N substring from the end of the word.
             Defaults to `N=3`.
         """
-        def __get__(self):
         return self.c.suffix

-        def __set__(self, attr_t x):
+    @suffix.setter
+    def suffix(self, attr_t x):
         self.c.suffix = x

-    property cluster:
+    @property
+    def cluster(self):
         """RETURNS (int): Brown cluster ID."""
-        def __get__(self):
         cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
         return cluster_table.get(self.c.orth, 0)

-        def __set__(self, int x):
+    @cluster.setter
+    def cluster(self, int x):
         cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
         cluster_table[self.c.orth] = x

-    property lang:
+    @property
+    def lang(self):
         """RETURNS (uint64): Language of the parent vocabulary."""
-        def __get__(self):
         return self.c.lang

-        def __set__(self, attr_t x):
+    @lang.setter
+    def lang(self, attr_t x):
         self.c.lang = x

-    property prob:
+    @property
+    def prob(self):
         """RETURNS (float): Smoothed log probability estimate of the lexeme's
             type."""
-        def __get__(self):
         prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
         settings_table = self.vocab.lookups.get_table("lexeme_settings", {})
         default_oov_prob = settings_table.get("oov_prob", -20.0)
         return prob_table.get(self.c.orth, default_oov_prob)

-        def __set__(self, float x):
+    @prob.setter
+    def prob(self, float x):
         prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
         prob_table[self.c.orth] = x

-    property lower_:
+    @property
+    def lower_(self):
         """RETURNS (str): Lowercase form of the word."""
-        def __get__(self):
         return self.vocab.strings[self.c.lower]

-        def __set__(self, str x):
+    @lower_.setter
+    def lower_(self, str x):
         self.c.lower = self.vocab.strings.add(x)

-    property norm_:
+    @property
+    def norm_(self):
         """RETURNS (str): The lexeme's norm, i.e. a normalised form of the
             lexeme text.
         """
-        def __get__(self):
         return self.vocab.strings[self.c.norm]

-        def __set__(self, str x):
+    @norm_.setter
+    def norm_(self, str x):
         self.norm = self.vocab.strings.add(x)

-    property shape_:
+    @property
+    def shape_(self):
         """RETURNS (str): Transform of the word's string, to show
             orthographic features.
         """
-        def __get__(self):
         return self.vocab.strings[self.c.shape]

-        def __set__(self, str x):
+    @shape_.setter
+    def shape_(self, str x):
         self.c.shape = self.vocab.strings.add(x)

-    property prefix_:
+    @property
+    def prefix_(self):
         """RETURNS (str): Length-N substring from the start of the word.
             Defaults to `N=1`.
         """
-        def __get__(self):
         return self.vocab.strings[self.c.prefix]

-        def __set__(self, str x):
+    @prefix_.setter
+    def prefix_(self, str x):
         self.c.prefix = self.vocab.strings.add(x)

-    property suffix_:
+    @property
+    def suffix_(self):
         """RETURNS (str): Length-N substring from the end of the word.
             Defaults to `N=3`.
         """
-        def __get__(self):
         return self.vocab.strings[self.c.suffix]

-        def __set__(self, str x):
+    @suffix_.setter
+    def suffix_(self, str x):
         self.c.suffix = self.vocab.strings.add(x)

-    property lang_:
+    @property
+    def lang_(self):
         """RETURNS (str): Language of the parent vocabulary."""
-        def __get__(self):
         return self.vocab.strings[self.c.lang]

-        def __set__(self, str x):
+    @lang_.setter
+    def lang_(self, str x):
         self.c.lang = self.vocab.strings.add(x)

-    property flags:
+    @property
+    def flags(self):
         """RETURNS (uint64): Container of the lexeme's binary flags."""
-        def __get__(self):
         return self.c.flags

-        def __set__(self, flags_t x):
+    @flags.setter
+    def flags(self, flags_t x):
         self.c.flags = x

     @property
@@ -368,154 +386,171 @@ cdef class Lexeme:
         """RETURNS (bool): Whether the lexeme is out-of-vocabulary."""
         return self.orth not in self.vocab.vectors

-    property is_stop:
+    @property
+    def is_stop(self):
         """RETURNS (bool): Whether the lexeme is a stop word."""
-        def __get__(self):
         return Lexeme.c_check_flag(self.c, IS_STOP)

-        def __set__(self, bint x):
+    @is_stop.setter
+    def is_stop(self, bint x):
         Lexeme.c_set_flag(self.c, IS_STOP, x)

-    property is_alpha:
+    @property
+    def is_alpha(self):
         """RETURNS (bool): Whether the lexeme consists of alphabetic
             characters. Equivalent to `lexeme.text.isalpha()`.
         """
-        def __get__(self):
         return Lexeme.c_check_flag(self.c, IS_ALPHA)

-        def __set__(self, bint x):
+    @is_alpha.setter
+    def is_alpha(self, bint x):
         Lexeme.c_set_flag(self.c, IS_ALPHA, x)

-    property is_ascii:
+    @property
+    def is_ascii(self):
         """RETURNS (bool): Whether the lexeme consists of ASCII characters.
             Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`.
         """
-        def __get__(self):
         return Lexeme.c_check_flag(self.c, IS_ASCII)

-        def __set__(self, bint x):
+    @is_ascii.setter
+    def is_ascii(self, bint x):
         Lexeme.c_set_flag(self.c, IS_ASCII, x)

-    property is_digit:
+    @property
+    def is_digit(self):
         """RETURNS (bool): Whether the lexeme consists of digits. Equivalent
             to `lexeme.text.isdigit()`.
         """
-        def __get__(self):
         return Lexeme.c_check_flag(self.c, IS_DIGIT)

-        def __set__(self, bint x):
+    @is_digit.setter
+    def is_digit(self, bint x):
         Lexeme.c_set_flag(self.c, IS_DIGIT, x)

-    property is_lower:
+    @property
+    def is_lower(self):
         """RETURNS (bool): Whether the lexeme is in lowercase. Equivalent to
             `lexeme.text.islower()`.
         """
-        def __get__(self):
         return Lexeme.c_check_flag(self.c, IS_LOWER)

-        def __set__(self, bint x):
+    @is_lower.setter
+    def is_lower(self, bint x):
         Lexeme.c_set_flag(self.c, IS_LOWER, x)

-    property is_upper:
+    @property
+    def is_upper(self):
         """RETURNS (bool): Whether the lexeme is in uppercase. Equivalent to
             `lexeme.text.isupper()`.
         """
-        def __get__(self):
         return Lexeme.c_check_flag(self.c, IS_UPPER)

-        def __set__(self, bint x):
+    @is_upper.setter
+    def is_upper(self, bint x):
         Lexeme.c_set_flag(self.c, IS_UPPER, x)

-    property is_title:
+    @property
+    def is_title(self):
         """RETURNS (bool): Whether the lexeme is in titlecase. Equivalent to
             `lexeme.text.istitle()`.
         """
-        def __get__(self):
         return Lexeme.c_check_flag(self.c, IS_TITLE)

-        def __set__(self, bint x):
+    @is_title.setter
+    def is_title(self, bint x):
         Lexeme.c_set_flag(self.c, IS_TITLE, x)

-    property is_punct:
+    @property
+    def is_punct(self):
         """RETURNS (bool): Whether the lexeme is punctuation."""
-        def __get__(self):
         return Lexeme.c_check_flag(self.c, IS_PUNCT)

-        def __set__(self, bint x):
+    @is_punct.setter
+    def is_punct(self, bint x):
         Lexeme.c_set_flag(self.c, IS_PUNCT, x)

-    property is_space:
+    @property
+    def is_space(self):
         """RETURNS (bool): Whether the lexeme consist of whitespace characters.
             Equivalent to `lexeme.text.isspace()`.
         """
-        def __get__(self):
         return Lexeme.c_check_flag(self.c, IS_SPACE)

-        def __set__(self, bint x):
+    @is_space.setter
+    def is_space(self, bint x):
         Lexeme.c_set_flag(self.c, IS_SPACE, x)

-    property is_bracket:
+    @property
+    def is_bracket(self):
         """RETURNS (bool): Whether the lexeme is a bracket."""
-        def __get__(self):
         return Lexeme.c_check_flag(self.c, IS_BRACKET)

-        def __set__(self, bint x):
+    @is_bracket.setter
+    def is_bracket(self, bint x):
         Lexeme.c_set_flag(self.c, IS_BRACKET, x)

-    property is_quote:
+    @property
+    def is_quote(self):
         """RETURNS (bool): Whether the lexeme is a quotation mark."""
-        def __get__(self):
         return Lexeme.c_check_flag(self.c, IS_QUOTE)

-        def __set__(self, bint x):
+    @is_quote.setter
+    def is_quote(self, bint x):
         Lexeme.c_set_flag(self.c, IS_QUOTE, x)

-    property is_left_punct:
+    @property
+    def is_left_punct(self):
         """RETURNS (bool): Whether the lexeme is left punctuation, e.g. (."""
-        def __get__(self):
         return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)

-        def __set__(self, bint x):
+    @is_left_punct.setter
+    def is_left_punct(self, bint x):
         Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)

-    property is_right_punct:
+    @property
+    def is_right_punct(self):
         """RETURNS (bool): Whether the lexeme is right punctuation, e.g. )."""
-        def __get__(self):
         return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
|
||||||
|
|
||||||
def __set__(self, bint x):
|
@is_right_punct.setter
|
||||||
|
def is_right_punct(self, bint x):
|
||||||
Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
|
Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
|
||||||
|
|
||||||
property is_currency:
|
@property
|
||||||
|
def is_currency(self):
|
||||||
"""RETURNS (bool): Whether the lexeme is a currency symbol, e.g. $, €."""
|
"""RETURNS (bool): Whether the lexeme is a currency symbol, e.g. $, €."""
|
||||||
def __get__(self):
|
|
||||||
return Lexeme.c_check_flag(self.c, IS_CURRENCY)
|
return Lexeme.c_check_flag(self.c, IS_CURRENCY)
|
||||||
|
|
||||||
def __set__(self, bint x):
|
@is_currency.setter
|
||||||
|
def is_currency(self, bint x):
|
||||||
Lexeme.c_set_flag(self.c, IS_CURRENCY, x)
|
Lexeme.c_set_flag(self.c, IS_CURRENCY, x)
|
||||||
|
|
||||||
property like_url:
|
@property
|
||||||
|
def like_url(self):
|
||||||
"""RETURNS (bool): Whether the lexeme resembles a URL."""
|
"""RETURNS (bool): Whether the lexeme resembles a URL."""
|
||||||
def __get__(self):
|
|
||||||
return Lexeme.c_check_flag(self.c, LIKE_URL)
|
return Lexeme.c_check_flag(self.c, LIKE_URL)
|
||||||
|
|
||||||
def __set__(self, bint x):
|
@like_url.setter
|
||||||
|
def like_url(self, bint x):
|
||||||
Lexeme.c_set_flag(self.c, LIKE_URL, x)
|
Lexeme.c_set_flag(self.c, LIKE_URL, x)
|
||||||
|
|
||||||
property like_num:
|
@property
|
||||||
|
def like_num(self):
|
||||||
"""RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9",
|
"""RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9",
|
||||||
"10", "ten", etc.
|
"10", "ten", etc.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
|
||||||
return Lexeme.c_check_flag(self.c, LIKE_NUM)
|
return Lexeme.c_check_flag(self.c, LIKE_NUM)
|
||||||
|
|
||||||
def __set__(self, bint x):
|
@like_num.setter
|
||||||
|
def like_num(self, bint x):
|
||||||
Lexeme.c_set_flag(self.c, LIKE_NUM, x)
|
Lexeme.c_set_flag(self.c, LIKE_NUM, x)
|
||||||
|
|
||||||
property like_email:
|
@property
|
||||||
|
def like_email(self):
|
||||||
"""RETURNS (bool): Whether the lexeme resembles an email address."""
|
"""RETURNS (bool): Whether the lexeme resembles an email address."""
|
||||||
def __get__(self):
|
|
||||||
return Lexeme.c_check_flag(self.c, LIKE_EMAIL)
|
return Lexeme.c_check_flag(self.c, LIKE_EMAIL)
|
||||||
|
|
||||||
def __set__(self, bint x):
|
@like_email.setter
|
||||||
|
def like_email(self, bint x):
|
||||||
Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)
|
Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)
|
||||||
|
|
|
@@ -203,7 +203,7 @@ cdef class ArcEagerGold:
     def __init__(self, ArcEager moves, StateClass stcls, Example example):
         self.mem = Pool()
         heads, labels = example.get_aligned_parse(projectivize=True)
-        labels = [example.x.vocab.strings.add(label) if label is not None else MISSING_DEP for label in labels]
+        labels = [example.x.vocab.strings.add(label, allow_transient=False) if label is not None else MISSING_DEP for label in labels]
         sent_starts = _get_aligned_sent_starts(example)
         assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts))
         self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts)

@@ -183,7 +183,7 @@ cpdef deprojectivize(Doc doc):
             new_label, head_label = label.split(DELIMITER)
             new_head = _find_new_head(doc[i], head_label)
             doc.c[i].head = new_head.i - i
-            doc.c[i].dep = doc.vocab.strings.add(new_label)
+            doc.c[i].dep = doc.vocab.strings.add(new_label, allow_transient=False)
     set_children_from_heads(doc.c, 0, doc.length)
     return doc
@@ -11,7 +11,6 @@ from .. import util
 from ..errors import Errors
 from ..kb import Candidate, KnowledgeBase
 from ..language import Language
-from ..ml import empty_kb
 from ..scorer import Scorer
 from ..tokens import Doc, Span
 from ..training import Example, validate_examples, validate_get_examples

@@ -105,7 +104,7 @@ def make_entity_linker(
         ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
     generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
     scorer (Optional[Callable]): The scoring method.
-    use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
+    use_gold_ents (bool): Whether to copy entities from gold docs during training or not. If false, another
         component must provide entity annotations.
     candidates_batch_size (int): Size of batches for entity candidate generation.
     threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold,

@@ -235,7 +234,6 @@ class EntityLinker(TrainablePipe):
         self.cfg: Dict[str, Any] = {"overwrite": overwrite}
         self.distance = CosineDistance(normalize=False)
         self.kb = generate_empty_kb(self.vocab, entity_vector_length)
-        self.scorer = scorer
         self.use_gold_ents = use_gold_ents
         self.candidates_batch_size = candidates_batch_size
         self.threshold = threshold

@@ -243,6 +241,37 @@ class EntityLinker(TrainablePipe):
         if candidates_batch_size < 1:
             raise ValueError(Errors.E1044)

+        def _score_with_ents_set(examples: Iterable[Example], **kwargs):
+            # Because of how spaCy works, we can't just score immediately, because Language.evaluate
+            # calls pipe() on the predicted docs, which won't have entities if there is no NER in the pipeline.
+            if not scorer:
+                return scorer
+            if not self.use_gold_ents:
+                return scorer(examples, **kwargs)
+            else:
+                examples = self._ensure_ents(examples)
+                docs = self.pipe(
+                    (eg.predicted for eg in examples),
+                )
+                for eg, doc in zip(examples, docs):
+                    eg.predicted = doc
+                return scorer(examples, **kwargs)
+
+        self.scorer = _score_with_ents_set
+
+    def _ensure_ents(self, examples: Iterable[Example]) -> Iterable[Example]:
+        """If use_gold_ents is true, set the gold entities to (a copy of) eg.predicted."""
+        if not self.use_gold_ents:
+            return examples
+
+        new_examples = []
+        for eg in examples:
+            ents, _ = eg.get_aligned_ents_and_ner()
+            new_eg = eg.copy()
+            new_eg.predicted.ents = ents
+            new_examples.append(new_eg)
+        return new_examples
+
     def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
         """Define the KB of this pipe by providing a function that will
        create it using this object's vocab."""

@@ -284,11 +313,9 @@ class EntityLinker(TrainablePipe):
         nO = self.kb.entity_vector_length
         doc_sample = []
         vector_sample = []
-        for eg in islice(get_examples(), 10):
+        examples = self._ensure_ents(islice(get_examples(), 10))
+        for eg in examples:
             doc = eg.x
-            if self.use_gold_ents:
-                ents, _ = eg.get_aligned_ents_and_ner()
-                doc.ents = ents
             doc_sample.append(doc)
             vector_sample.append(self.model.ops.alloc1f(nO))
         assert len(doc_sample) > 0, Errors.E923.format(name=self.name)

@@ -354,31 +381,17 @@ class EntityLinker(TrainablePipe):
         losses.setdefault(self.name, 0.0)
         if not examples:
             return losses
+        examples = self._ensure_ents(examples)
         validate_examples(examples, "EntityLinker.update")

-        set_dropout_rate(self.model, drop)
-        docs = [eg.predicted for eg in examples]
-        # save to restore later
-        old_ents = [doc.ents for doc in docs]
-
-        for doc, ex in zip(docs, examples):
-            if self.use_gold_ents:
-                ents, _ = ex.get_aligned_ents_and_ner()
-                doc.ents = ents
-            else:
-                # only keep matching ents
-                doc.ents = ex.get_matching_ents()
-
         # make sure we have something to learn from, if not, short-circuit
         if not self.batch_has_learnable_example(examples):
             return losses

+        set_dropout_rate(self.model, drop)
+        docs = [eg.predicted for eg in examples]
         sentence_encodings, bp_context = self.model.begin_update(docs)

-        # now restore the ents
-        for doc, old in zip(docs, old_ents):
-            doc.ents = old
-
         loss, d_scores = self.get_loss(
             sentence_encodings=sentence_encodings, examples=examples
         )

@@ -386,11 +399,13 @@ class EntityLinker(TrainablePipe):
         if sgd is not None:
             self.finish_update(sgd)
         losses[self.name] += loss

         return losses

     def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
         validate_examples(examples, "EntityLinker.get_loss")
         entity_encodings = []
+        # We assume that get_loss is called with gold ents set in the examples if need be
         eidx = 0  # indices in gold entities to keep
         keep_ents = []  # indices in sentence_encodings to keep
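For orientation, a minimal sketch (illustrative only, not part of the diff) of how the `use_gold_ents` option is wired up in a pipeline; the config key and the need to attach a knowledge base mirror the tests changed later in this commit:

import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
# train and evaluate the linker directly from gold entity annotations,
# so no upstream NER component is required
entity_linker = nlp.add_pipe("entity_linker", config={"use_gold_ents": True})
# a knowledge base still has to be attached before nlp.initialize(), e.g.:
# entity_linker.set_kb(create_kb)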
@@ -25,5 +25,7 @@ cdef class StringStore:
     cdef vector[hash_t] keys
     cdef public PreshMap _map

-    cdef const Utf8Str* intern_unicode(self, str py_string)
-    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)
+    cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient)
+    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient)
+    cdef vector[hash_t] _transient_keys
+    cdef Pool _non_temp_mem
@@ -1,9 +1,14 @@
 # cython: infer_types=True
 # cython: profile=False
 cimport cython

+from contextlib import contextmanager
+from typing import Iterator, List, Optional
+
 from libc.stdint cimport uint32_t
 from libc.string cimport memcpy
 from murmurhash.mrmr cimport hash32, hash64

+from preshed.maps cimport map_clear
+
 import srsly

@@ -119,10 +124,11 @@ cdef class StringStore:
         strings (iterable): A sequence of unicode strings to add to the store.
         """
         self.mem = Pool()
+        self._non_temp_mem = self.mem
         self._map = PreshMap()
         if strings is not None:
             for string in strings:
-                self.add(string)
+                self.add(string, allow_transient=False)

     def __getitem__(self, object string_or_id):
         """Retrieve a string from a given hash, or vice versa.

@@ -152,10 +158,13 @@ cdef class StringStore:
                 return SYMBOLS_BY_INT[str_hash]
             else:
                 utf8str = <Utf8Str*>self._map.get(str_hash)
+                if utf8str is NULL:
+                    raise KeyError(Errors.E018.format(hash_value=string_or_id))
+                else:
+                    return decode_Utf8Str(utf8str)
         else:
             # TODO: Raise an error instead
             utf8str = <Utf8Str*>self._map.get(string_or_id)
-
         if utf8str is NULL:
             raise KeyError(Errors.E018.format(hash_value=string_or_id))
         else:

@@ -175,12 +184,46 @@ cdef class StringStore:
         else:
             return self[key]

-    def add(self, string):
+    def __len__(self) -> int:
+        """The number of strings in the store.
+
+        RETURNS (int): The number of strings in the store.
+        """
+        return self.keys.size() + self._transient_keys.size()
+
+    @contextmanager
+    def memory_zone(self, mem: Optional[Pool] = None) -> Pool:
+        """Begin a block where all resources allocated during the block will
+        be freed at the end of it. If a resources was created within the
+        memory zone block, accessing it outside the block is invalid.
+        Behaviour of this invalid access is undefined. Memory zones should
+        not be nested.
+
+        The memory zone is helpful for services that need to process large
+        volumes of text with a defined memory budget.
+        """
+        if mem is None:
+            mem = Pool()
+        self.mem = mem
+        yield mem
+        for key in self._transient_keys:
+            map_clear(self._map.c_map, key)
+        self._transient_keys.clear()
+        self.mem = self._non_temp_mem
+
+    def add(self, string: str, allow_transient: Optional[bool] = None) -> int:
         """Add a string to the StringStore.

         string (str): The string to add.
+        allow_transient (bool): Allow the string to be stored in the 'transient'
+            map, which will be flushed at the end of the memory zone. Strings
+            encountered during arbitrary text processing should be added
+            with allow_transient=True, while labels and other strings used
+            internally should not.
         RETURNS (uint64): The string's hash value.
         """
+        if allow_transient is None:
+            allow_transient = self.mem is not self._non_temp_mem
         cdef hash_t str_hash
         if isinstance(string, str):
             if string in SYMBOLS_BY_STR:
@@ -188,22 +231,26 @@ cdef class StringStore:

             string = string.encode("utf8")
             str_hash = hash_utf8(string, len(string))
-            self._intern_utf8(string, len(string), &str_hash)
+            self._intern_utf8(string, len(string), &str_hash, allow_transient)
         elif isinstance(string, bytes):
             if string in SYMBOLS_BY_STR:
                 return SYMBOLS_BY_STR[string]
             str_hash = hash_utf8(string, len(string))
-            self._intern_utf8(string, len(string), &str_hash)
+            self._intern_utf8(string, len(string), &str_hash, allow_transient)
         else:
             raise TypeError(Errors.E017.format(value_type=type(string)))
         return str_hash

     def __len__(self):
         """The number of strings in the store.
+            if string in SYMBOLS_BY_STR:
+                return SYMBOLS_BY_STR[string]
+            else:
+                return self._intern_str(string, allow_transient)

         RETURNS (int): The number of strings in the store.
         """
-        return self.keys.size()
+        return self.keys.size() + self._transient_keys.size()

@@ -222,12 +269,17 @@ cdef class StringStore:
                 pass
             else:
                 # TODO: Raise an error instead
-                return self._map.get(string_or_id) is not NULL
+                if self._map.get(string_or_id) is not NULL:
+                    return True
+                else:
+                    return False
         if str_hash < len(SYMBOLS_BY_INT):
             return True
         else:
-            return self._map.get(str_hash) is not NULL
+            if self._map.get(str_hash) is not NULL:
+                return True
+            else:
+                return False

     def __iter__(self):
         """Iterate over the strings in the store, in order.

@@ -240,12 +292,29 @@ cdef class StringStore:
             key = self.keys[i]
             utf8str = <Utf8Str*>self._map.get(key)
             yield decode_Utf8Str(utf8str)
-        # TODO: Iterate OOV here?
+        for i in range(self._transient_keys.size()):
+            key = self._transient_keys[i]
+            utf8str = <Utf8Str*>self._map.get(key)
+            yield decode_Utf8Str(utf8str)

     def __reduce__(self):
         strings = list(self)
         return (StringStore, (strings,), None, None, None)

+    def values(self) -> List[int]:
+        """Iterate over the stored strings hashes in insertion order.
+
+        RETURNS: A list of string hashs.
+        """
+        cdef int i
+        hashes = [None] * self._keys.size()
+        for i in range(self._keys.size()):
+            hashes[i] = self._keys[i]
+        transient_hashes = [None] * self._transient_keys.size()
+        for i in range(self._transient_keys.size()):
+            transient_hashes[i] = self._transient_keys[i]
+        return hashes + transient_hashes
+
     def to_disk(self, path):
         """Save the current state to a directory.

@@ -269,7 +338,7 @@ cdef class StringStore:
         prev = list(self)
         self._reset_and_load(strings)
         for word in prev:
-            self.add(word)
+            self.add(word, allow_transient=False)
         return self

     def to_bytes(self, **kwargs):

@@ -289,23 +358,25 @@ cdef class StringStore:
         prev = list(self)
         self._reset_and_load(strings)
         for word in prev:
-            self.add(word)
+            self.add(word, allow_transient=False)
         return self

     def _reset_and_load(self, strings):
         self.mem = Pool()
+        self._non_temp_mem = self.mem
         self._map = PreshMap()
         self.keys.clear()
+        self._transient_keys.clear()
         for string in strings:
-            self.add(string)
+            self.add(string, allow_transient=False)

-    cdef const Utf8Str* intern_unicode(self, str py_string):
+    cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient):
         # 0 means missing, but we don't bother offsetting the index.
         cdef bytes byte_string = py_string.encode("utf8")
-        return self._intern_utf8(byte_string, len(byte_string), NULL)
+        return self._intern_utf8(byte_string, len(byte_string), NULL, allow_transient)

     @cython.final
-    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash):
+    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient):
         # TODO: This function's API/behaviour is an unholy mess...
         # 0 means missing, but we don't bother offsetting the index.
         cdef hash_t key = precalculated_hash[0] if precalculated_hash is not NULL else hash_utf8(utf8_string, length)

@@ -314,5 +385,8 @@ cdef class StringStore:
             return value
         value = _allocate(self.mem, <unsigned char*>utf8_string, length)
         self._map.set(key, value)
+        if allow_transient and self.mem is not self._non_temp_mem:
+            self._transient_keys.push_back(key)
+        else:
             self.keys.push_back(key)
         return value
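A brief usage sketch of the transient-string behaviour introduced above (illustrative only; it assumes the `memory_zone` and `allow_transient` semantics exactly as added in this diff — strings interned inside the zone default to transient and are flushed when the block exits):

from spacy.strings import StringStore

store = StringStore(["persistent-label"])      # interned permanently (allow_transient=False)
with store.memory_zone():
    store.add("throwaway-token")               # transient by default inside the zone
    assert "throwaway-token" in store
assert "throwaway-token" not in store          # flushed together with the zone
assert "persistent-label" in store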
@@ -81,6 +81,11 @@ def bn_tokenizer():
     return get_lang_class("bn")().tokenizer


+@pytest.fixture(scope="session")
+def bo_tokenizer():
+    return get_lang_class("bo")().tokenizer
+
+
 @pytest.fixture(scope="session")
 def ca_tokenizer():
     return get_lang_class("ca")().tokenizer
0
spacy/tests/lang/bo/__init__.py
Normal file
21
spacy/tests/lang/bo/test_text.py
Normal file

@@ -0,0 +1,21 @@
+import pytest
+
+
+@pytest.mark.parametrize(
+    "text,match",
+    [
+        ("10", True),
+        ("1", True),
+        ("999.0", True),
+        ("གཅིག་", True),
+        ("གཉིས་", True),
+        ("ཀླད་ཀོར་", True),
+        ("བཅུ་གཅིག་", True),
+        ("ཁྱི་", False),
+        (",", False),
+    ],
+)
+def test_lex_attrs_like_number(bo_tokenizer, text, match):
+    tokens = bo_tokenizer(text)
+    assert len(tokens) == 1
+    assert tokens[0].like_num == match
0
spacy/tests/lang/kmr/__init__.py
Normal file
27
spacy/tests/lang/kmr/test_text.py
Normal file

@@ -0,0 +1,27 @@
+import pytest
+
+from spacy.lang.kmr.lex_attrs import like_num
+
+
+@pytest.mark.parametrize(
+    "word",
+    [
+        "yekem",
+        "duyemîn",
+        "100em",
+        "dehem",
+        "sedemîn",
+        "34em",
+        "30yem",
+        "20emîn",
+        "50yemîn",
+    ],
+)
+def test_kmr_lex_attrs_like_number_for_ordinal(word):
+    assert like_num(word)
+
+
+@pytest.mark.parametrize("word", ["deh"])
+def test_kmr_lex_attrs_capitals(word):
+    assert like_num(word)
+    assert like_num(word.upper())
@@ -10,7 +10,7 @@ LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el",
              "hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv",
              "mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
              "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn",
-             "tr", "tt", "uk", "ur", "xx", "yo"]
+             "tr", "tt", "uk", "ur", "xx", "yo", "kmr"]
 # fmt: on
@@ -717,7 +717,7 @@ GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
 # fmt: on


-def test_overfitting_IO():
+def test_overfitting_IO_gold_entities():
     # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
     nlp = English()
     vector_length = 3

@@ -744,7 +744,9 @@ def test_overfitting_IO():
         return mykb

     # Create the Entity Linker component and add it to the pipeline
-    entity_linker = nlp.add_pipe("entity_linker", last=True)
+    entity_linker = nlp.add_pipe(
+        "entity_linker", last=True, config={"use_gold_ents": True}
+    )
     assert isinstance(entity_linker, EntityLinker)
     entity_linker.set_kb(create_kb)
     assert "Q2146908" in entity_linker.vocab.strings

@@ -807,6 +809,107 @@ def test_overfitting_IO():
         assert_equal(batch_deps_1, batch_deps_2)
         assert_equal(batch_deps_1, no_batch_deps)

+    eval = nlp.evaluate(train_examples)
+    assert "nel_macro_p" in eval
+    assert "nel_macro_r" in eval
+    assert "nel_macro_f" in eval
+    assert "nel_micro_p" in eval
+    assert "nel_micro_r" in eval
+    assert "nel_micro_f" in eval
+    assert "nel_f_per_type" in eval
+    assert "PERSON" in eval["nel_f_per_type"]
+
+    assert eval["nel_macro_f"] > 0
+    assert eval["nel_micro_f"] > 0
+
+
+def test_overfitting_IO_with_ner():
+    # Simple test to try and overfit the NER and NEL component in combination - ensuring the ML models work correctly
+    nlp = English()
+    vector_length = 3
+    assert "Q2146908" not in nlp.vocab.strings
+
+    # Convert the texts to docs to make sure we have doc.ents set for the training examples
+    train_examples = []
+    for text, annotation in TRAIN_DATA:
+        doc = nlp(text)
+        train_examples.append(Example.from_dict(doc, annotation))
+
+    def create_kb(vocab):
+        # create artificial KB - assign same prior weight to the two russ cochran's
+        # Q2146908 (Russ Cochran): American golfer
+        # Q7381115 (Russ Cochran): publisher
+        mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
+        mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
+        mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
+        mykb.add_alias(
+            alias="Russ Cochran",
+            entities=["Q2146908", "Q7381115"],
+            probabilities=[0.5, 0.5],
+        )
+        return mykb
+
+    # Create the NER and EL components and add them to the pipeline
+    ner = nlp.add_pipe("ner", first=True)
+    entity_linker = nlp.add_pipe(
+        "entity_linker", last=True, config={"use_gold_ents": False}
+    )
+    entity_linker.set_kb(create_kb)
+
+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+        for ent in annotations.get("entities"):
+            ner.add_label(ent[2])
+    optimizer = nlp.initialize()
+
+    # train the NER and NEL pipes
+    for i in range(50):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["ner"] < 0.001
+    assert losses["entity_linker"] < 0.001
+
+    # adding additional components that are required for the entity_linker
+    nlp.add_pipe("sentencizer", first=True)
+
+    # test the trained model
+    test_text = "Russ Cochran captured his first major title with his son as caddie."
+    doc = nlp(test_text)
+    ents = doc.ents
+    assert len(ents) == 1
+    assert ents[0].text == "Russ Cochran"
+    assert ents[0].label_ == "PERSON"
+    assert ents[0].kb_id_ != "NIL"
+
+    # TODO: below assert is still flaky - EL doesn't properly overfit quite yet
+    # assert ents[0].kb_id_ == "Q2146908"
+
+    # Also test the results are still the same after IO
+    with make_tempdir() as tmp_dir:
+        nlp.to_disk(tmp_dir)
+        nlp2 = util.load_model_from_path(tmp_dir)
+        assert nlp2.pipe_names == nlp.pipe_names
+        doc2 = nlp2(test_text)
+        ents2 = doc2.ents
+        assert len(ents2) == 1
+        assert ents2[0].text == "Russ Cochran"
+        assert ents2[0].label_ == "PERSON"
+        assert ents2[0].kb_id_ != "NIL"
+
+    eval = nlp.evaluate(train_examples)
+    assert "nel_macro_f" in eval
+    assert "nel_micro_f" in eval
+    assert "ents_f" in eval
+    assert "nel_f_per_type" in eval
+    assert "ents_per_type" in eval
+    assert "PERSON" in eval["nel_f_per_type"]
+    assert "PERSON" in eval["ents_per_type"]
+
+    assert eval["nel_macro_f"] > 0
+    assert eval["nel_micro_f"] > 0
+    assert eval["ents_f"] > 0
+
+
 def test_kb_serialization():
     # Test that the KB can be used in a pipeline with a different vocab
@@ -329,7 +329,7 @@ def test_language_pipe_error_handler(n_process):
         nlp.set_error_handler(raise_error)
         with pytest.raises(ValueError):
             list(nlp.pipe(texts, n_process=n_process))
-        # set explicitely to ignoring
+        # set explicitly to ignoring
         nlp.set_error_handler(ignore_error)
         docs = list(nlp.pipe(texts, n_process=n_process))
         assert len(docs) == 0
@@ -18,6 +18,7 @@ LANGUAGES = [
     pytest.param("ar", marks=pytest.mark.slow()),
     pytest.param("bg", marks=pytest.mark.slow()),
     "bn",
+    pytest.param("bo", marks=pytest.mark.slow()),
     pytest.param("ca", marks=pytest.mark.slow()),
     pytest.param("cs", marks=pytest.mark.slow()),
     pytest.param("da", marks=pytest.mark.slow()),

@@ -57,6 +58,7 @@ LANGUAGES = [
     pytest.param("tr", marks=pytest.mark.slow()),
     pytest.param("tt", marks=pytest.mark.slow()),
     pytest.param("ur", marks=pytest.mark.slow()),
+    pytest.param("kmr", marks=pytest.mark.slow()),
 ]
36
spacy/tests/vocab_vectors/test_memory_zone.py
Normal file

@@ -0,0 +1,36 @@
+from spacy.vocab import Vocab
+
+
+def test_memory_zone_no_insertion():
+    vocab = Vocab()
+    with vocab.memory_zone():
+        pass
+    lex = vocab["horse"]
+    assert lex.text == "horse"
+
+
+def test_memory_zone_insertion():
+    vocab = Vocab()
+    _ = vocab["dog"]
+    assert "dog" in vocab
+    assert "horse" not in vocab
+    with vocab.memory_zone():
+        lex = vocab["horse"]
+        assert lex.text == "horse"
+    assert "dog" in vocab
+    assert "horse" not in vocab
+
+
+def test_memory_zone_redundant_insertion():
+    """Test that if we insert an already-existing word while
+    in the memory zone, it stays persistent"""
+    vocab = Vocab()
+    _ = vocab["dog"]
+    assert "dog" in vocab
+    assert "horse" not in vocab
+    with vocab.memory_zone():
+        lex = vocab["horse"]
+        assert lex.text == "horse"
+        _ = vocab["dog"]
+    assert "dog" in vocab
+    assert "horse" not in vocab
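These tests exercise the memory-zone behaviour at the `Vocab` level. A rough sketch of the intended usage pattern for long-running services (illustrative only, assuming a spaCy build that contains this change):

from spacy.vocab import Vocab

vocab = Vocab()
with vocab.memory_zone():
    # lexemes created here are transient and released when the block exits
    for word in ["one-off", "tokens", "from", "streaming", "text"]:
        _ = vocab[word]
assert "one-off" not in vocab  # flushed together with the zone's memory pool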
@@ -25,9 +25,7 @@ cdef class Tokenizer:
     cdef PhraseMatcher _special_matcher
     # TODO convert to bool in v4
     cdef int _faster_heuristics
-    # TODO next one is unused and should be removed in v4
-    # https://github.com/explosion/spaCy/pull/9150
-    cdef int _unused_int2
+    cdef public int max_cache_size

     cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
     cdef int _apply_special_cases(self, Doc doc) except -1
@@ -30,7 +30,7 @@ cdef class Tokenizer:
     """
     def __init__(self, Vocab vocab, rules=None, prefix_search=None,
                  suffix_search=None, infix_finditer=None, token_match=None,
-                 url_match=None, faster_heuristics=True):
+                 url_match=None, faster_heuristics=True, max_cache_size=10000):
        """Create a `Tokenizer`, to create `Doc` objects given unicode text.

        vocab (Vocab): A storage container for lexical types.

@@ -50,6 +50,7 @@ cdef class Tokenizer:
        faster_heuristics (bool): Whether to restrict the final
            Matcher-based pass for rules to those containing affixes or space.
            Defaults to True.
+       max_cache_size (int): Maximum number of tokenization chunks to cache.

        EXAMPLE:
            >>> tokenizer = Tokenizer(nlp.vocab)
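The new `max_cache_size` argument caps the tokenizer's internal chunk cache, which matters when tokenizing unbounded streams of text. A small construction sketch (illustrative only; the keyword is taken from the signature added above, the chosen value of 1000 is arbitrary):

from spacy.lang.en import English
from spacy.tokenizer import Tokenizer

nlp = English()
# keep at most ~1000 cached tokenization chunks instead of the default 10000
tokenizer = Tokenizer(nlp.vocab, max_cache_size=1000)
doc = tokenizer("This text is tokenized with a smaller cache.")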
@@ -69,52 +70,59 @@ cdef class Tokenizer:
         self._rules = {}
         self._special_matcher = PhraseMatcher(self.vocab)
         self._load_special_cases(rules)
+        self.max_cache_size = max_cache_size

-    property token_match:
-        def __get__(self):
+    @property
+    def token_match(self):
         return self._token_match

-        def __set__(self, token_match):
+    @token_match.setter
+    def token_match(self, token_match):
         self._token_match = token_match
         self._reload_special_cases()

-    property url_match:
-        def __get__(self):
+    @property
+    def url_match(self):
         return self._url_match

-        def __set__(self, url_match):
+    @url_match.setter
+    def url_match(self, url_match):
         self._url_match = url_match
         self._reload_special_cases()

-    property prefix_search:
-        def __get__(self):
+    @property
+    def prefix_search(self):
         return self._prefix_search

-        def __set__(self, prefix_search):
+    @prefix_search.setter
+    def prefix_search(self, prefix_search):
         self._prefix_search = prefix_search
         self._reload_special_cases()

-    property suffix_search:
-        def __get__(self):
+    @property
+    def suffix_search(self):
         return self._suffix_search

-        def __set__(self, suffix_search):
+    @suffix_search.setter
+    def suffix_search(self, suffix_search):
         self._suffix_search = suffix_search
         self._reload_special_cases()

-    property infix_finditer:
-        def __get__(self):
+    @property
+    def infix_finditer(self):
         return self._infix_finditer

-        def __set__(self, infix_finditer):
+    @infix_finditer.setter
+    def infix_finditer(self, infix_finditer):
         self._infix_finditer = infix_finditer
         self._reload_special_cases()

-    property rules:
-        def __get__(self):
+    @property
+    def rules(self):
         return self._rules

-        def __set__(self, rules):
+    @rules.setter
+    def rules(self, rules):
         self._rules = {}
         self._flush_cache()
         self._flush_specials()

@@ -122,11 +130,12 @@ cdef class Tokenizer:
         self._specials = PreshMap()
         self._load_special_cases(rules)

-    property faster_heuristics:
-        def __get__(self):
+    @property
+    def faster_heuristics(self):
         return bool(self._faster_heuristics)

-        def __set__(self, faster_heuristics):
+    @faster_heuristics.setter
+    def faster_heuristics(self, faster_heuristics):
         self._faster_heuristics = bool(faster_heuristics)
         self._reload_special_cases()

@@ -390,6 +399,7 @@ cdef class Tokenizer:
                                       has_special, with_special_cases)
             self._attach_tokens(tokens, span, &prefixes, &suffixes, has_special,
                                 with_special_cases)
+            if len(self._cache) < self.max_cache_size:
                 self._save_cached(&tokens.c[orig_size], orig_key, has_special,
                                   tokens.length - orig_size)

@@ -507,8 +517,7 @@ cdef class Tokenizer:
         if n <= 0:
             # avoid mem alloc of zero length
             return 0
-        for i in range(n):
-            if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
+        if self.vocab.in_memory_zone:
             return 0
         # See #1250
         if has_special[0]:
|
@ -667,7 +667,8 @@ cdef class Doc:
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
property vector:
|
@property
|
||||||
|
def vector(self):
|
||||||
"""A real-valued meaning representation. Defaults to an average of the
|
"""A real-valued meaning representation. Defaults to an average of the
|
||||||
token vectors.
|
token vectors.
|
||||||
|
|
||||||
|
@ -676,7 +677,6 @@ cdef class Doc:
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/doc#vector
|
DOCS: https://spacy.io/api/doc#vector
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
|
||||||
if "vector" in self.user_hooks:
|
if "vector" in self.user_hooks:
|
||||||
return self.user_hooks["vector"](self)
|
return self.user_hooks["vector"](self)
|
||||||
if self._vector is not None:
|
if self._vector is not None:
|
||||||
|
@ -694,17 +694,18 @@ cdef class Doc:
|
||||||
else:
|
else:
|
||||||
return xp.zeros((self.vocab.vectors_length,), dtype="float32")
|
return xp.zeros((self.vocab.vectors_length,), dtype="float32")
|
||||||
|
|
||||||
def __set__(self, value):
|
@vector.setter
|
||||||
|
def vector(self, value):
|
||||||
self._vector = value
|
self._vector = value
|
||||||
|
|
||||||
property vector_norm:
|
@property
|
||||||
|
def vector_norm(self):
|
||||||
"""The L2 norm of the document's vector representation.
|
"""The L2 norm of the document's vector representation.
|
||||||
|
|
||||||
RETURNS (float): The L2 norm of the vector representation.
|
RETURNS (float): The L2 norm of the vector representation.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/doc#vector_norm
|
DOCS: https://spacy.io/api/doc#vector_norm
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
|
||||||
if "vector_norm" in self.user_hooks:
|
if "vector_norm" in self.user_hooks:
|
||||||
return self.user_hooks["vector_norm"](self)
|
return self.user_hooks["vector_norm"](self)
|
||||||
cdef float value
|
cdef float value
|
||||||
|
@ -716,7 +717,8 @@ cdef class Doc:
|
||||||
self._vector_norm = sqrt(norm) if norm != 0 else 0
|
self._vector_norm = sqrt(norm) if norm != 0 else 0
|
||||||
return self._vector_norm
|
return self._vector_norm
|
||||||
|
|
||||||
def __set__(self, value):
|
@vector_norm.setter
|
||||||
|
def vector_norm(self, value):
|
||||||
self._vector_norm = value
|
self._vector_norm = value
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -736,7 +738,8 @@ cdef class Doc:
|
||||||
"""
|
"""
|
||||||
return self.text
|
return self.text
|
||||||
|
|
||||||
property ents:
|
@property
|
||||||
|
def ents(self):
|
||||||
"""The named entities in the document. Returns a tuple of named entity
|
"""The named entities in the document. Returns a tuple of named entity
|
||||||
`Span` objects, if the entity recognizer has been applied.
|
`Span` objects, if the entity recognizer has been applied.
|
||||||
|
|
||||||
|
@ -744,7 +747,6 @@ cdef class Doc:
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/doc#ents
|
DOCS: https://spacy.io/api/doc#ents
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
|
||||||
cdef int i
|
cdef int i
|
||||||
cdef const TokenC* token
|
cdef const TokenC* token
|
||||||
cdef int start = -1
|
cdef int start = -1
|
||||||
|
@ -779,7 +781,8 @@ cdef class Doc:
|
||||||
output = [o for o in output if o.label_ != ""]
|
output = [o for o in output if o.label_ != ""]
|
||||||
return tuple(output)
|
return tuple(output)
|
||||||
|
|
||||||
def __set__(self, ents):
|
@ents.setter
|
||||||
|
def ents(self, ents):
|
||||||
# TODO:
|
# TODO:
|
||||||
# 1. Test basic data-driven ORTH gazetteer
|
# 1. Test basic data-driven ORTH gazetteer
|
||||||
# 2. Test more nuanced date and currency regex
|
# 2. Test more nuanced date and currency regex
|
||||||
|
|
|
@ -757,77 +757,86 @@ cdef class Span:
|
||||||
for word in self.rights:
|
for word in self.rights:
|
||||||
yield from word.subtree
|
yield from word.subtree
|
||||||
|
|
||||||
property start:
|
@property
|
||||||
def __get__(self):
|
def start(self):
|
||||||
return self.c.start
|
return self.c.start
|
||||||
|
|
||||||
def __set__(self, int start):
|
@start.setter
|
||||||
|
def start(self, int start):
|
||||||
if start < 0:
|
if start < 0:
|
||||||
raise IndexError(Errors.E1032.format(var="start", forbidden="< 0", value=start))
|
raise IndexError(Errors.E1032.format(var="start", forbidden="< 0", value=start))
|
||||||
self.c.start = start
|
self.c.start = start
|
||||||
|
|
||||||
property end:
|
@property
|
||||||
def __get__(self):
|
def end(self):
|
||||||
return self.c.end
|
return self.c.end
|
||||||
|
|
||||||
def __set__(self, int end):
|
@end.setter
|
||||||
|
def end(self, int end):
|
||||||
if end < 0:
|
if end < 0:
|
||||||
raise IndexError(Errors.E1032.format(var="end", forbidden="< 0", value=end))
|
raise IndexError(Errors.E1032.format(var="end", forbidden="< 0", value=end))
|
||||||
self.c.end = end
|
self.c.end = end
|
||||||
|
|
||||||
property start_char:
|
@property
|
||||||
def __get__(self):
|
def start_char(self):
|
||||||
return self.c.start_char
|
return self.c.start_char
|
||||||
|
|
||||||
def __set__(self, int start_char):
|
@start_char.setter
|
||||||
|
def start_char(self, int start_char):
|
||||||
if start_char < 0:
|
if start_char < 0:
|
||||||
raise IndexError(Errors.E1032.format(var="start_char", forbidden="< 0", value=start_char))
|
raise IndexError(Errors.E1032.format(var="start_char", forbidden="< 0", value=start_char))
|
||||||
self.c.start_char = start_char
|
self.c.start_char = start_char
|
||||||
|
|
||||||
property end_char:
|
@property
|
||||||
def __get__(self):
|
def end_char(self):
|
||||||
return self.c.end_char
|
return self.c.end_char
|
||||||
|
|
||||||
def __set__(self, int end_char):
|
@end_char.setter
|
||||||
|
def end_char(self, int end_char):
|
||||||
if end_char < 0:
|
if end_char < 0:
|
||||||
raise IndexError(Errors.E1032.format(var="end_char", forbidden="< 0", value=end_char))
|
raise IndexError(Errors.E1032.format(var="end_char", forbidden="< 0", value=end_char))
|
||||||
self.c.end_char = end_char
|
self.c.end_char = end_char
|
||||||
|
|
||||||
property label:
|
@property
|
||||||
def __get__(self):
|
def label(self):
|
||||||
return self.c.label
|
return self.c.label
|
||||||
|
|
||||||
def __set__(self, attr_t label):
|
@label.setter
|
||||||
|
def label(self, attr_t label):
|
||||||
self.c.label = label
|
self.c.label = label
|
||||||
|
|
||||||
property kb_id:
|
@property
|
||||||
def __get__(self):
|
def kb_id(self):
|
||||||
return self.c.kb_id
|
return self.c.kb_id
|
||||||
|
|
||||||
def __set__(self, attr_t kb_id):
|
@kb_id.setter
|
||||||
|
def kb_id(self, attr_t kb_id):
|
||||||
self.c.kb_id = kb_id
|
self.c.kb_id = kb_id
|
||||||
|
|
||||||
property id:
|
@property
|
||||||
def __get__(self):
|
def id(self):
|
||||||
return self.c.id
|
return self.c.id
|
||||||
|
|
||||||
def __set__(self, attr_t id):
|
@id.setter
|
||||||
|
def id(self, attr_t id):
|
||||||
self.c.id = id
|
self.c.id = id
|
||||||
|
|
||||||
property ent_id:
|
@property
|
||||||
|
def ent_id(self):
|
||||||
"""RETURNS (uint64): The entity ID."""
|
"""RETURNS (uint64): The entity ID."""
|
||||||
def __get__(self):
|
|
||||||
return self.root.ent_id
|
return self.root.ent_id
|
||||||
|
|
||||||
def __set__(self, hash_t key):
|
@ent_id.setter
|
||||||
|
def ent_id(self, hash_t key):
|
||||||
raise NotImplementedError(Errors.E200.format(attr="ent_id"))
|
raise NotImplementedError(Errors.E200.format(attr="ent_id"))
|
||||||
|
|
||||||
property ent_id_:
|
@property
|
||||||
|
def ent_id_(self):
|
||||||
"""RETURNS (str): The (string) entity ID."""
|
"""RETURNS (str): The (string) entity ID."""
|
||||||
def __get__(self):
|
|
||||||
return self.root.ent_id_
|
return self.root.ent_id_
|
||||||
|
|
||||||
def __set__(self, str key):
|
@ent_id_.setter
|
||||||
|
def ent_id_(self, str key):
|
||||||
raise NotImplementedError(Errors.E200.format(attr="ent_id_"))
|
raise NotImplementedError(Errors.E200.format(attr="ent_id_"))
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -843,28 +852,31 @@ cdef class Span:
|
||||||
"""RETURNS (str): The span's lemma."""
|
"""RETURNS (str): The span's lemma."""
|
||||||
return "".join([t.lemma_ + t.whitespace_ for t in self]).strip()
|
return "".join([t.lemma_ + t.whitespace_ for t in self]).strip()
|
||||||
|
|
||||||
property label_:
|
@property
|
||||||
|
def label_(self):
|
||||||
"""RETURNS (str): The span's label."""
|
"""RETURNS (str): The span's label."""
|
||||||
def __get__(self):
|
|
||||||
return self.doc.vocab.strings[self.label]
|
return self.doc.vocab.strings[self.label]
|
||||||
|
|
||||||
def __set__(self, str label_):
|
@label_.setter
|
||||||
|
def label_(self, str label_):
|
||||||
self.label = self.doc.vocab.strings.add(label_)
|
self.label = self.doc.vocab.strings.add(label_)
|
||||||
|
|
||||||
property kb_id_:
|
@property
|
||||||
|
def kb_id_(self):
|
||||||
"""RETURNS (str): The span's KB ID."""
|
"""RETURNS (str): The span's KB ID."""
|
||||||
def __get__(self):
|
|
||||||
return self.doc.vocab.strings[self.kb_id]
|
return self.doc.vocab.strings[self.kb_id]
|
||||||
|
|
||||||
def __set__(self, str kb_id_):
|
@kb_id_.setter
|
||||||
|
def kb_id_(self, str kb_id_):
|
||||||
self.kb_id = self.doc.vocab.strings.add(kb_id_)
|
self.kb_id = self.doc.vocab.strings.add(kb_id_)
|
||||||
|
|
||||||
property id_:
|
@property
|
||||||
|
def id_(self):
|
||||||
"""RETURNS (str): The span's ID."""
|
"""RETURNS (str): The span's ID."""
|
||||||
def __get__(self):
|
|
||||||
return self.doc.vocab.strings[self.id]
|
return self.doc.vocab.strings[self.id]
|
||||||
|
|
||||||
def __set__(self, str id_):
|
@id_.setter
|
||||||
|
def id_(self, str id_):
|
||||||
self.id = self.doc.vocab.strings.add(id_)
|
self.id = self.doc.vocab.strings.add(id_)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -249,11 +249,12 @@ cdef class Token:
|
||||||
"""
|
"""
|
||||||
return not self.c.morph == 0
|
return not self.c.morph == 0
|
||||||
|
|
||||||
property morph:
|
@property
|
||||||
def __get__(self):
|
def morph(self):
|
||||||
return MorphAnalysis.from_id(self.vocab, self.c.morph)
|
return MorphAnalysis.from_id(self.vocab, self.c.morph)
|
||||||
|
|
||||||
def __set__(self, MorphAnalysis morph):
|
@morph.setter
|
||||||
|
def morph(self, MorphAnalysis morph):
|
||||||
# Check that the morph has the same vocab
|
# Check that the morph has the same vocab
|
||||||
if self.vocab != morph.vocab:
|
if self.vocab != morph.vocab:
|
||||||
raise ValueError(Errors.E1013)
|
raise ValueError(Errors.E1013)
|
||||||
|
@ -377,38 +378,42 @@ cdef class Token:
|
||||||
"""
|
"""
|
||||||
return self.c.lex.suffix
|
return self.c.lex.suffix
|
||||||
|
|
||||||
property lemma:
|
@property
|
||||||
|
def lemma(self):
|
||||||
"""RETURNS (uint64): ID of the base form of the word, with no
|
"""RETURNS (uint64): ID of the base form of the word, with no
|
||||||
inflectional suffixes.
|
inflectional suffixes.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
|
||||||
return self.c.lemma
|
return self.c.lemma
|
||||||
|
|
||||||
def __set__(self, attr_t lemma):
|
@lemma.setter
|
||||||
|
def lemma(self, attr_t lemma):
|
||||||
self.c.lemma = lemma
|
self.c.lemma = lemma
|
||||||
|
|
||||||
property pos:
|
@property
|
||||||
|
def pos(self):
|
||||||
"""RETURNS (uint64): ID of coarse-grained part-of-speech tag."""
|
"""RETURNS (uint64): ID of coarse-grained part-of-speech tag."""
|
||||||
def __get__(self):
|
|
||||||
return self.c.pos
|
return self.c.pos
|
||||||
|
|
||||||
def __set__(self, pos):
|
@pos.setter
|
||||||
|
def pos(self, pos):
|
||||||
self.c.pos = pos
|
self.c.pos = pos
|
||||||
|
|
||||||
property tag:
|
@property
|
||||||
|
def tag(self):
|
||||||
"""RETURNS (uint64): ID of fine-grained part-of-speech tag."""
|
"""RETURNS (uint64): ID of fine-grained part-of-speech tag."""
|
||||||
def __get__(self):
|
|
||||||
return self.c.tag
|
return self.c.tag
|
||||||
|
|
||||||
def __set__(self, attr_t tag):
|
@tag.setter
|
||||||
|
def tag(self, attr_t tag):
|
||||||
self.c.tag = tag
|
self.c.tag = tag
|
||||||
|
|
||||||
property dep:
|
@property
|
||||||
|
def dep(self):
|
||||||
"""RETURNS (uint64): ID of syntactic dependency label."""
|
"""RETURNS (uint64): ID of syntactic dependency label."""
|
||||||
def __get__(self):
|
|
||||||
return self.c.dep
|
return self.c.dep
|
||||||
|
|
||||||
def __set__(self, attr_t label):
|
@dep.setter
|
||||||
|
def dep(self, attr_t label):
|
||||||
self.c.dep = label
|
self.c.dep = label
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -494,8 +499,8 @@ cdef class Token:
|
||||||
return self.doc.user_token_hooks["sent"](self)
|
return self.doc.user_token_hooks["sent"](self)
|
||||||
return self.doc[self.i : self.i+1].sent
|
return self.doc[self.i : self.i+1].sent
|
||||||
|
|
||||||
property sent_start:
|
@property
|
||||||
def __get__(self):
|
def sent_start(self):
|
||||||
"""Deprecated: use Token.is_sent_start instead."""
|
"""Deprecated: use Token.is_sent_start instead."""
|
||||||
# Raising a deprecation warning here causes errors for autocomplete
|
# Raising a deprecation warning here causes errors for autocomplete
|
||||||
# Handle broken backwards compatibility case: doc[0].sent_start
|
# Handle broken backwards compatibility case: doc[0].sent_start
|
||||||
|
@ -505,17 +510,18 @@ cdef class Token:
|
||||||
else:
|
else:
|
||||||
return self.c.sent_start
|
return self.c.sent_start
|
||||||
|
|
||||||
def __set__(self, value):
|
@sent_start.setter
|
||||||
|
def sent_start(self, value):
|
||||||
self.is_sent_start = value
|
self.is_sent_start = value
|
||||||
|
|
||||||
property is_sent_start:
|
@property
|
||||||
|
def is_sent_start(self):
|
||||||
"""A boolean value indicating whether the token starts a sentence.
|
"""A boolean value indicating whether the token starts a sentence.
|
||||||
`None` if unknown. Defaults to `True` for the first token in the `Doc`.
|
`None` if unknown. Defaults to `True` for the first token in the `Doc`.
|
||||||
|
|
||||||
RETURNS (bool / None): Whether the token starts a sentence.
|
RETURNS (bool / None): Whether the token starts a sentence.
|
||||||
None if unknown.
|
None if unknown.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
|
||||||
if self.c.sent_start == 0:
|
if self.c.sent_start == 0:
|
||||||
return None
|
return None
|
||||||
elif self.c.sent_start < 0:
|
elif self.c.sent_start < 0:
|
||||||
|
@ -523,7 +529,8 @@ cdef class Token:
|
||||||
else:
|
else:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def __set__(self, value):
|
@is_sent_start.setter
|
||||||
|
def is_sent_start(self, value):
|
||||||
if self.doc.has_annotation("DEP"):
|
if self.doc.has_annotation("DEP"):
|
||||||
raise ValueError(Errors.E043)
|
raise ValueError(Errors.E043)
|
||||||
if value is None:
|
if value is None:
|
||||||
|
@ -535,7 +542,8 @@ cdef class Token:
|
||||||
else:
|
else:
|
||||||
raise ValueError(Errors.E044.format(value=value))
|
raise ValueError(Errors.E044.format(value=value))
|
||||||
|
|
||||||
property is_sent_end:
|
@property
|
||||||
|
def is_sent_end(self):
|
||||||
"""A boolean value indicating whether the token ends a sentence.
|
"""A boolean value indicating whether the token ends a sentence.
|
||||||
`None` if unknown. Defaults to `True` for the last token in the `Doc`.
|
`None` if unknown. Defaults to `True` for the last token in the `Doc`.
|
||||||
|
|
||||||
|
@ -544,7 +552,6 @@ cdef class Token:
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/token#is_sent_end
|
DOCS: https://spacy.io/api/token#is_sent_end
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
|
||||||
if self.i + 1 == len(self.doc):
|
if self.i + 1 == len(self.doc):
|
||||||
return True
|
return True
|
||||||
elif self.doc[self.i+1].is_sent_start is None:
|
elif self.doc[self.i+1].is_sent_start is None:
|
||||||
|
@ -554,7 +561,8 @@ cdef class Token:
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def __set__(self, value):
|
@is_sent_end.setter
|
||||||
|
def is_sent_end(self, value):
|
||||||
raise ValueError(Errors.E196)
|
raise ValueError(Errors.E196)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -682,20 +690,21 @@ cdef class Token:
|
||||||
"""
|
"""
|
||||||
return not Token.missing_head(self.c)
|
return not Token.missing_head(self.c)
|
||||||
|
|
||||||
property head:
|
@property
|
||||||
|
def head(self):
|
||||||
"""The syntactic parent, or "governor", of this token.
|
"""The syntactic parent, or "governor", of this token.
|
||||||
If token.has_head() is `False`, this method will return itself.
|
If token.has_head() is `False`, this method will return itself.
|
||||||
|
|
||||||
RETURNS (Token): The token predicted by the parser to be the head of
|
RETURNS (Token): The token predicted by the parser to be the head of
|
||||||
the current token.
|
the current token.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
|
||||||
if not self.has_head():
|
if not self.has_head():
|
||||||
return self
|
return self
|
||||||
else:
|
else:
|
||||||
return self.doc[self.i + self.c.head]
|
return self.doc[self.i + self.c.head]
|
||||||
|
|
||||||
def __set__(self, Token new_head):
|
@head.setter
|
||||||
|
def head(self, Token new_head):
|
||||||
# This function sets the head of self to new_head and updates the
|
# This function sets the head of self to new_head and updates the
|
||||||
# counters for left/right dependents and left/right corner for the
|
# counters for left/right dependents and left/right corner for the
|
||||||
# new and the old head
|
# new and the old head
|
||||||
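A hedged reading example for the `head` property documented above (it assumes the small English model is installed via `python -m spacy download en_core_web_sm`; the sentence is illustrative):

```python
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability")
for token in doc:
    # Each token and its syntactic governor; the root token is its own head.
    print(token.text, "->", token.head.text)
```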
|
@ -744,20 +753,22 @@ cdef class Token:
|
||||||
queue.append(child)
|
queue.append(child)
|
||||||
return tuple([w for w in output if w.i != self.i])
|
return tuple([w for w in output if w.i != self.i])
|
||||||
|
|
||||||
property ent_type:
|
@property
|
||||||
|
def ent_type(self):
|
||||||
"""RETURNS (uint64): Named entity type."""
|
"""RETURNS (uint64): Named entity type."""
|
||||||
def __get__(self):
|
|
||||||
return self.c.ent_type
|
return self.c.ent_type
|
||||||
|
|
||||||
def __set__(self, ent_type):
|
@ent_type.setter
|
||||||
|
def ent_type(self, ent_type):
|
||||||
self.c.ent_type = ent_type
|
self.c.ent_type = ent_type
|
||||||
|
|
||||||
property ent_type_:
|
@property
|
||||||
|
def ent_type_(self):
|
||||||
"""RETURNS (str): Named entity type."""
|
"""RETURNS (str): Named entity type."""
|
||||||
def __get__(self):
|
|
||||||
return self.vocab.strings[self.c.ent_type]
|
return self.vocab.strings[self.c.ent_type]
|
||||||
|
|
||||||
def __set__(self, ent_type):
|
@ent_type_.setter
|
||||||
|
def ent_type_(self, ent_type):
|
||||||
self.c.ent_type = self.vocab.strings.add(ent_type)
|
self.c.ent_type = self.vocab.strings.add(ent_type)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -784,40 +795,44 @@ cdef class Token:
|
||||||
"""
|
"""
|
||||||
return self.iob_strings()[self.c.ent_iob]
|
return self.iob_strings()[self.c.ent_iob]
|
||||||
|
|
||||||
property ent_id:
|
@property
|
||||||
|
def ent_id(self):
|
||||||
"""RETURNS (uint64): ID of the entity the token is an instance of,
|
"""RETURNS (uint64): ID of the entity the token is an instance of,
|
||||||
if any.
|
if any.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
|
||||||
return self.c.ent_id
|
return self.c.ent_id
|
||||||
|
|
||||||
def __set__(self, hash_t key):
|
@ent_id.setter
|
||||||
|
def ent_id(self, hash_t key):
|
||||||
self.c.ent_id = key
|
self.c.ent_id = key
|
||||||
|
|
||||||
property ent_id_:
|
@property
|
||||||
|
def ent_id_(self):
|
||||||
"""RETURNS (str): ID of the entity the token is an instance of,
|
"""RETURNS (str): ID of the entity the token is an instance of,
|
||||||
if any.
|
if any.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
|
||||||
return self.vocab.strings[self.c.ent_id]
|
return self.vocab.strings[self.c.ent_id]
|
||||||
|
|
||||||
def __set__(self, name):
|
@ent_id_.setter
|
||||||
|
def ent_id_(self, name):
|
||||||
self.c.ent_id = self.vocab.strings.add(name)
|
self.c.ent_id = self.vocab.strings.add(name)
|
||||||
|
|
||||||
property ent_kb_id:
|
@property
|
||||||
|
def ent_kb_id(self):
|
||||||
"""RETURNS (uint64): Named entity KB ID."""
|
"""RETURNS (uint64): Named entity KB ID."""
|
||||||
def __get__(self):
|
|
||||||
return self.c.ent_kb_id
|
return self.c.ent_kb_id
|
||||||
|
|
||||||
def __set__(self, attr_t ent_kb_id):
|
@ent_kb_id.setter
|
||||||
|
def ent_kb_id(self, attr_t ent_kb_id):
|
||||||
self.c.ent_kb_id = ent_kb_id
|
self.c.ent_kb_id = ent_kb_id
|
||||||
|
|
||||||
property ent_kb_id_:
|
@property
|
||||||
|
def ent_kb_id_(self):
|
||||||
"""RETURNS (str): Named entity KB ID."""
|
"""RETURNS (str): Named entity KB ID."""
|
||||||
def __get__(self):
|
|
||||||
return self.vocab.strings[self.c.ent_kb_id]
|
return self.vocab.strings[self.c.ent_kb_id]
|
||||||
|
|
||||||
def __set__(self, ent_kb_id):
|
@ent_kb_id_.setter
|
||||||
|
def ent_kb_id_(self, ent_kb_id):
|
||||||
self.c.ent_kb_id = self.vocab.strings.add(ent_kb_id)
|
self.c.ent_kb_id = self.vocab.strings.add(ent_kb_id)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -840,15 +855,16 @@ cdef class Token:
|
||||||
"""
|
"""
|
||||||
return self.vocab.strings[self.c.lex.lower]
|
return self.vocab.strings[self.c.lex.lower]
|
||||||
|
|
||||||
property norm_:
|
@property
|
||||||
|
def norm_(self):
|
||||||
"""RETURNS (str): The token's norm, i.e. a normalised form of the
|
"""RETURNS (str): The token's norm, i.e. a normalised form of the
|
||||||
token text. Usually set in the language's tokenizer exceptions or
|
token text. Usually set in the language's tokenizer exceptions or
|
||||||
norm exceptions.
|
norm exceptions.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
|
||||||
return self.vocab.strings[self.norm]
|
return self.vocab.strings[self.norm]
|
||||||
|
|
||||||
def __set__(self, str norm_):
|
@norm_.setter
|
||||||
|
def norm_(self, str norm_):
|
||||||
self.c.norm = self.vocab.strings.add(norm_)
|
self.c.norm = self.vocab.strings.add(norm_)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -879,32 +895,35 @@ cdef class Token:
|
||||||
"""
|
"""
|
||||||
return self.vocab.strings[self.c.lex.lang]
|
return self.vocab.strings[self.c.lex.lang]
|
||||||
|
|
||||||
property lemma_:
|
@property
|
||||||
|
def lemma_(self):
|
||||||
"""RETURNS (str): The token lemma, i.e. the base form of the word,
|
"""RETURNS (str): The token lemma, i.e. the base form of the word,
|
||||||
with no inflectional suffixes.
|
with no inflectional suffixes.
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
|
||||||
return self.vocab.strings[self.c.lemma]
|
return self.vocab.strings[self.c.lemma]
|
||||||
|
|
||||||
def __set__(self, str lemma_):
|
@lemma_.setter
|
||||||
|
def lemma_(self, str lemma_):
|
||||||
self.c.lemma = self.vocab.strings.add(lemma_)
|
self.c.lemma = self.vocab.strings.add(lemma_)
|
||||||
|
|
||||||
property pos_:
|
@property
|
||||||
|
def pos_(self):
|
||||||
"""RETURNS (str): Coarse-grained part-of-speech tag."""
|
"""RETURNS (str): Coarse-grained part-of-speech tag."""
|
||||||
def __get__(self):
|
|
||||||
return parts_of_speech.NAMES[self.c.pos]
|
return parts_of_speech.NAMES[self.c.pos]
|
||||||
|
|
||||||
def __set__(self, pos_name):
|
@pos_.setter
|
||||||
|
def pos_(self, pos_name):
|
||||||
if pos_name not in parts_of_speech.IDS:
|
if pos_name not in parts_of_speech.IDS:
|
||||||
raise ValueError(Errors.E1021.format(pp=pos_name))
|
raise ValueError(Errors.E1021.format(pp=pos_name))
|
||||||
self.c.pos = parts_of_speech.IDS[pos_name]
|
self.c.pos = parts_of_speech.IDS[pos_name]
|
||||||
|
|
||||||
property tag_:
|
@property
|
||||||
|
def tag_(self):
|
||||||
"""RETURNS (str): Fine-grained part-of-speech tag."""
|
"""RETURNS (str): Fine-grained part-of-speech tag."""
|
||||||
def __get__(self):
|
|
||||||
return self.vocab.strings[self.c.tag]
|
return self.vocab.strings[self.c.tag]
|
||||||
|
|
||||||
def __set__(self, tag):
|
@tag_.setter
|
||||||
|
def tag_(self, tag):
|
||||||
self.tag = self.vocab.strings.add(tag)
|
self.tag = self.vocab.strings.add(tag)
|
||||||
|
|
||||||
def has_dep(self):
|
def has_dep(self):
|
||||||
|
@ -915,12 +934,13 @@ cdef class Token:
|
||||||
"""
|
"""
|
||||||
return not Token.missing_dep(self.c)
|
return not Token.missing_dep(self.c)
|
||||||
|
|
||||||
property dep_:
|
@property
|
||||||
|
def dep_(self):
|
||||||
"""RETURNS (str): The syntactic dependency label."""
|
"""RETURNS (str): The syntactic dependency label."""
|
||||||
def __get__(self):
|
|
||||||
return self.vocab.strings[self.c.dep]
|
return self.vocab.strings[self.c.dep]
|
||||||
|
|
||||||
def __set__(self, str label):
|
@dep_.setter
|
||||||
|
def dep_(self, str label):
|
||||||
self.c.dep = self.vocab.strings.add(label)
|
self.c.dep = self.vocab.strings.add(label)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|
|
@ -88,20 +88,22 @@ cdef class Example:
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return len(self.predicted)
|
return len(self.predicted)
|
||||||
|
|
||||||
property predicted:
|
@property
|
||||||
def __get__(self):
|
def predicted(self):
|
||||||
return self.x
|
return self.x
|
||||||
|
|
||||||
def __set__(self, doc):
|
@predicted.setter
|
||||||
|
def predicted(self, doc):
|
||||||
self.x = doc
|
self.x = doc
|
||||||
self._cached_alignment = None
|
self._cached_alignment = None
|
||||||
self._cached_words_x = [t.text for t in doc]
|
self._cached_words_x = [t.text for t in doc]
|
||||||
|
|
||||||
property reference:
|
@property
|
||||||
def __get__(self):
|
def reference(self):
|
||||||
return self.y
|
return self.y
|
||||||
|
|
||||||
def __set__(self, doc):
|
@reference.setter
|
||||||
|
def reference(self, doc):
|
||||||
self.y = doc
|
self.y = doc
|
||||||
self._cached_alignment = None
|
self._cached_alignment = None
|
||||||
self._cached_words_y = [t.text for t in doc]
|
self._cached_words_y = [t.text for t in doc]
|
||||||
|
@ -420,8 +422,8 @@ cdef class Example:
|
||||||
seen_indices.update(indices)
|
seen_indices.update(indices)
|
||||||
return output
|
return output
|
||||||
|
|
||||||
property text:
|
@property
|
||||||
def __get__(self):
|
def text(self):
|
||||||
return self.x.text
|
return self.x.text
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
|
|
|
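The `predicted` and `reference` properties above expose the two `Doc` objects an `Example` aligns. A hedged construction sketch (the token texts and tags are illustrative):

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
predicted = nlp("I like cats")                      # Doc produced by the pipeline
example = Example.from_dict(
    predicted,
    {"words": ["I", "like", "cats"], "tags": ["PRON", "VERB", "NOUN"]},
)
print(example.predicted.text)      # the pipeline's Doc (the `x` side)
print(example.reference[2].tag_)   # gold-standard tag for "cats"
print(example.text)                # convenience property, same as example.predicted.text
```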
@ -41,7 +41,9 @@ cdef class Vocab:
|
||||||
cdef const TokenC* make_fused_token(self, substrings) except NULL
|
cdef const TokenC* make_fused_token(self, substrings) except NULL
|
||||||
|
|
||||||
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
|
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
|
||||||
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
|
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1
|
||||||
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
|
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
|
||||||
|
|
||||||
cdef PreshMap _by_orth
|
cdef PreshMap _by_orth
|
||||||
|
cdef Pool _non_temp_mem
|
||||||
|
cdef vector[attr_t] _transient_orths
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
|
from contextlib import contextmanager
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Union
|
from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Union
|
||||||
|
|
||||||
|
from cymem.cymem import Pool
|
||||||
from thinc.types import Floats1d, FloatsXd
|
from thinc.types import Floats1d, FloatsXd
|
||||||
|
|
||||||
from . import Language
|
from . import Language
|
||||||
|
@ -67,6 +69,8 @@ class Vocab:
|
||||||
def from_bytes(
|
def from_bytes(
|
||||||
self, bytes_data: bytes, *, exclude: Iterable[str] = ...
|
self, bytes_data: bytes, *, exclude: Iterable[str] = ...
|
||||||
) -> Vocab: ...
|
) -> Vocab: ...
|
||||||
|
@contextmanager
|
||||||
|
def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]: ...
|
||||||
|
|
||||||
def pickle_vocab(vocab: Vocab) -> Any: ...
|
def pickle_vocab(vocab: Vocab) -> Any: ...
|
||||||
def unpickle_vocab(
|
def unpickle_vocab(
|
||||||
|
|
|
@ -1,8 +1,11 @@
|
||||||
import functools
|
import functools
|
||||||
|
from contextlib import ExitStack, contextmanager
|
||||||
|
from typing import Iterator, Optional
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
import srsly
|
import srsly
|
||||||
from thinc.api import get_array_module, get_current_ops
|
from thinc.api import get_array_module, get_current_ops
|
||||||
|
from preshed.maps cimport map_clear
|
||||||
|
|
||||||
from .attrs cimport LANG, ORTH
|
from .attrs cimport LANG, ORTH
|
||||||
from .lexeme cimport EMPTY_LEXEME, OOV_RANK, Lexeme
|
from .lexeme cimport EMPTY_LEXEME, OOV_RANK, Lexeme
|
||||||
|
@ -87,15 +90,22 @@ cdef class Vocab:
|
||||||
self.lookups = lookups
|
self.lookups = lookups
|
||||||
self.writing_system = writing_system
|
self.writing_system = writing_system
|
||||||
self.get_noun_chunks = get_noun_chunks
|
self.get_noun_chunks = get_noun_chunks
|
||||||
|
# During a memory_zone we replace our mem object with one
|
||||||
|
# that's passed to us. We keep a reference to our non-temporary
|
||||||
|
# memory here, in case we need to make an allocation we want to
|
||||||
|
# guarantee is not temporary. This is also how we check whether
|
||||||
|
# we're in a memory zone: we check whether self.mem is self._non_temp_mem
|
||||||
|
self._non_temp_mem = self.mem
|
||||||
|
|
||||||
property vectors:
|
@property
|
||||||
def __get__(self):
|
def vectors(self):
|
||||||
return self._vectors
|
return self._vectors
|
||||||
|
|
||||||
def __set__(self, vectors):
|
@vectors.setter
|
||||||
|
def vectors(self, vectors):
|
||||||
if hasattr(vectors, "strings"):
|
if hasattr(vectors, "strings"):
|
||||||
for s in vectors.strings:
|
for s in vectors.strings:
|
||||||
self.strings.add(s)
|
self.strings.add(s, allow_transient=False)
|
||||||
self._vectors = vectors
|
self._vectors = vectors
|
||||||
self._vectors.strings = self.strings
|
self._vectors.strings = self.strings
|
||||||
|
|
||||||
|
@ -106,6 +116,10 @@ cdef class Vocab:
|
||||||
langfunc = self.lex_attr_getters.get(LANG, None)
|
langfunc = self.lex_attr_getters.get(LANG, None)
|
||||||
return langfunc("_") if langfunc else ""
|
return langfunc("_") if langfunc else ""
|
||||||
|
|
||||||
|
@property
|
||||||
|
def in_memory_zone(self) -> bool:
|
||||||
|
return self.mem is not self._non_temp_mem
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
"""The current number of lexemes stored.
|
"""The current number of lexemes stored.
|
||||||
|
|
||||||
|
@ -113,6 +127,33 @@ cdef class Vocab:
|
||||||
"""
|
"""
|
||||||
return self.length
|
return self.length
|
||||||
|
|
||||||
|
@contextmanager
|
||||||
|
def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]:
|
||||||
|
"""Begin a block where resources allocated during the block will
|
||||||
|
be freed at the end of it. If a resource was created within the
|
||||||
|
memory zone block, accessing it outside the block is invalid.
|
||||||
|
Behaviour of this invalid access is undefined. Memory zones should
|
||||||
|
not be nested.
|
||||||
|
|
||||||
|
The memory zone is helpful for services that need to process large
|
||||||
|
volumes of text with a defined memory budget.
|
||||||
|
"""
|
||||||
|
if mem is None:
|
||||||
|
mem = Pool()
|
||||||
|
# The ExitStack allows programmatic nested context managers.
|
||||||
|
# We don't know how many we need, so it would be awkward to have
|
||||||
|
# them as nested blocks.
|
||||||
|
with ExitStack() as stack:
|
||||||
|
contexts = [stack.enter_context(self.strings.memory_zone(mem))]
|
||||||
|
if hasattr(self.morphology, "memory_zone"):
|
||||||
|
contexts.append(stack.enter_context(self.morphology.memory_zone(mem)))
|
||||||
|
if hasattr(self._vectors, "memory_zone"):
|
||||||
|
contexts.append(stack.enter_context(self._vectors.memory_zone(mem)))
|
||||||
|
self.mem = mem
|
||||||
|
yield mem
|
||||||
|
self._clear_transient_orths()
|
||||||
|
self.mem = self._non_temp_mem
|
||||||
|
|
||||||
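The `memory_zone` context manager added above is the user-facing entry point for transient allocations: strings and lexemes created inside the block are released when it exits. A hedged usage sketch against the API shape introduced in this diff (the texts are illustrative):

```python
import spacy

nlp = spacy.blank("en")
texts = ["some transient batch of text", "another short document"]

# Vocab and string-store entries created while processing these texts are
# freed when the zone closes, which keeps long-running services inside a
# fixed memory budget. Docs created inside the zone must not be used after
# the block exits.
with nlp.vocab.memory_zone():
    for doc in nlp.pipe(texts):
        print(doc[0].text)
```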
def add_flag(self, flag_getter, int flag_id=-1):
|
def add_flag(self, flag_getter, int flag_id=-1):
|
||||||
"""Set a new boolean flag to words in the vocabulary.
|
"""Set a new boolean flag to words in the vocabulary.
|
||||||
|
|
||||||
|
@ -147,8 +188,7 @@ cdef class Vocab:
|
||||||
|
|
||||||
cdef const LexemeC* get(self, Pool mem, str string) except NULL:
|
cdef const LexemeC* get(self, Pool mem, str string) except NULL:
|
||||||
"""Get a pointer to a `LexemeC` from the lexicon, creating a new
|
"""Get a pointer to a `LexemeC` from the lexicon, creating a new
|
||||||
`Lexeme` if necessary using memory acquired from the given pool. If the
|
`Lexeme` if necessary.
|
||||||
pool is the lexicon's own memory, the lexeme is saved in the lexicon.
|
|
||||||
"""
|
"""
|
||||||
if string == "":
|
if string == "":
|
||||||
return &EMPTY_LEXEME
|
return &EMPTY_LEXEME
|
||||||
|
@ -179,19 +219,11 @@ cdef class Vocab:
|
||||||
return self._new_lexeme(mem, self.strings[orth])
|
return self._new_lexeme(mem, self.strings[orth])
|
||||||
|
|
||||||
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
|
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
|
||||||
# I think this heuristic is bad, and the Vocab should always
|
# The mem argument is deprecated, replaced by memory zones. Same with
|
||||||
# own the lexemes. It avoids weird bugs this way, as it's how the thing
|
# this size heuristic.
|
||||||
# was originally supposed to work. The best solution to the growing
|
|
||||||
# memory use is to periodically reset the vocab, which is an action
|
|
||||||
# that should be up to the user to do (so we don't need to keep track
|
|
||||||
# of the doc ownership).
|
|
||||||
# TODO: Change the C API so that the mem isn't passed in here.
|
|
||||||
mem = self.mem
|
mem = self.mem
|
||||||
# if len(string) < 3 or self.length < 10000:
|
|
||||||
# mem = self.mem
|
|
||||||
cdef bint is_oov = mem is not self.mem
|
|
||||||
lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
|
lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
|
||||||
lex.orth = self.strings.add(string)
|
lex.orth = self.strings.add(string, allow_transient=True)
|
||||||
lex.length = len(string)
|
lex.length = len(string)
|
||||||
if self.vectors is not None and hasattr(self.vectors, "key2row"):
|
if self.vectors is not None and hasattr(self.vectors, "key2row"):
|
||||||
lex.id = self.vectors.key2row.get(lex.orth, OOV_RANK)
|
lex.id = self.vectors.key2row.get(lex.orth, OOV_RANK)
|
||||||
|
@ -201,18 +233,25 @@ cdef class Vocab:
|
||||||
for attr, func in self.lex_attr_getters.items():
|
for attr, func in self.lex_attr_getters.items():
|
||||||
value = func(string)
|
value = func(string)
|
||||||
if isinstance(value, str):
|
if isinstance(value, str):
|
||||||
value = self.strings.add(value)
|
value = self.strings.add(value, allow_transient=True)
|
||||||
if value is not None:
|
if value is not None:
|
||||||
Lexeme.set_struct_attr(lex, attr, value)
|
Lexeme.set_struct_attr(lex, attr, value)
|
||||||
if not is_oov:
|
self._add_lex_to_vocab(lex.orth, lex, self.mem is not self._non_temp_mem)
|
||||||
self._add_lex_to_vocab(lex.orth, lex)
|
|
||||||
if lex == NULL:
|
if lex == NULL:
|
||||||
raise ValueError(Errors.E085.format(string=string))
|
raise ValueError(Errors.E085.format(string=string))
|
||||||
return lex
|
return lex
|
||||||
|
|
||||||
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
|
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1:
|
||||||
self._by_orth.set(lex.orth, <void*>lex)
|
self._by_orth.set(lex.orth, <void*>lex)
|
||||||
self.length += 1
|
self.length += 1
|
||||||
|
if is_transient and self.in_memory_zone:
|
||||||
|
self._transient_orths.push_back(lex.orth)
|
||||||
|
|
||||||
|
def _clear_transient_orths(self):
|
||||||
|
"""Remove transient lexemes from the index (generally at the end of the memory zone)"""
|
||||||
|
for orth in self._transient_orths:
|
||||||
|
map_clear(self._by_orth.c_map, orth)
|
||||||
|
self._transient_orths.clear()
|
||||||
|
|
||||||
def __contains__(self, key):
|
def __contains__(self, key):
|
||||||
"""Check whether the string or int key has an entry in the vocabulary.
|
"""Check whether the string or int key has an entry in the vocabulary.
|
||||||
|
@ -264,7 +303,7 @@ cdef class Vocab:
|
||||||
"""
|
"""
|
||||||
cdef attr_t orth
|
cdef attr_t orth
|
||||||
if isinstance(id_or_string, str):
|
if isinstance(id_or_string, str):
|
||||||
orth = self.strings.add(id_or_string)
|
orth = self.strings.add(id_or_string, allow_transient=True)
|
||||||
else:
|
else:
|
||||||
orth = id_or_string
|
orth = id_or_string
|
||||||
return Lexeme(self, orth)
|
return Lexeme(self, orth)
|
||||||
|
@ -416,7 +455,7 @@ cdef class Vocab:
|
||||||
DOCS: https://spacy.io/api/vocab#get_vector
|
DOCS: https://spacy.io/api/vocab#get_vector
|
||||||
"""
|
"""
|
||||||
if isinstance(orth, str):
|
if isinstance(orth, str):
|
||||||
orth = self.strings.add(orth)
|
orth = self.strings.add(orth, allow_transient=True)
|
||||||
cdef Lexeme lex = self[orth]
|
cdef Lexeme lex = self[orth]
|
||||||
key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
|
key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
|
||||||
if self.has_vector(key):
|
if self.has_vector(key):
|
||||||
|
@ -435,7 +474,7 @@ cdef class Vocab:
|
||||||
DOCS: https://spacy.io/api/vocab#set_vector
|
DOCS: https://spacy.io/api/vocab#set_vector
|
||||||
"""
|
"""
|
||||||
if isinstance(orth, str):
|
if isinstance(orth, str):
|
||||||
orth = self.strings.add(orth)
|
orth = self.strings.add(orth, allow_transient=False)
|
||||||
cdef Lexeme lex = self[orth]
|
cdef Lexeme lex = self[orth]
|
||||||
key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
|
key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
|
||||||
if self.vectors.is_full and key not in self.vectors:
|
if self.vectors.is_full and key not in self.vectors:
|
||||||
|
@ -459,16 +498,17 @@ cdef class Vocab:
|
||||||
DOCS: https://spacy.io/api/vocab#has_vector
|
DOCS: https://spacy.io/api/vocab#has_vector
|
||||||
"""
|
"""
|
||||||
if isinstance(orth, str):
|
if isinstance(orth, str):
|
||||||
orth = self.strings.add(orth)
|
orth = self.strings.add(orth, allow_transient=True)
|
||||||
cdef Lexeme lex = self[orth]
|
cdef Lexeme lex = self[orth]
|
||||||
key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
|
key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
|
||||||
return key in self.vectors
|
return key in self.vectors
|
||||||
|
|
||||||
property lookups:
|
@property
|
||||||
def __get__(self):
|
def lookups(self):
|
||||||
return self._lookups
|
return self._lookups
|
||||||
|
|
||||||
def __set__(self, lookups):
|
@lookups.setter
|
||||||
|
def lookups(self, lookups):
|
||||||
self._lookups = lookups
|
self._lookups = lookups
|
||||||
if lookups.has_table("lexeme_norm"):
|
if lookups.has_table("lexeme_norm"):
|
||||||
self.lex_attr_getters[NORM] = util.add_lookups(
|
self.lex_attr_getters[NORM] = util.add_lookups(
|
||||||
|
|
|
@ -46,10 +46,10 @@ as `Token.attr`, e.g. `token.dep`, while the string value can be retrieved by
|
||||||
appending `_` as in `token.dep_`.
|
appending `_` as in `token.dep_`.
|
||||||
|
|
||||||
| Attribute | Description |
|
| Attribute | Description |
|
||||||
| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `DEP` | The token's dependency label. ~~str~~ |
|
| `DEP` | The token's dependency label. ~~str~~ |
|
||||||
| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ |
|
| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ |
|
||||||
| `ENT_IOB` | The IOB part of the token's entity tag. Uses custom integer vaues rather than the string store: unset is `0`, `I` is `1`, `O` is `2`, and `B` is `3`. ~~str~~ |
|
| `ENT_IOB` | The IOB part of the token's entity tag. Uses custom integer values rather than the string store: unset is `0`, `I` is `1`, `O` is `2`, and `B` is `3`. ~~str~~ |
|
||||||
| `ENT_KB_ID` | The token's entity knowledge base ID. ~~str~~ |
|
| `ENT_KB_ID` | The token's entity knowledge base ID. ~~str~~ |
|
||||||
| `ENT_TYPE` | The token's entity label. ~~str~~ |
|
| `ENT_TYPE` | The token's entity label. ~~str~~ |
|
||||||
| `IS_ALPHA` | Token text consists of alphabetic characters. ~~bool~~ |
|
| `IS_ALPHA` | Token text consists of alphabetic characters. ~~bool~~ |
|
||||||
|
|
|
@ -567,7 +567,7 @@ New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL'
|
||||||
'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC'
|
'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC'
|
||||||
(1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338)
|
(1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338)
|
||||||
✔ Good amount of examples for all labels
|
✔ Good amount of examples for all labels
|
||||||
✔ Examples without occurences available for all labels
|
✔ Examples without occurrences available for all labels
|
||||||
✔ No entities consisting of or starting/ending with whitespace
|
✔ No entities consisting of or starting/ending with whitespace
|
||||||
|
|
||||||
=========================== Part-of-speech Tagging ===========================
|
=========================== Part-of-speech Tagging ===========================
|
||||||
|
@ -1320,7 +1320,7 @@ $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key]
|
||||||
|
|
||||||
## find-threshold {id="find-threshold",version="3.5",tag="command"}
|
## find-threshold {id="find-threshold",version="3.5",tag="command"}
|
||||||
|
|
||||||
Runs prediction trials for a trained model with varying tresholds to maximize
|
Runs prediction trials for a trained model with varying thresholds to maximize
|
||||||
the specified metric. The search space for the threshold is traversed linearly
|
the specified metric. The search space for the threshold is traversed linearly
|
||||||
from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout`
|
from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout`
|
||||||
(the corresponding API call to `spacy.cli.find_threshold.find_threshold()`
|
(the corresponding API call to `spacy.cli.find_threshold.find_threshold()`
|
||||||
|
|
|
@ -61,13 +61,13 @@ architectures and their arguments and hyperparameters.
|
||||||
| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
|
| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
|
||||||
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
|
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
|
||||||
| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
|
| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
|
||||||
| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ |
|
| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~ |
|
||||||
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
|
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
|
||||||
| `get_candidates_batch` <Tag variant="new">3.5</Tag> | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ |
|
| `get_candidates_batch` <Tag variant="new">3.5</Tag> | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ |
|
||||||
| `generate_empty_kb` <Tag variant="new">3.5.1</Tag> | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ |
|
| `generate_empty_kb` <Tag variant="new">3.5.1</Tag> | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ |
|
||||||
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
|
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
|
||||||
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
|
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
|
||||||
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
|
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
|
||||||
|
|
||||||
```python
|
```python
|
||||||
%%GITHUB_SPACY/spacy/pipeline/entity_linker.py
|
%%GITHUB_SPACY/spacy/pipeline/entity_linker.py
|
||||||
|
@ -101,7 +101,7 @@ custom knowledge base, you should either call
|
||||||
[`initialize`](/api/entitylinker#initialize) call.
|
[`initialize`](/api/entitylinker#initialize) call.
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||||
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ |
|
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ |
|
||||||
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
||||||
|
@ -114,7 +114,7 @@ custom knowledge base, you should either call
|
||||||
| `incl_context` | Whether or not to include the local context in the model. ~~bool~~ |
|
| `incl_context` | Whether or not to include the local context in the model. ~~bool~~ |
|
||||||
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
|
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
|
||||||
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
|
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
|
||||||
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
|
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
|
||||||
|
|
||||||
## EntityLinker.\_\_call\_\_ {id="call",tag="method"}
|
## EntityLinker.\_\_call\_\_ {id="call",tag="method"}
|
||||||
|
|
||||||
|
|
|
@ -173,7 +173,7 @@ happens automatically after the component has been added to the pipeline using
|
||||||
[`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized
|
[`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized
|
||||||
with `overwrite_ents=True`, existing entities will be replaced if they overlap
|
with `overwrite_ents=True`, existing entities will be replaced if they overlap
|
||||||
with the matches. When matches overlap in a Doc, the entity ruler prioritizes
|
with the matches. When matches overlap in a Doc, the entity ruler prioritizes
|
||||||
longer patterns over shorter, and if equal the match occuring first in the Doc
|
longer patterns over shorter, and if equal the match occurring first in the Doc
|
||||||
is chosen.
|
is chosen.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
|
|
|
@ -148,8 +148,9 @@ Whether a feature/value pair is in the analysis.
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ----------- | --------------------------------------------- |
|
| ------------ | --------------------------------------------------------------------- |
|
||||||
| **RETURNS** | A feature/value pair in the analysis. ~~str~~ |
|
| `feature` | A feature/value pair. ~~str~~ |
|
||||||
|
| **RETURNS** | Whether the feature/value pair is contained in the analysis. ~~bool~~ |
|
||||||
|
|
||||||
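A hedged example of the membership check summarised in the corrected table above (it assumes the small English model is installed; the exact features depend on the model's predictions):

```python
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("She sings.")
morph = doc[1].morph                  # MorphAnalysis for "sings"
print(morph)                          # e.g. Number=Sing|Person=3|Tense=Pres|VerbForm=Fin
print("Number=Sing" in morph)         # True if that feature/value pair was assigned
```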
### MorphAnalysis.\_\_iter\_\_ {id="morphanalysis-iter",tag="method"}
|
### MorphAnalysis.\_\_iter\_\_ {id="morphanalysis-iter",tag="method"}
|
||||||
|
|
||||||
|
|
|
@ -288,7 +288,7 @@ it – so no NP-level coordination, no prepositional phrases, and no relative
|
||||||
clauses.
|
clauses.
|
||||||
|
|
||||||
If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data)
|
If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data)
|
||||||
has not been implemeted for the given language, a `NotImplementedError` is
|
has not been implemented for the given language, a `NotImplementedError` is
|
||||||
raised.
|
raised.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
|
|
|
@ -416,7 +416,7 @@ by this class. Instances of this class are typically assigned to the
|
||||||
| `align` | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ |
|
| `align` | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ |
|
||||||
| `width` | The width of the last hidden layer. ~~int~~ |
|
| `width` | The width of the last hidden layer. ~~int~~ |
|
||||||
|
|
||||||
### TransformerData.empty {id="transformerdata-emoty",tag="classmethod"}
|
### TransformerData.empty {id="transformerdata-empty",tag="classmethod"}
|
||||||
|
|
||||||
Create an empty `TransformerData` container.
|
Create an empty `TransformerData` container.
|
||||||
|
|
||||||
|
|
|
@ -832,7 +832,7 @@ retrieve and add to them.
|
||||||
|
|
||||||
After creation, the component needs to be
|
After creation, the component needs to be
|
||||||
[initialized](/usage/training#initialization). This method can define the
|
[initialized](/usage/training#initialization). This method can define the
|
||||||
relevant labels in two ways: explicitely by setting the `labels` argument in the
|
relevant labels in two ways: explicitly by setting the `labels` argument in the
|
||||||
[`initialize` block](/api/data-formats#config-initialize) of the config, or
|
[`initialize` block](/api/data-formats#config-initialize) of the config, or
|
||||||
implicitly by deducing them from the `get_examples` callback that generates the
|
implicitly by deducing them from the `get_examples` callback that generates the
|
||||||
full **training data set**, or a representative sample.
|
full **training data set**, or a representative sample.
|
||||||
|
|
|
@ -1899,7 +1899,7 @@ the two words.
|
||||||
"Shore": ("coast", 0.732257),
|
"Shore": ("coast", 0.732257),
|
||||||
"Precautionary": ("caution", 0.490973),
|
"Precautionary": ("caution", 0.490973),
|
||||||
"hopelessness": ("sadness", 0.742366),
|
"hopelessness": ("sadness", 0.742366),
|
||||||
"Continous": ("continuous", 0.732549),
|
"Continuous": ("continuous", 0.732549),
|
||||||
"Disemboweled": ("corpse", 0.499432),
|
"Disemboweled": ("corpse", 0.499432),
|
||||||
"biostatistician": ("scientist", 0.339724),
|
"biostatistician": ("scientist", 0.339724),
|
||||||
"somewheres": ("somewheres", 0.402736),
|
"somewheres": ("somewheres", 0.402736),
|
||||||
|
|
|
@ -173,7 +173,7 @@ detected, a corresponding warning is displayed. If you'd like to disable the
|
||||||
dependency check, set `check_requirements: false` in your project's
|
dependency check, set `check_requirements: false` in your project's
|
||||||
`project.yml`.
|
`project.yml`.
|
||||||
|
|
||||||
### 4. Run a workflow {id="run-workfow"}
|
### 4. Run a workflow {id="run-workflow"}
|
||||||
|
|
||||||
> #### project.yml
|
> #### project.yml
|
||||||
>
|
>
|
||||||
|
@ -286,7 +286,7 @@ pipelines.
|
||||||
| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). |
|
| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). |
|
||||||
| `description` | An optional project description used in [auto-generated docs](#custom-docs). |
|
| `description` | An optional project description used in [auto-generated docs](#custom-docs). |
|
||||||
| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts and overriden on the CLI, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
|
| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts and overridden on the CLI, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
|
||||||
| `env` | A dictionary of variables, mapped to the names of environment variables that will be read in when running the project. For example, `${env.name}` will use the value of the environment variable defined as `name`. |
|
| `env` | A dictionary of variables, mapped to the names of environment variables that will be read in when running the project. For example, `${env.name}` will use the value of the environment variable defined as `name`. |
|
||||||
| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
|
| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
|
||||||
| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |
|
| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |
|
||||||
|
|
|
@ -306,7 +306,9 @@ installed in the same environment – that's it.
|
||||||
|
|
||||||
### Loading probability tables into existing models
|
### Loading probability tables into existing models
|
||||||
|
|
||||||
You can load a probability table from [spacy-lookups-data](https://github.com/explosion/spacy-lookups-data) into an existing spaCy model like `en_core_web_sm`.
|
You can load a probability table from
|
||||||
|
[spacy-lookups-data](https://github.com/explosion/spacy-lookups-data) into an
|
||||||
|
existing spaCy model like `en_core_web_sm`.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
# Requirements: pip install spacy-lookups-data
|
# Requirements: pip install spacy-lookups-data
|
||||||
|
@ -317,7 +319,8 @@ lookups = load_lookups("en", ["lexeme_prob"])
|
||||||
nlp.vocab.lookups.add_table("lexeme_prob", lookups.get_table("lexeme_prob"))
|
nlp.vocab.lookups.add_table("lexeme_prob", lookups.get_table("lexeme_prob"))
|
||||||
```
|
```
|
||||||
|
|
||||||
When training a model from scratch you can also specify probability tables in the `config.cfg`.
|
When training a model from scratch you can also specify probability tables in
|
||||||
|
the `config.cfg`.
|
||||||
|
|
||||||
```ini {title="config.cfg (excerpt)"}
|
```ini {title="config.cfg (excerpt)"}
|
||||||
[initialize.lookups]
|
[initialize.lookups]
|
||||||
|
@ -346,8 +349,8 @@ them**!
|
||||||
To stick with the theme of
|
To stick with the theme of
|
||||||
[this entry points blog post](https://amir.rachum.com/blog/2017/07/28/python-entry-points/),
|
[this entry points blog post](https://amir.rachum.com/blog/2017/07/28/python-entry-points/),
|
||||||
consider the following custom spaCy
|
consider the following custom spaCy
|
||||||
[pipeline component](/usage/processing-pipelines#custom-coponents) that prints a
|
[pipeline component](/usage/processing-pipelines#custom-components) that prints
|
||||||
snake when it's called:
|
a snake when it's called:
|
||||||
|
|
||||||
> #### Package directory structure
|
> #### Package directory structure
|
||||||
>
|
>
|
||||||
|
|
|
@ -185,7 +185,7 @@ New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL'
|
||||||
'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC'
|
'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC'
|
||||||
(1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338)
|
(1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338)
|
||||||
✔ Good amount of examples for all labels
|
✔ Good amount of examples for all labels
|
||||||
✔ Examples without occurences available for all labels
|
✔ Examples without occurrences available for all labels
|
||||||
✔ No entities consisting of or starting/ending with whitespace
|
✔ No entities consisting of or starting/ending with whitespace
|
||||||
|
|
||||||
=========================== Part-of-speech Tagging ===========================
|
=========================== Part-of-speech Tagging ===========================
|
||||||
|
|
|
@ -138,7 +138,7 @@ backwards compatibility, the tuple format remains available under
|
||||||
`TransformerData.tensors` and `FullTransformerBatch.tensors`. See more details
|
`TransformerData.tensors` and `FullTransformerBatch.tensors`. See more details
|
||||||
in the [transformer API docs](/api/architectures#TransformerModel).
|
in the [transformer API docs](/api/architectures#TransformerModel).
|
||||||
|
|
||||||
`spacy-transfomers` v1.1 also adds support for `transformer_config` settings
|
`spacy-transformers` v1.1 also adds support for `transformer_config` settings
|
||||||
such as `output_attentions`. Additional output is stored under
|
such as `output_attentions`. Additional output is stored under
|
||||||
`TransformerData.model_output`. More details are in the
|
`TransformerData.model_output`. More details are in the
|
||||||
[TransformerModel docs](/api/architectures#TransformerModel). The training speed
|
[TransformerModel docs](/api/architectures#TransformerModel). The training speed
|
||||||
|
|
|
@ -31,6 +31,12 @@
|
||||||
"name": "Bengali",
|
"name": "Bengali",
|
||||||
"has_examples": true
|
"has_examples": true
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"code": "bo",
|
||||||
|
"name": "Tibetan",
|
||||||
|
"example": "འདི་ཚིག་གྲུབ་རེད།",
|
||||||
|
"has_examples": true
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"code": "ca",
|
"code": "ca",
|
||||||
"name": "Catalan",
|
"name": "Catalan",
|
||||||
|
@ -480,6 +486,12 @@
|
||||||
],
|
],
|
||||||
"example": "这是一个用于示例的句子。",
|
"example": "这是一个用于示例的句子。",
|
||||||
"has_examples": true
|
"has_examples": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"code": "kmr",
|
||||||
|
"name": "Kurdish Kurmanji",
|
||||||
|
"example": "Ev hevokek e",
|
||||||
|
"has_examples": true
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"licenses": [
|
"licenses": [
|
||||||
|
|
File diff suppressed because it is too large
|
@ -58,8 +58,8 @@ const AlertSpace = ({ nightly, legacy }) => {
|
||||||
}
|
}
|
||||||
|
|
||||||
const navAlert = (
|
const navAlert = (
|
||||||
<Link to="https://form.typeform.com/to/WlflqP1b" noLinkLayout>
|
<Link to="https://explosion.ai/blog/sp-global-commodities" noLinkLayout>
|
||||||
💥 Interested in <strong>Premium spaCy Models</strong>?
|
💥 <strong>New:</strong> Case study with S&P Global
|
||||||
</Link>
|
</Link>
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|