Merge branch 'master' into pr/13418

Ines Montani 2024-09-10 14:27:01 +02:00
commit 37dd13a96b
69 changed files with 5515 additions and 1063 deletions

.github/workflows/cibuildwheel.yml (new file, 92 lines)

@ -0,0 +1,92 @@
name: Build
on:
push:
tags:
# ytf did they invent their own syntax that's almost regex?
# ** matches 'zero or more of any character'
- 'release-v[0-9]+.[0-9]+.[0-9]+**'
- 'prerelease-v[0-9]+.[0-9]+.[0-9]+**'
jobs:
build_wheels:
name: Build wheels on ${{ matrix.os }}
runs-on: ${{ matrix.os }}
strategy:
matrix:
# macos-13 is an intel runner, macos-14 is apple silicon
os: [ubuntu-latest, windows-latest, macos-13]
steps:
- uses: actions/checkout@v4
- name: Build wheels
uses: pypa/cibuildwheel@v2.19.1
env:
CIBW_SOME_OPTION: value
with:
package-dir: .
output-dir: wheelhouse
config-file: "{package}/pyproject.toml"
- uses: actions/upload-artifact@v4
with:
name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }}
path: ./wheelhouse/*.whl
build_sdist:
name: Build source distribution
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Build sdist
run: pipx run build --sdist
- uses: actions/upload-artifact@v4
with:
name: cibw-sdist
path: dist/*.tar.gz
create_release:
needs: [build_wheels, build_sdist]
runs-on: ubuntu-latest
permissions:
contents: write
checks: write
actions: read
issues: read
packages: write
pull-requests: read
repository-projects: read
statuses: read
steps:
- name: Get the tag name and determine if it's a prerelease
id: get_tag_info
run: |
FULL_TAG=${GITHUB_REF#refs/tags/}
if [[ $FULL_TAG == release-* ]]; then
TAG_NAME=${FULL_TAG#release-}
IS_PRERELEASE=false
elif [[ $FULL_TAG == prerelease-* ]]; then
TAG_NAME=${FULL_TAG#prerelease-}
IS_PRERELEASE=true
else
echo "Tag does not match expected patterns" >&2
exit 1
fi
echo "FULL_TAG=$TAG_NAME" >> $GITHUB_ENV
echo "TAG_NAME=$TAG_NAME" >> $GITHUB_ENV
echo "IS_PRERELEASE=$IS_PRERELEASE" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
# unpacks all CIBW artifacts into dist/
pattern: cibw-*
path: dist
merge-multiple: true
- name: Create Draft Release
id: create_release
uses: softprops/action-gh-release@v2
if: startsWith(github.ref, 'refs/tags/')
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
name: ${{ env.TAG_NAME }}
draft: true
prerelease: ${{ env.IS_PRERELEASE }}
files: "./dist/*"
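Not part of the diff: the get_tag_info step above does its tag parsing in shell. A hedged Python rendering of the same logic, with an illustrative function name and example tags, for readers tracing the release flow:

def parse_release_tag(github_ref):
    # github_ref is assumed to look like "refs/tags/release-v3.8.0" (Python 3.9+ for removeprefix)
    full_tag = github_ref.removeprefix("refs/tags/")
    if full_tag.startswith("release-"):
        return full_tag.removeprefix("release-"), False   # (tag name, is_prerelease)
    if full_tag.startswith("prerelease-"):
        return full_tag.removeprefix("prerelease-"), True
    raise ValueError("Tag does not match expected patterns")

assert parse_release_tag("refs/tags/release-v3.8.0") == ("v3.8.0", False)
assert parse_release_tag("refs/tags/prerelease-v3.8.0.dev0") == ("v3.8.0.dev0", True)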


@ -15,7 +15,7 @@ jobs:
env:
GITHUB_CONTEXT: ${{ toJson(github) }}
run: echo "$GITHUB_CONTEXT"
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- uses: actions/setup-python@v4
- name: Install and run explosion-bot
run: |


@ -16,7 +16,7 @@ jobs:
if: github.repository_owner == 'explosion'
runs-on: ubuntu-latest
steps:
- uses: dessant/lock-threads@v4
- uses: dessant/lock-threads@v5
with:
process-only: 'issues'
issue-inactive-days: '30'

.github/workflows/publish_pypi.yml (new file, 29 lines)

@ -0,0 +1,29 @@
# The cibuildwheel action triggers on creation of a release; this
# triggers on publication.
# The expected workflow is to create a draft release and let the wheels
# upload, and then hit 'publish', which uploads to PyPI.
on:
release:
types:
- published
jobs:
upload_pypi:
runs-on: ubuntu-latest
environment:
name: pypi
url: https://pypi.org/p/spacy
permissions:
id-token: write
contents: read
if: github.event_name == 'release' && github.event.action == 'published'
# or, alternatively, upload to PyPI on every tag starting with 'v' (remove on: release above to use this)
# if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
steps:
- uses: robinraju/release-downloader@v1
with:
tag: ${{ github.event.release.tag_name }}
fileName: '*'
out-file-path: 'dist'
- uses: pypa/gh-action-pypi-publish@release/v1


@ -14,7 +14,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
ref: ${{ matrix.branch }}
- name: Get commits from past 24 hours


@ -18,7 +18,7 @@ jobs:
run: |
echo "$GITHUB_CONTEXT"
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- uses: actions/setup-python@v4
with:
python-version: '3.10'


@ -25,13 +25,12 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Check out repo
uses: actions/checkout@v3
uses: actions/checkout@v4
- name: Configure Python version
uses: actions/setup-python@v4
with:
python-version: "3.7"
architecture: x64
- name: black
run: |
@ -75,13 +74,12 @@ jobs:
steps:
- name: Check out repo
uses: actions/checkout@v3
uses: actions/checkout@v4
- name: Configure Python version
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python_version }}
architecture: x64
- name: Install dependencies
run: |


@ -20,13 +20,12 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Check out repo
uses: actions/checkout@v3
uses: actions/checkout@v4
- name: Configure Python version
uses: actions/setup-python@v4
with:
python-version: "3.7"
architecture: x64
- name: Validate website/meta/universe.json
run: |


@ -1,6 +1,6 @@
The MIT License (MIT)
Copyright (C) 2016-2023 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
Copyright (C) 2016-2024 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal


@ -11,5 +11,58 @@ requires = [
]
build-backend = "setuptools.build_meta"
[tool.cibuildwheel]
build = "*"
skip = "pp* cp36* cp37* cp38* *-win32"
test-skip = ""
free-threaded-support = false
archs = ["native"]
build-frontend = "default"
config-settings = {}
dependency-versions = "pinned"
environment = { PIP_CONSTRAINT = "build-constraints.txt" }
environment-pass = []
build-verbosity = 0
before-all = "curl https://sh.rustup.rs -sSf | sh -s -- -y --profile minimal --default-toolchain stable"
before-build = "pip install -r requirements.txt && python setup.py clean"
repair-wheel-command = ""
test-command = ""
before-test = ""
test-requires = []
test-extras = []
container-engine = "docker"
manylinux-x86_64-image = "manylinux2014"
manylinux-i686-image = "manylinux2014"
manylinux-aarch64-image = "manylinux2014"
manylinux-ppc64le-image = "manylinux2014"
manylinux-s390x-image = "manylinux2014"
manylinux-pypy_x86_64-image = "manylinux2014"
manylinux-pypy_i686-image = "manylinux2014"
manylinux-pypy_aarch64-image = "manylinux2014"
musllinux-x86_64-image = "musllinux_1_2"
musllinux-i686-image = "musllinux_1_2"
musllinux-aarch64-image = "musllinux_1_2"
musllinux-ppc64le-image = "musllinux_1_2"
musllinux-s390x-image = "musllinux_1_2"
[tool.cibuildwheel.linux]
repair-wheel-command = "auditwheel repair -w {dest_dir} {wheel}"
[tool.cibuildwheel.macos]
repair-wheel-command = "delocate-wheel --require-archs {delocate_archs} -w {dest_dir} -v {wheel}"
[tool.cibuildwheel.windows]
[tool.cibuildwheel.pyodide]
[tool.isort]
profile = "black"


@ -9,7 +9,7 @@ murmurhash>=0.28.0,<1.1.0
wasabi>=0.9.1,<1.2.0
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
typer>=0.3.0,<0.10.0
typer>=0.3.0,<1.0.0
weasel>=0.1.0,<0.5.0
# Third party dependencies
numpy>=1.15.0; python_version < "3.9"
@ -22,7 +22,6 @@ langcodes>=3.2.0,<4.0.0
# Official Python utilities
setuptools
packaging>=20.0
typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
# Development dependencies
pre-commit>=2.13.0
cython>=0.25,<3.0


@ -22,6 +22,7 @@ classifiers =
Programming Language :: Python :: 3.9
Programming Language :: Python :: 3.10
Programming Language :: Python :: 3.11
Programming Language :: Python :: 3.12
Topic :: Scientific/Engineering
project_urls =
Release notes = https://github.com/explosion/spaCy/releases
@ -55,7 +56,7 @@ install_requires =
catalogue>=2.0.6,<2.1.0
weasel>=0.1.0,<0.5.0
# Third-party dependencies
typer>=0.3.0,<0.10.0
typer>=0.3.0,<1.0.0
tqdm>=4.38.0,<5.0.0
numpy>=1.15.0; python_version < "3.9"
numpy>=1.19.0; python_version >= "3.9"
@ -65,7 +66,6 @@ install_requires =
# Official Python utilities
setuptools
packaging>=20.0
typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
langcodes>=3.2.0,<4.0.0
[options.entry_points]


@ -1,5 +1,5 @@
# fmt: off
__title__ = "spacy"
__version__ = "3.7.4"
__version__ = "3.8.0.dev0"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"


@ -39,7 +39,7 @@ def find_threshold_cli(
# fmt: on
):
"""
Runs prediction trials for a trained model with varying tresholds to maximize
Runs prediction trials for a trained model with varying thresholds to maximize
the specified metric. The search space for the threshold is traversed linearly
from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout`
(the corresponding API call to `spacy.cli.find_threshold.find_threshold()`
@ -81,7 +81,7 @@ def find_threshold(
silent: bool = True,
) -> Tuple[float, float, Dict[float, float]]:
"""
Runs prediction trials for models with varying tresholds to maximize the specified metric.
Runs prediction trials for models with varying thresholds to maximize the specified metric.
model (Union[str, Path]): Pipeline to evaluate. Can be a package or a path to a data directory.
data_path (Path): Path to file with DocBin with docs to use for threshold search.
pipe_name (str): Name of pipe to examine thresholds for.

spacy/lang/bo/__init__.py (new file, 16 lines)

@ -0,0 +1,16 @@
from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
class TibetanDefaults(BaseDefaults):
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Tibetan(Language):
lang = "bo"
Defaults = TibetanDefaults
__all__ = ["Tibetan"]

spacy/lang/bo/examples.py (new file, 16 lines)

@ -0,0 +1,16 @@
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.bo.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"དོན་དུ་རྒྱ་མཚོ་བླ་མ་ཞེས་བྱ་ཞིང༌།",
"ཏཱ་ལའི་ཞེས་པ་ནི་སོག་སྐད་ཡིན་པ་དེ་བོད་སྐད་དུ་རྒྱ་མཚོའི་དོན་དུ་འཇུག",
"སོག་པོ་ཨལ་ཐན་རྒྱལ་པོས་རྒྱལ་དབང་བསོད་ནམས་རྒྱ་མཚོར་ཆེ་བསྟོད་ཀྱི་མཚན་གསོལ་བ་ཞིག་ཡིན་ཞིང༌།",
"རྗེས་སུ་རྒྱལ་བ་དགེ་འདུན་གྲུབ་དང༌། དགེ་འདུན་རྒྱ་མཚོ་སོ་སོར་ཡང་ཏཱ་ལའི་བླ་མའི་སྐུ་ཕྲེང་དང་པོ་དང༌།",
"གཉིས་པའི་མཚན་དེ་གསོལ་ཞིང༌།༸རྒྱལ་དབང་སྐུ་ཕྲེང་ལྔ་པས་དགའ་ལྡན་ཕོ་བྲང་གི་སྲིད་དབང་བཙུགས་པ་ནས་ཏཱ་ལའི་བླ་མ་ནི་བོད་ཀྱི་ཆོས་སྲིད་གཉིས་ཀྱི་དབུ་ཁྲིད་དུ་གྱུར་ཞིང་།",
"ད་ལྟའི་བར་ཏཱ་ལའི་བླ་མ་སྐུ་ཕྲེང་བཅུ་བཞི་བྱོན་ཡོད།",
]


@ -0,0 +1,65 @@
from ...attrs import LIKE_NUM
# reference 1: https://en.wikipedia.org/wiki/Tibetan_numerals
_num_words = [
"ཀླད་ཀོར་",
"གཅིག་",
"གཉིས་",
"གསུམ་",
"བཞི་",
"ལྔ་",
"དྲུག་",
"བདུན་",
"བརྒྱད་",
"དགུ་",
"བཅུ་",
"བཅུ་གཅིག་",
"བཅུ་གཉིས་",
"བཅུ་གསུམ་",
"བཅུ་བཞི་",
"བཅུ་ལྔ་",
"བཅུ་དྲུག་",
"བཅུ་བདུན་",
"བཅུ་པརྒྱད",
"བཅུ་དགུ་",
"ཉི་ཤུ་",
"སུམ་ཅུ",
"བཞི་བཅུ",
"ལྔ་བཅུ",
"དྲུག་ཅུ",
"བདུན་ཅུ",
"བརྒྱད་ཅུ",
"དགུ་བཅུ",
"བརྒྱ་",
"སྟོང་",
"ཁྲི་",
"ས་ཡ་",
" བྱེ་བ་",
"དུང་ཕྱུར་",
"ཐེར་འབུམ་",
"ཐེར་འབུམ་ཆེན་པོ་",
"ཁྲག་ཁྲིག་",
"ཁྲག་ཁྲིག་ཆེན་པོ་",
]
def like_num(text):
"""
Check if text resembles a number
"""
if text.startswith(("+", "-", "±", "~")):
text = text[1:]
text = text.replace(",", "").replace(".", "")
if text.isdigit():
return True
if text.count("/") == 1:
num, denom = text.split("/")
if num.isdigit() and denom.isdigit():
return True
if text in _num_words:
return True
return False
LEX_ATTRS = {LIKE_NUM: like_num}
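A short sketch exercising like_num directly with forms from the _num_words list above (the negative example is taken from the test file later in this diff):

from spacy.lang.bo.lex_attrs import like_num

assert like_num("10")
assert like_num("གཅིག་")       # "one", listed in _num_words
assert like_num("བཅུ་གཅིག་")   # "eleven"
assert not like_num("ཁྱི་")     # not a numeral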

spacy/lang/bo/stop_words.py (new file, 198 lines)

@ -0,0 +1,198 @@
# Source: https://zenodo.org/records/10148636
STOP_WORDS = set(
"""
གས
མས
འད
པས
གཞན
དང
གས
བཅས
ངས
ལས
ཙམ
ཡང
མཐའདག
འད
རང
ངམ
དག
འང
ལགས
ཚང
ཐམསཅད
དམ
འམ
བས
ལགས
གས
མས
བམ
ནམ
ནམ
ངམ
འགའ
ཤས
གམ
ལགས
ཅང
འགའ
སམ
འང
ལས
འཕ
བར
དང
འག
སམ
ཟད
འམ
མམ
དམ
དག
ལམ
ནང
ཙམ
རམ
ཨང
གས
ལགས
པས
རབ
རམ
བས
གཞན
འབའ
གམ
བམ
ཙམ
མམ
ཏམ
ཏམ
ཤས
""".split()
)

spacy/lang/gd/__init__.py (new file, 18 lines)

@ -0,0 +1,18 @@
from typing import Optional
from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class ScottishDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
stop_words = STOP_WORDS
class Scottish(Language):
lang = "gd"
Defaults = ScottishDefaults
__all__ = ["Scottish"]

spacy/lang/gd/stop_words.py (new file, 388 lines)

@ -0,0 +1,388 @@
STOP_WORDS = set(
"""
'ad
'ar
'd # iad
'g # ag
'ga
'gam
'gan
'gar
'gur
'm # am
'n # an
'n seo
'na
'nad
'nam
'nan
'nar
'nuair
'nur
's
'sa
'san
'sann
'se
'sna
a
a'
a'd # agad
a'm # agam
a-chèile
a-seo
a-sin
a-siud
a chionn
a chionn 's
a chèile
a chéile
a dh'
a h-uile
a seo
ac' # aca
aca
aca-san
acasan
ach
ag
agad
agad-sa
agads'
agadsa
agaibh
agaibhse
againn
againne
agam
agam-sa
agams'
agamsa
agus
aice
aice-se
aicese
aig
aig' # aige
aige
aige-san
aigesan
air
air-san
air neo
airsan
am
an
an seo
an sin
an siud
an uair
ann
ann a
ann a'
ann a shin
ann am
ann an
annad
annam
annam-s'
annamsa
anns
anns an
annta
aon
ar
as
asad
asda
asta
b'
bho
bhon
bhuaidhe # bhuaithe
bhuainn
bhuaipe
bhuaithe
bhuapa
bhur
brì
bu
c'à
car son
carson
cha
chan
chionn
choir
chon
chun
chèile
chéile
chòir
cia mheud
ciamar
co-dhiubh
cuide
cuin
cuin'
cuine
'
càil
càit
càit'
càite
mheud
d'
da
de
dh'
dha
dhaibh
dhaibh-san
dhaibhsan
dhan
dhasan
dhe
dhen
dheth
dhi
dhiom
dhiot
dhith
dhiubh
dhomh
dhomh-s'
dhomhsa
dhu'sa # dhut-sa
dhuibh
dhuibhse
dhuinn
dhuinne
dhuit
dhut
dhutsa
dhut-sa
dhà
dhà-san
dhàsan
dhòmhsa
diubh
do
docha
don
mar
mar
dòch'
dòcha
e
eadar
eatarra
eatorra
eile
esan
fa
far
feud
fhad
fheudar
fhearr
fhein
fheudar
fheàrr
fhèin
fhéin
fhìn
fo
fodha
fodhainn
foipe
fon
fèin
ga
gach
gam
gan
ge brith
ged
gu
gu
gu ruige
gun
gur
gus
i
iad
iadsan
innte
is
ise
le
leam
leam-sa
leamsa
leat
leat-sa
leatha
leatsa
leibh
leis
leis-san
leoth'
leotha
leotha-san
linn
m'
m'a
ma
mac
man
mar
mas
mathaid
mi
mis'
mise
mo
mu
mu 'n
mun
mur
mura
mus
na
na b'
na bu
na iad
nach
nad
nam
nan
nar
nas
neo
no
nuair
o
o'n
oir
oirbh
oirbh-se
oirnn
oirnne
oirre
on
orm
orm-sa
ormsa
orra
orra-san
orrasan
ort
os
r'
ri
ribh
rinn
ris
rithe
rithe-se
rium
rium-sa
riums'
riumsa
riut
riuth'
riutha
riuthasan
ro
ro'n
roimh
roimhe
romhainn
romham
romhpa
ron
ruibh
ruinn
ruinne
sa
san
sann
se
seach
seo
seothach
shin
sibh
sibh-se
sibhse
sin
sineach
sinn
sinne
siod
siodach
siud
siudach
sna # ann an
t'
tarsaing
tarsainn
tarsuinn
thar
thoigh
thro
thu
thuc'
thuca
thugad
thugaibh
thugainn
thugam
thugamsa
thuice
thuige
thus'
thusa
timcheall
toigh
toil
tro
tro' # troimh
troimh
troimhe
tron
tu
tusa
uair
ud
ugaibh
ugam-s'
ugam-sa
uice
uige
uige-san
umad
unnta # ann an
ur
urrainn
à
às
àsan
á
ás
è
ì
ò
ó
""".split(
"\n"
)
)

File diff suppressed because it is too large


@ -0,0 +1,16 @@
from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
class KurmanjiDefaults(BaseDefaults):
stop_words = STOP_WORDS
lex_attr_getters = LEX_ATTRS
class Kurmanji(Language):
lang = "kmr"
Defaults = KurmanjiDefaults
__all__ = ["Kurmanji"]


@ -0,0 +1,17 @@
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.kmr.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Berê mirovan her tim li geşedana pêşerojê ye", # People's gaze is always on the development of the future
"Kawa Nemir di 14 salan de Ulysses wergerand Kurmancî.", # Kawa Nemir translated Ulysses into Kurmanji in 14 years.
"Mem Ararat hunermendekî Kurd yê bi nav û deng e.", # Mem Ararat is a famous Kurdish artist
"Firat Cewerî 40 sal e pirtûkên Kurdî dinivîsîne.", # Firat Ceweri has been writing Kurdish books for 40 years
"Rojnamegerê ciwan nûçeyeke balkêş li ser rewşa aborî nivîsand", # The young journalist wrote an interesting news article about the economic situation
"Sektora çandiniyê beşeke giring a belavkirina gaza serayê li seranserê cîhanê pêk tîne", # The agricultural sector constitutes an important part of greenhouse gas emissions worldwide
"Xwendekarên jêhatî di pêşbaziya matematîkê de serkeftî bûn", # Talented students succeeded in the mathematics competition
"Ji ber ji tunebûnê bavê min xwişkeke min nedan xwendin ew ji min re bû derd û kulek.", # Because of poverty, my father didn't send my sister to school, which became a pain and sorrow for me
]

spacy/lang/kmr/lex_attrs.py (new file, 138 lines)

@ -0,0 +1,138 @@
from ...attrs import LIKE_NUM
_num_words = [
"sifir",
"yek",
"du",
"",
"çar",
"pênc",
"şeş",
"heft",
"heşt",
"neh",
"deh",
"yazde",
"dazde",
"sêzde",
"çarde",
"pazde",
"şazde",
"hevde",
"hejde",
"nozde",
"bîst",
"",
"çil",
"pêncî",
"şêst",
"heftê",
"heştê",
"nod",
"sed",
"hezar",
"milyon",
"milyar",
]
_ordinal_words = [
"yekem",
"yekemîn",
"duyem",
"duyemîn",
"sêyem",
"sêyemîn",
"çarem",
"çaremîn",
"pêncem",
"pêncemîn",
"şeşem",
"şeşemîn",
"heftem",
"heftemîn",
"heştem",
"heştemîn",
"nehem",
"nehemîn",
"dehem",
"dehemîn",
"yazdehem",
"yazdehemîn",
"dazdehem",
"dazdehemîn",
"sêzdehem",
"sêzdehemîn",
"çardehem",
"çardehemîn",
"pazdehem",
"pazdehemîn",
"şanzdehem",
"şanzdehemîn",
"hevdehem",
"hevdehemîn",
"hejdehem",
"hejdehemîn",
"nozdehem",
"nozdehemîn",
"bîstem",
"bîstemîn",
"sîyem",
"sîyemîn",
"çilem",
"çilemîn",
"pêncîyem",
"pênciyemîn",
"şêstem",
"şêstemîn",
"heftêyem",
"heftêyemîn",
"heştêyem",
"heştêyemîn",
"notem",
"notemîn",
"sedem",
"sedemîn",
"hezarem",
"hezaremîn",
"milyonem",
"milyonemîn",
"milyarem",
"milyaremîn",
]
def like_num(text):
if text.startswith(("+", "-", "±", "~")):
text = text[1:]
text = text.replace(",", "").replace(".", "")
if text.isdigit():
return True
if text.count("/") == 1:
num, denom = text.split("/")
if num.isdigit() and denom.isdigit():
return True
text_lower = text.lower()
if text_lower in _num_words:
return True
# Check ordinal number
if text_lower in _ordinal_words:
return True
if is_digit(text_lower):
return True
return False
def is_digit(text):
endings = ("em", "yem", "emîn", "yemîn")
for ending in endings:
to = len(ending)
if text.endswith(ending) and text[:-to].isdigit():
return True
return False
LEX_ATTRS = {LIKE_NUM: like_num}
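A quick sketch of how like_num and the is_digit helper above handle Kurmanji ordinals (forms taken from the test file later in this diff):

from spacy.lang.kmr.lex_attrs import like_num

assert like_num("yekem")       # listed in _ordinal_words
assert like_num("100em")       # digits plus an ordinal ending, handled by is_digit()
assert like_num("20emîn")
assert not like_num("pirtûk")  # an ordinary word from the example sentences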


@ -0,0 +1,44 @@
STOP_WORDS = set(
"""
û
li
bi
di
da
de
ji
ku
ew
ez
tu
em
hûn
ew
ev
min
te
me
we
wan
va
çi
çawa
çima
kengî
li ku
çend
çiqas
her
hin
gelek
hemû
kes
tişt
""".split()
)


@ -24,13 +24,6 @@ class MacedonianDefaults(BaseDefaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
@classmethod
def create_lemmatizer(cls, nlp=None, lookups=None):
if lookups is None:
lookups = Lookups()
return MacedonianLemmatizer(lookups)
class Macedonian(Language):
lang = "mk"
Defaults = MacedonianDefaults


@ -5,7 +5,7 @@ import multiprocessing as mp
import random
import traceback
import warnings
from contextlib import contextmanager
from contextlib import ExitStack, contextmanager
from copy import deepcopy
from dataclasses import dataclass
from itertools import chain, cycle
@ -31,6 +31,7 @@ from typing import (
)
import srsly
from cymem.cymem import Pool
from thinc.api import Config, CupyOps, Optimizer, get_current_ops
from . import about, ty, util
@ -2091,6 +2092,38 @@ class Language:
util.replace_model_node(pipe.model, listener, new_model) # type: ignore[attr-defined]
tok2vec.remove_listener(listener, pipe_name)
@contextmanager
def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]:
"""Begin a block where all resources allocated during the block will
be freed at the end of it. If a resource was created within the
memory zone block, accessing it outside the block is invalid.
Behaviour of this invalid access is undefined. Memory zones should
not be nested.
The memory zone is helpful for services that need to process large
volumes of text with a defined memory budget.
Example
-------
>>> with nlp.memory_zone():
... for doc in nlp.pipe(texts):
... process_my_doc(doc)
>>> # use_doc(doc) <-- Invalid: doc was allocated in the memory zone
"""
if mem is None:
mem = Pool()
# The ExitStack allows programmatic nested context managers.
# We don't know how many we need, so it would be awkward to have
# them as nested blocks.
with ExitStack() as stack:
contexts = [stack.enter_context(self.vocab.memory_zone(mem))]
if hasattr(self.tokenizer, "memory_zone"):
contexts.append(stack.enter_context(self.tokenizer.memory_zone(mem)))
for _, pipe in self.pipeline:
if hasattr(pipe, "memory_zone"):
contexts.append(stack.enter_context(pipe.memory_zone(mem)))
yield mem
def to_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
) -> None:
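Not part of the diff: a hedged usage sketch of the new Language.memory_zone context manager added above (pipeline and texts are illustrative):

import spacy

nlp = spacy.blank("en")
texts = ["First document.", "Second document."]

with nlp.memory_zone():
    for doc in nlp.pipe(texts):
        # consume results inside the zone; transient vocab and string
        # entries allocated here are freed when the block exits
        print(doc[0].text)
# per the docstring above, keeping and using a doc created inside the
# zone after this point is undefined behaviour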


@ -164,45 +164,48 @@ cdef class Lexeme:
vector = self.vector
return numpy.sqrt((vector**2).sum())
property vector:
@property
def vector(self):
"""A real-valued meaning representation.
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
representing the lexeme's semantics.
"""
def __get__(self):
cdef int length = self.vocab.vectors_length
if length == 0:
raise ValueError(Errors.E010)
return self.vocab.get_vector(self.c.orth)
cdef int length = self.vocab.vectors_length
if length == 0:
raise ValueError(Errors.E010)
return self.vocab.get_vector(self.c.orth)
def __set__(self, vector):
if len(vector) != self.vocab.vectors_length:
raise ValueError(Errors.E073.format(new_length=len(vector),
length=self.vocab.vectors_length))
self.vocab.set_vector(self.c.orth, vector)
@vector.setter
def vector(self, vector):
if len(vector) != self.vocab.vectors_length:
raise ValueError(Errors.E073.format(new_length=len(vector),
length=self.vocab.vectors_length))
self.vocab.set_vector(self.c.orth, vector)
property rank:
@property
def rank(self):
"""RETURNS (str): Sequential ID of the lexeme's lexical type, used
to index into tables, e.g. for word vectors."""
def __get__(self):
return self.c.id
return self.c.id
def __set__(self, value):
self.c.id = value
@rank.setter
def rank(self, value):
self.c.id = value
property sentiment:
@property
def sentiment(self):
"""RETURNS (float): A scalar value indicating the positivity or
negativity of the lexeme."""
def __get__(self):
sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {})
return sentiment_table.get(self.c.orth, 0.0)
sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {})
return sentiment_table.get(self.c.orth, 0.0)
def __set__(self, float x):
if "lexeme_sentiment" not in self.vocab.lookups:
self.vocab.lookups.add_table("lexeme_sentiment")
sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment")
sentiment_table[self.c.orth] = x
@sentiment.setter
def sentiment(self, float x):
if "lexeme_sentiment" not in self.vocab.lookups:
self.vocab.lookups.add_table("lexeme_sentiment")
sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment")
sentiment_table[self.c.orth] = x
@property
def orth_(self):
@ -216,306 +219,338 @@ cdef class Lexeme:
"""RETURNS (str): The original verbatim text of the lexeme."""
return self.orth_
property lower:
@property
def lower(self):
"""RETURNS (uint64): Lowercase form of the lexeme."""
def __get__(self):
return self.c.lower
return self.c.lower
def __set__(self, attr_t x):
self.c.lower = x
@lower.setter
def lower(self, attr_t x):
self.c.lower = x
property norm:
@property
def norm(self):
"""RETURNS (uint64): The lexeme's norm, i.e. a normalised form of the
lexeme text.
"""
def __get__(self):
return self.c.norm
return self.c.norm
def __set__(self, attr_t x):
if "lexeme_norm" not in self.vocab.lookups:
self.vocab.lookups.add_table("lexeme_norm")
norm_table = self.vocab.lookups.get_table("lexeme_norm")
norm_table[self.c.orth] = self.vocab.strings[x]
self.c.norm = x
@norm.setter
def norm(self, attr_t x):
if "lexeme_norm" not in self.vocab.lookups:
self.vocab.lookups.add_table("lexeme_norm")
norm_table = self.vocab.lookups.get_table("lexeme_norm")
norm_table[self.c.orth] = self.vocab.strings[x]
self.c.norm = x
property shape:
@property
def shape(self):
"""RETURNS (uint64): Transform of the word's string, to show
orthographic features.
"""
def __get__(self):
return self.c.shape
return self.c.shape
def __set__(self, attr_t x):
self.c.shape = x
@shape.setter
def shape(self, attr_t x):
self.c.shape = x
property prefix:
@property
def prefix(self):
"""RETURNS (uint64): Length-N substring from the start of the word.
Defaults to `N=1`.
"""
def __get__(self):
return self.c.prefix
return self.c.prefix
def __set__(self, attr_t x):
self.c.prefix = x
@prefix.setter
def prefix(self, attr_t x):
self.c.prefix = x
property suffix:
@property
def suffix(self):
"""RETURNS (uint64): Length-N substring from the end of the word.
Defaults to `N=3`.
"""
def __get__(self):
return self.c.suffix
return self.c.suffix
def __set__(self, attr_t x):
self.c.suffix = x
@suffix.setter
def suffix(self, attr_t x):
self.c.suffix = x
property cluster:
@property
def cluster(self):
"""RETURNS (int): Brown cluster ID."""
def __get__(self):
cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
return cluster_table.get(self.c.orth, 0)
cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
return cluster_table.get(self.c.orth, 0)
def __set__(self, int x):
cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
cluster_table[self.c.orth] = x
@cluster.setter
def cluster(self, int x):
cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {})
cluster_table[self.c.orth] = x
property lang:
@property
def lang(self):
"""RETURNS (uint64): Language of the parent vocabulary."""
def __get__(self):
return self.c.lang
return self.c.lang
def __set__(self, attr_t x):
self.c.lang = x
@lang.setter
def lang(self, attr_t x):
self.c.lang = x
property prob:
@property
def prob(self):
"""RETURNS (float): Smoothed log probability estimate of the lexeme's
type."""
def __get__(self):
prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
settings_table = self.vocab.lookups.get_table("lexeme_settings", {})
default_oov_prob = settings_table.get("oov_prob", -20.0)
return prob_table.get(self.c.orth, default_oov_prob)
prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
settings_table = self.vocab.lookups.get_table("lexeme_settings", {})
default_oov_prob = settings_table.get("oov_prob", -20.0)
return prob_table.get(self.c.orth, default_oov_prob)
def __set__(self, float x):
prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
prob_table[self.c.orth] = x
@prob.setter
def prob(self, float x):
prob_table = self.vocab.lookups.get_table("lexeme_prob", {})
prob_table[self.c.orth] = x
property lower_:
@property
def lower_(self):
"""RETURNS (str): Lowercase form of the word."""
def __get__(self):
return self.vocab.strings[self.c.lower]
return self.vocab.strings[self.c.lower]
def __set__(self, str x):
self.c.lower = self.vocab.strings.add(x)
@lower_.setter
def lower_(self, str x):
self.c.lower = self.vocab.strings.add(x)
property norm_:
@property
def norm_(self):
"""RETURNS (str): The lexeme's norm, i.e. a normalised form of the
lexeme text.
"""
def __get__(self):
return self.vocab.strings[self.c.norm]
return self.vocab.strings[self.c.norm]
def __set__(self, str x):
self.norm = self.vocab.strings.add(x)
@norm_.setter
def norm_(self, str x):
self.norm = self.vocab.strings.add(x)
property shape_:
@property
def shape_(self):
"""RETURNS (str): Transform of the word's string, to show
orthographic features.
"""
def __get__(self):
return self.vocab.strings[self.c.shape]
return self.vocab.strings[self.c.shape]
def __set__(self, str x):
self.c.shape = self.vocab.strings.add(x)
@shape_.setter
def shape_(self, str x):
self.c.shape = self.vocab.strings.add(x)
property prefix_:
@property
def prefix_(self):
"""RETURNS (str): Length-N substring from the start of the word.
Defaults to `N=1`.
"""
def __get__(self):
return self.vocab.strings[self.c.prefix]
return self.vocab.strings[self.c.prefix]
def __set__(self, str x):
self.c.prefix = self.vocab.strings.add(x)
@prefix_.setter
def prefix_(self, str x):
self.c.prefix = self.vocab.strings.add(x)
property suffix_:
@property
def suffix_(self):
"""RETURNS (str): Length-N substring from the end of the word.
Defaults to `N=3`.
"""
def __get__(self):
return self.vocab.strings[self.c.suffix]
return self.vocab.strings[self.c.suffix]
def __set__(self, str x):
self.c.suffix = self.vocab.strings.add(x)
@suffix_.setter
def suffix_(self, str x):
self.c.suffix = self.vocab.strings.add(x)
property lang_:
@property
def lang_(self):
"""RETURNS (str): Language of the parent vocabulary."""
def __get__(self):
return self.vocab.strings[self.c.lang]
return self.vocab.strings[self.c.lang]
def __set__(self, str x):
self.c.lang = self.vocab.strings.add(x)
@lang_.setter
def lang_(self, str x):
self.c.lang = self.vocab.strings.add(x)
property flags:
@property
def flags(self):
"""RETURNS (uint64): Container of the lexeme's binary flags."""
def __get__(self):
return self.c.flags
return self.c.flags
def __set__(self, flags_t x):
self.c.flags = x
@flags.setter
def flags(self, flags_t x):
self.c.flags = x
@property
def is_oov(self):
"""RETURNS (bool): Whether the lexeme is out-of-vocabulary."""
return self.orth not in self.vocab.vectors
property is_stop:
@property
def is_stop(self):
"""RETURNS (bool): Whether the lexeme is a stop word."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_STOP)
return Lexeme.c_check_flag(self.c, IS_STOP)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_STOP, x)
@is_stop.setter
def is_stop(self, bint x):
Lexeme.c_set_flag(self.c, IS_STOP, x)
property is_alpha:
@property
def is_alpha(self):
"""RETURNS (bool): Whether the lexeme consists of alphabetic
characters. Equivalent to `lexeme.text.isalpha()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_ALPHA)
return Lexeme.c_check_flag(self.c, IS_ALPHA)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_ALPHA, x)
@is_alpha.setter
def is_alpha(self, bint x):
Lexeme.c_set_flag(self.c, IS_ALPHA, x)
property is_ascii:
@property
def is_ascii(self):
"""RETURNS (bool): Whether the lexeme consists of ASCII characters.
Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_ASCII)
return Lexeme.c_check_flag(self.c, IS_ASCII)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_ASCII, x)
@is_ascii.setter
def is_ascii(self, bint x):
Lexeme.c_set_flag(self.c, IS_ASCII, x)
property is_digit:
@property
def is_digit(self):
"""RETURNS (bool): Whether the lexeme consists of digits. Equivalent
to `lexeme.text.isdigit()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_DIGIT)
return Lexeme.c_check_flag(self.c, IS_DIGIT)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_DIGIT, x)
@is_digit.setter
def is_digit(self, bint x):
Lexeme.c_set_flag(self.c, IS_DIGIT, x)
property is_lower:
@property
def is_lower(self):
"""RETURNS (bool): Whether the lexeme is in lowercase. Equivalent to
`lexeme.text.islower()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_LOWER)
return Lexeme.c_check_flag(self.c, IS_LOWER)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_LOWER, x)
@is_lower.setter
def is_lower(self, bint x):
Lexeme.c_set_flag(self.c, IS_LOWER, x)
property is_upper:
@property
def is_upper(self):
"""RETURNS (bool): Whether the lexeme is in uppercase. Equivalent to
`lexeme.text.isupper()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_UPPER)
return Lexeme.c_check_flag(self.c, IS_UPPER)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_UPPER, x)
@is_upper.setter
def is_upper(self, bint x):
Lexeme.c_set_flag(self.c, IS_UPPER, x)
property is_title:
@property
def is_title(self):
"""RETURNS (bool): Whether the lexeme is in titlecase. Equivalent to
`lexeme.text.istitle()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_TITLE)
return Lexeme.c_check_flag(self.c, IS_TITLE)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_TITLE, x)
@is_title.setter
def is_title(self, bint x):
Lexeme.c_set_flag(self.c, IS_TITLE, x)
property is_punct:
@property
def is_punct(self):
"""RETURNS (bool): Whether the lexeme is punctuation."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_PUNCT)
return Lexeme.c_check_flag(self.c, IS_PUNCT)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_PUNCT, x)
@is_punct.setter
def is_punct(self, bint x):
Lexeme.c_set_flag(self.c, IS_PUNCT, x)
property is_space:
@property
def is_space(self):
"""RETURNS (bool): Whether the lexeme consist of whitespace characters.
Equivalent to `lexeme.text.isspace()`.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_SPACE)
return Lexeme.c_check_flag(self.c, IS_SPACE)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_SPACE, x)
@is_space.setter
def is_space(self, bint x):
Lexeme.c_set_flag(self.c, IS_SPACE, x)
property is_bracket:
@property
def is_bracket(self):
"""RETURNS (bool): Whether the lexeme is a bracket."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_BRACKET)
return Lexeme.c_check_flag(self.c, IS_BRACKET)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_BRACKET, x)
@is_bracket.setter
def is_bracket(self, bint x):
Lexeme.c_set_flag(self.c, IS_BRACKET, x)
property is_quote:
@property
def is_quote(self):
"""RETURNS (bool): Whether the lexeme is a quotation mark."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_QUOTE)
return Lexeme.c_check_flag(self.c, IS_QUOTE)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_QUOTE, x)
@is_quote.setter
def is_quote(self, bint x):
Lexeme.c_set_flag(self.c, IS_QUOTE, x)
property is_left_punct:
@property
def is_left_punct(self):
"""RETURNS (bool): Whether the lexeme is left punctuation, e.g. (."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)
@is_left_punct.setter
def is_left_punct(self, bint x):
Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)
property is_right_punct:
@property
def is_right_punct(self):
"""RETURNS (bool): Whether the lexeme is right punctuation, e.g. )."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
@is_right_punct.setter
def is_right_punct(self, bint x):
Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
property is_currency:
@property
def is_currency(self):
"""RETURNS (bool): Whether the lexeme is a currency symbol, e.g. $, €."""
def __get__(self):
return Lexeme.c_check_flag(self.c, IS_CURRENCY)
return Lexeme.c_check_flag(self.c, IS_CURRENCY)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, IS_CURRENCY, x)
@is_currency.setter
def is_currency(self, bint x):
Lexeme.c_set_flag(self.c, IS_CURRENCY, x)
property like_url:
@property
def like_url(self):
"""RETURNS (bool): Whether the lexeme resembles a URL."""
def __get__(self):
return Lexeme.c_check_flag(self.c, LIKE_URL)
return Lexeme.c_check_flag(self.c, LIKE_URL)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, LIKE_URL, x)
@like_url.setter
def like_url(self, bint x):
Lexeme.c_set_flag(self.c, LIKE_URL, x)
property like_num:
@property
def like_num(self):
"""RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9",
"10", "ten", etc.
"""
def __get__(self):
return Lexeme.c_check_flag(self.c, LIKE_NUM)
return Lexeme.c_check_flag(self.c, LIKE_NUM)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, LIKE_NUM, x)
@like_num.setter
def like_num(self, bint x):
Lexeme.c_set_flag(self.c, LIKE_NUM, x)
property like_email:
@property
def like_email(self):
"""RETURNS (bool): Whether the lexeme resembles an email address."""
def __get__(self):
return Lexeme.c_check_flag(self.c, LIKE_EMAIL)
return Lexeme.c_check_flag(self.c, LIKE_EMAIL)
def __set__(self, bint x):
Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)
@like_email.setter
def like_email(self, bint x):
Lexeme.c_set_flag(self.c, LIKE_EMAIL, x)


@ -203,7 +203,7 @@ cdef class ArcEagerGold:
def __init__(self, ArcEager moves, StateClass stcls, Example example):
self.mem = Pool()
heads, labels = example.get_aligned_parse(projectivize=True)
labels = [example.x.vocab.strings.add(label) if label is not None else MISSING_DEP for label in labels]
labels = [example.x.vocab.strings.add(label, allow_transient=False) if label is not None else MISSING_DEP for label in labels]
sent_starts = _get_aligned_sent_starts(example)
assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts))
self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts)


@ -183,7 +183,7 @@ cpdef deprojectivize(Doc doc):
new_label, head_label = label.split(DELIMITER)
new_head = _find_new_head(doc[i], head_label)
doc.c[i].head = new_head.i - i
doc.c[i].dep = doc.vocab.strings.add(new_label)
doc.c[i].dep = doc.vocab.strings.add(new_label, allow_transient=False)
set_children_from_heads(doc.c, 0, doc.length)
return doc


@ -11,7 +11,6 @@ from .. import util
from ..errors import Errors
from ..kb import Candidate, KnowledgeBase
from ..language import Language
from ..ml import empty_kb
from ..scorer import Scorer
from ..tokens import Doc, Span
from ..training import Example, validate_examples, validate_get_examples
@ -105,7 +104,7 @@ def make_entity_linker(
): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
scorer (Optional[Callable]): The scoring method.
use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another
use_gold_ents (bool): Whether to copy entities from gold docs during training or not. If false, another
component must provide entity annotations.
candidates_batch_size (int): Size of batches for entity candidate generation.
threshold (Optional[float]): Confidence threshold for entity predictions. If confidence is below the threshold,
@ -235,7 +234,6 @@ class EntityLinker(TrainablePipe):
self.cfg: Dict[str, Any] = {"overwrite": overwrite}
self.distance = CosineDistance(normalize=False)
self.kb = generate_empty_kb(self.vocab, entity_vector_length)
self.scorer = scorer
self.use_gold_ents = use_gold_ents
self.candidates_batch_size = candidates_batch_size
self.threshold = threshold
@ -243,6 +241,37 @@ class EntityLinker(TrainablePipe):
if candidates_batch_size < 1:
raise ValueError(Errors.E1044)
def _score_with_ents_set(examples: Iterable[Example], **kwargs):
# Because of how spaCy works, we can't just score immediately, because Language.evaluate
# calls pipe() on the predicted docs, which won't have entities if there is no NER in the pipeline.
if not scorer:
return scorer
if not self.use_gold_ents:
return scorer(examples, **kwargs)
else:
examples = self._ensure_ents(examples)
docs = self.pipe(
(eg.predicted for eg in examples),
)
for eg, doc in zip(examples, docs):
eg.predicted = doc
return scorer(examples, **kwargs)
self.scorer = _score_with_ents_set
def _ensure_ents(self, examples: Iterable[Example]) -> Iterable[Example]:
"""If use_gold_ents is true, set the gold entities to (a copy of) eg.predicted."""
if not self.use_gold_ents:
return examples
new_examples = []
for eg in examples:
ents, _ = eg.get_aligned_ents_and_ner()
new_eg = eg.copy()
new_eg.predicted.ents = ents
new_examples.append(new_eg)
return new_examples
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
"""Define the KB of this pipe by providing a function that will
create it using this object's vocab."""
@ -284,11 +313,9 @@ class EntityLinker(TrainablePipe):
nO = self.kb.entity_vector_length
doc_sample = []
vector_sample = []
for eg in islice(get_examples(), 10):
examples = self._ensure_ents(islice(get_examples(), 10))
for eg in examples:
doc = eg.x
if self.use_gold_ents:
ents, _ = eg.get_aligned_ents_and_ner()
doc.ents = ents
doc_sample.append(doc)
vector_sample.append(self.model.ops.alloc1f(nO))
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
@ -354,31 +381,17 @@ class EntityLinker(TrainablePipe):
losses.setdefault(self.name, 0.0)
if not examples:
return losses
examples = self._ensure_ents(examples)
validate_examples(examples, "EntityLinker.update")
set_dropout_rate(self.model, drop)
docs = [eg.predicted for eg in examples]
# save to restore later
old_ents = [doc.ents for doc in docs]
for doc, ex in zip(docs, examples):
if self.use_gold_ents:
ents, _ = ex.get_aligned_ents_and_ner()
doc.ents = ents
else:
# only keep matching ents
doc.ents = ex.get_matching_ents()
# make sure we have something to learn from, if not, short-circuit
if not self.batch_has_learnable_example(examples):
return losses
set_dropout_rate(self.model, drop)
docs = [eg.predicted for eg in examples]
sentence_encodings, bp_context = self.model.begin_update(docs)
# now restore the ents
for doc, old in zip(docs, old_ents):
doc.ents = old
loss, d_scores = self.get_loss(
sentence_encodings=sentence_encodings, examples=examples
)
@ -386,11 +399,13 @@ class EntityLinker(TrainablePipe):
if sgd is not None:
self.finish_update(sgd)
losses[self.name] += loss
return losses
def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
validate_examples(examples, "EntityLinker.get_loss")
entity_encodings = []
# We assume that get_loss is called with gold ents set in the examples if need be
eidx = 0 # indices in gold entities to keep
keep_ents = [] # indices in sentence_encodings to keep


@ -25,5 +25,7 @@ cdef class StringStore:
cdef vector[hash_t] keys
cdef public PreshMap _map
cdef const Utf8Str* intern_unicode(self, str py_string)
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)
cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient)
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient)
cdef vector[hash_t] _transient_keys
cdef Pool _non_temp_mem


@ -1,9 +1,14 @@
# cython: infer_types=True
# cython: profile=False
cimport cython
from contextlib import contextmanager
from typing import Iterator, List, Optional
from libc.stdint cimport uint32_t
from libc.string cimport memcpy
from murmurhash.mrmr cimport hash32, hash64
from preshed.maps cimport map_clear
import srsly
@ -31,7 +36,7 @@ def get_string_id(key):
This function optimises for convenience over performance, so shouldn't be
used in tight loops.
"""
cdef hash_t str_hash
cdef hash_t str_hash
if isinstance(key, str):
if len(key) == 0:
return 0
@ -45,8 +50,8 @@ def get_string_id(key):
elif _try_coerce_to_hash(key, &str_hash):
# Coerce the integral key to the expected primitive hash type.
# This ensures that custom/overloaded "primitive" data types
# such as those implemented by numpy are not inadvertently used
# downsteam (as these are internally implemented as custom PyObjects
# such as those implemented by numpy are not inadvertently used
# downsteam (as these are internally implemented as custom PyObjects
# whose comparison operators can incur a significant overhead).
return str_hash
else:
@ -119,10 +124,11 @@ cdef class StringStore:
strings (iterable): A sequence of unicode strings to add to the store.
"""
self.mem = Pool()
self._non_temp_mem = self.mem
self._map = PreshMap()
if strings is not None:
for string in strings:
self.add(string)
self.add(string, allow_transient=False)
def __getitem__(self, object string_or_id):
"""Retrieve a string from a given hash, or vice versa.
@ -152,14 +158,17 @@ cdef class StringStore:
return SYMBOLS_BY_INT[str_hash]
else:
utf8str = <Utf8Str*>self._map.get(str_hash)
if utf8str is NULL:
raise KeyError(Errors.E018.format(hash_value=string_or_id))
else:
return decode_Utf8Str(utf8str)
else:
# TODO: Raise an error instead
utf8str = <Utf8Str*>self._map.get(string_or_id)
if utf8str is NULL:
raise KeyError(Errors.E018.format(hash_value=string_or_id))
else:
return decode_Utf8Str(utf8str)
if utf8str is NULL:
raise KeyError(Errors.E018.format(hash_value=string_or_id))
else:
return decode_Utf8Str(utf8str)
def as_int(self, key):
"""If key is an int, return it; otherwise, get the int value."""
@ -175,12 +184,46 @@ cdef class StringStore:
else:
return self[key]
def add(self, string):
def __len__(self) -> int:
"""The number of strings in the store.
RETURNS (int): The number of strings in the store.
"""
return self.keys.size() + self._transient_keys.size()
@contextmanager
def memory_zone(self, mem: Optional[Pool] = None) -> Pool:
"""Begin a block where all resources allocated during the block will
be freed at the end of it. If a resource was created within the
memory zone block, accessing it outside the block is invalid.
Behaviour of this invalid access is undefined. Memory zones should
not be nested.
The memory zone is helpful for services that need to process large
volumes of text with a defined memory budget.
"""
if mem is None:
mem = Pool()
self.mem = mem
yield mem
for key in self._transient_keys:
map_clear(self._map.c_map, key)
self._transient_keys.clear()
self.mem = self._non_temp_mem
def add(self, string: str, allow_transient: Optional[bool] = None) -> int:
"""Add a string to the StringStore.
string (str): The string to add.
allow_transient (bool): Allow the string to be stored in the 'transient'
map, which will be flushed at the end of the memory zone. Strings
encountered during arbitrary text processing should be added
with allow_transient=True, while labels and other strings used
internally should not.
RETURNS (uint64): The string's hash value.
"""
if allow_transient is None:
allow_transient = self.mem is not self._non_temp_mem
cdef hash_t str_hash
if isinstance(string, str):
if string in SYMBOLS_BY_STR:
@ -188,22 +231,26 @@ cdef class StringStore:
string = string.encode("utf8")
str_hash = hash_utf8(string, len(string))
self._intern_utf8(string, len(string), &str_hash)
self._intern_utf8(string, len(string), &str_hash, allow_transient)
elif isinstance(string, bytes):
if string in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string]
str_hash = hash_utf8(string, len(string))
self._intern_utf8(string, len(string), &str_hash)
self._intern_utf8(string, len(string), &str_hash, allow_transient)
else:
raise TypeError(Errors.E017.format(value_type=type(string)))
return str_hash
def __len__(self):
"""The number of strings in the store.
if string in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string]
else:
return self._intern_str(string, allow_transient)
RETURNS (int): The number of strings in the store.
"""
return self.keys.size()
return self.keys.size() + self._transient_keys.size()
def __contains__(self, string_or_id not None):
"""Check whether a string or ID is in the store.
@ -222,12 +269,17 @@ cdef class StringStore:
pass
else:
# TODO: Raise an error instead
return self._map.get(string_or_id) is not NULL
if self._map.get(string_or_id) is not NULL:
return True
else:
return False
if str_hash < len(SYMBOLS_BY_INT):
return True
else:
return self._map.get(str_hash) is not NULL
if self._map.get(str_hash) is not NULL:
return True
else:
return False
def __iter__(self):
"""Iterate over the strings in the store, in order.
@ -240,12 +292,29 @@ cdef class StringStore:
key = self.keys[i]
utf8str = <Utf8Str*>self._map.get(key)
yield decode_Utf8Str(utf8str)
# TODO: Iterate OOV here?
for i in range(self._transient_keys.size()):
key = self._transient_keys[i]
utf8str = <Utf8Str*>self._map.get(key)
yield decode_Utf8Str(utf8str)
def __reduce__(self):
strings = list(self)
return (StringStore, (strings,), None, None, None)
def values(self) -> List[int]:
"""Iterate over the stored strings hashes in insertion order.
RETURNS: A list of string hashes.
"""
cdef int i
hashes = [None] * self._keys.size()
for i in range(self._keys.size()):
hashes[i] = self._keys[i]
transient_hashes = [None] * self._transient_keys.size()
for i in range(self._transient_keys.size()):
transient_hashes[i] = self._transient_keys[i]
return hashes + transient_hashes
def to_disk(self, path):
"""Save the current state to a directory.
@ -269,7 +338,7 @@ cdef class StringStore:
prev = list(self)
self._reset_and_load(strings)
for word in prev:
self.add(word)
self.add(word, allow_transient=False)
return self
def to_bytes(self, **kwargs):
@ -289,23 +358,25 @@ cdef class StringStore:
prev = list(self)
self._reset_and_load(strings)
for word in prev:
self.add(word)
self.add(word, allow_transient=False)
return self
def _reset_and_load(self, strings):
self.mem = Pool()
self._non_temp_mem = self.mem
self._map = PreshMap()
self.keys.clear()
self._transient_keys.clear()
for string in strings:
self.add(string)
self.add(string, allow_transient=False)
cdef const Utf8Str* intern_unicode(self, str py_string):
cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient):
# 0 means missing, but we don't bother offsetting the index.
cdef bytes byte_string = py_string.encode("utf8")
return self._intern_utf8(byte_string, len(byte_string), NULL)
return self._intern_utf8(byte_string, len(byte_string), NULL, allow_transient)
@cython.final
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash):
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient):
# TODO: This function's API/behaviour is an unholy mess...
# 0 means missing, but we don't bother offsetting the index.
cdef hash_t key = precalculated_hash[0] if precalculated_hash is not NULL else hash_utf8(utf8_string, length)
@ -314,5 +385,8 @@ cdef class StringStore:
return value
value = _allocate(self.mem, <unsigned char*>utf8_string, length)
self._map.set(key, value)
self.keys.push_back(key)
if allow_transient and self.mem is not self._non_temp_mem:
self._transient_keys.push_back(key)
else:
self.keys.push_back(key)
return value
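Taken together, the StringStore changes above mean strings added inside a memory_zone block are transient and flushed when the block exits, while allow_transient=False keeps them permanently. A hedged sketch based on the docstrings in this hunk (label and text are illustrative):

from spacy.strings import StringStore

store = StringStore()
store.add("CUSTOM_LABEL", allow_transient=False)  # label-like strings stay permanent

with store.memory_zone():
    store.add("one-off token text")               # defaults to transient inside the zone
    assert "one-off token text" in store

# after the zone exits, the transient entry has been flushed
assert "CUSTOM_LABEL" in store
assert "one-off token text" not in store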


@ -81,6 +81,11 @@ def bn_tokenizer():
return get_lang_class("bn")().tokenizer
@pytest.fixture(scope="session")
def bo_tokenizer():
return get_lang_class("bo")().tokenizer
@pytest.fixture(scope="session")
def ca_tokenizer():
return get_lang_class("ca")().tokenizer


@ -0,0 +1,21 @@
import pytest
@pytest.mark.parametrize(
"text,match",
[
("10", True),
("1", True),
("999.0", True),
("གཅིག་", True),
("གཉིས་", True),
("ཀླད་ཀོར་", True),
("བཅུ་གཅིག་", True),
("ཁྱི་", False),
(",", False),
],
)
def test_lex_attrs_like_number(bo_tokenizer, text, match):
tokens = bo_tokenizer(text)
assert len(tokens) == 1
assert tokens[0].like_num == match


@ -0,0 +1,27 @@
import pytest
from spacy.lang.kmr.lex_attrs import like_num
@pytest.mark.parametrize(
"word",
[
"yekem",
"duyemîn",
"100em",
"dehem",
"sedemîn",
"34em",
"30yem",
"20emîn",
"50yemîn",
],
)
def test_kmr_lex_attrs_like_number_for_ordinal(word):
assert like_num(word)
@pytest.mark.parametrize("word", ["deh"])
def test_kmr_lex_attrs_capitals(word):
assert like_num(word)
assert like_num(word.upper())


@ -10,7 +10,7 @@ LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el",
"hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv",
"mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
"si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn",
"tr", "tt", "uk", "ur", "xx", "yo"]
"tr", "tt", "uk", "ur", "xx", "yo", "kmr"]
# fmt: on


@ -717,7 +717,7 @@ GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"]
# fmt: on
def test_overfitting_IO():
def test_overfitting_IO_gold_entities():
# Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
nlp = English()
vector_length = 3
@ -744,7 +744,9 @@ def test_overfitting_IO():
return mykb
# Create the Entity Linker component and add it to the pipeline
entity_linker = nlp.add_pipe("entity_linker", last=True)
entity_linker = nlp.add_pipe(
"entity_linker", last=True, config={"use_gold_ents": True}
)
assert isinstance(entity_linker, EntityLinker)
entity_linker.set_kb(create_kb)
assert "Q2146908" in entity_linker.vocab.strings
@ -807,6 +809,107 @@ def test_overfitting_IO():
assert_equal(batch_deps_1, batch_deps_2)
assert_equal(batch_deps_1, no_batch_deps)
eval = nlp.evaluate(train_examples)
assert "nel_macro_p" in eval
assert "nel_macro_r" in eval
assert "nel_macro_f" in eval
assert "nel_micro_p" in eval
assert "nel_micro_r" in eval
assert "nel_micro_f" in eval
assert "nel_f_per_type" in eval
assert "PERSON" in eval["nel_f_per_type"]
assert eval["nel_macro_f"] > 0
assert eval["nel_micro_f"] > 0
def test_overfitting_IO_with_ner():
# Simple test to try and overfit the NER and NEL component in combination - ensuring the ML models work correctly
nlp = English()
vector_length = 3
assert "Q2146908" not in nlp.vocab.strings
# Convert the texts to docs to make sure we have doc.ents set for the training examples
train_examples = []
for text, annotation in TRAIN_DATA:
doc = nlp(text)
train_examples.append(Example.from_dict(doc, annotation))
def create_kb(vocab):
# create artificial KB - assign same prior weight to the two russ cochran's
# Q2146908 (Russ Cochran): American golfer
# Q7381115 (Russ Cochran): publisher
mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length)
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
mykb.add_alias(
alias="Russ Cochran",
entities=["Q2146908", "Q7381115"],
probabilities=[0.5, 0.5],
)
return mykb
# Create the NER and EL components and add them to the pipeline
ner = nlp.add_pipe("ner", first=True)
entity_linker = nlp.add_pipe(
"entity_linker", last=True, config={"use_gold_ents": False}
)
entity_linker.set_kb(create_kb)
train_examples = []
for text, annotations in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
for ent in annotations.get("entities"):
ner.add_label(ent[2])
optimizer = nlp.initialize()
# train the NER and NEL pipes
for i in range(50):
losses = {}
nlp.update(train_examples, sgd=optimizer, losses=losses)
assert losses["ner"] < 0.001
assert losses["entity_linker"] < 0.001
# adding additional components that are required for the entity_linker
nlp.add_pipe("sentencizer", first=True)
# test the trained model
test_text = "Russ Cochran captured his first major title with his son as caddie."
doc = nlp(test_text)
ents = doc.ents
assert len(ents) == 1
assert ents[0].text == "Russ Cochran"
assert ents[0].label_ == "PERSON"
assert ents[0].kb_id_ != "NIL"
# TODO: below assert is still flaky - EL doesn't properly overfit quite yet
# assert ents[0].kb_id_ == "Q2146908"
# Also test the results are still the same after IO
with make_tempdir() as tmp_dir:
nlp.to_disk(tmp_dir)
nlp2 = util.load_model_from_path(tmp_dir)
assert nlp2.pipe_names == nlp.pipe_names
doc2 = nlp2(test_text)
ents2 = doc2.ents
assert len(ents2) == 1
assert ents2[0].text == "Russ Cochran"
assert ents2[0].label_ == "PERSON"
assert ents2[0].kb_id_ != "NIL"
eval = nlp.evaluate(train_examples)
assert "nel_macro_f" in eval
assert "nel_micro_f" in eval
assert "ents_f" in eval
assert "nel_f_per_type" in eval
assert "ents_per_type" in eval
assert "PERSON" in eval["nel_f_per_type"]
assert "PERSON" in eval["ents_per_type"]
assert eval["nel_macro_f"] > 0
assert eval["nel_micro_f"] > 0
assert eval["ents_f"] > 0
def test_kb_serialization():
# Test that the KB can be used in a pipeline with a different vocab


@ -329,7 +329,7 @@ def test_language_pipe_error_handler(n_process):
nlp.set_error_handler(raise_error)
with pytest.raises(ValueError):
list(nlp.pipe(texts, n_process=n_process))
# set explicitely to ignoring
# set explicitly to ignoring
nlp.set_error_handler(ignore_error)
docs = list(nlp.pipe(texts, n_process=n_process))
assert len(docs) == 0


@ -18,6 +18,7 @@ LANGUAGES = [
pytest.param("ar", marks=pytest.mark.slow()),
pytest.param("bg", marks=pytest.mark.slow()),
"bn",
pytest.param("bo", marks=pytest.mark.slow()),
pytest.param("ca", marks=pytest.mark.slow()),
pytest.param("cs", marks=pytest.mark.slow()),
pytest.param("da", marks=pytest.mark.slow()),
@ -57,6 +58,7 @@ LANGUAGES = [
pytest.param("tr", marks=pytest.mark.slow()),
pytest.param("tt", marks=pytest.mark.slow()),
pytest.param("ur", marks=pytest.mark.slow()),
pytest.param("kmr", marks=pytest.mark.slow()),
]


@ -0,0 +1,36 @@
from spacy.vocab import Vocab
def test_memory_zone_no_insertion():
vocab = Vocab()
with vocab.memory_zone():
pass
lex = vocab["horse"]
assert lex.text == "horse"
def test_memory_zone_insertion():
vocab = Vocab()
_ = vocab["dog"]
assert "dog" in vocab
assert "horse" not in vocab
with vocab.memory_zone():
lex = vocab["horse"]
assert lex.text == "horse"
assert "dog" in vocab
assert "horse" not in vocab
def test_memory_zone_redundant_insertion():
"""Test that if we insert an already-existing word while
in the memory zone, it stays persistent"""
vocab = Vocab()
_ = vocab["dog"]
assert "dog" in vocab
assert "horse" not in vocab
with vocab.memory_zone():
lex = vocab["horse"]
assert lex.text == "horse"
_ = vocab["dog"]
assert "dog" in vocab
assert "horse" not in vocab


@ -25,9 +25,7 @@ cdef class Tokenizer:
cdef PhraseMatcher _special_matcher
# TODO convert to bool in v4
cdef int _faster_heuristics
# TODO next one is unused and should be removed in v4
# https://github.com/explosion/spaCy/pull/9150
cdef int _unused_int2
cdef public int max_cache_size
cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
cdef int _apply_special_cases(self, Doc doc) except -1

View File

@ -30,7 +30,7 @@ cdef class Tokenizer:
"""
def __init__(self, Vocab vocab, rules=None, prefix_search=None,
suffix_search=None, infix_finditer=None, token_match=None,
url_match=None, faster_heuristics=True):
url_match=None, faster_heuristics=True, max_cache_size=10000):
"""Create a `Tokenizer`, to create `Doc` objects given unicode text.
vocab (Vocab): A storage container for lexical types.
@ -50,6 +50,7 @@ cdef class Tokenizer:
faster_heuristics (bool): Whether to restrict the final
Matcher-based pass for rules to those containing affixes or space.
Defaults to True.
max_cache_size (int): Maximum number of tokenization chunks to cache.
EXAMPLE:
>>> tokenizer = Tokenizer(nlp.vocab)
@ -69,66 +70,74 @@ cdef class Tokenizer:
self._rules = {}
self._special_matcher = PhraseMatcher(self.vocab)
self._load_special_cases(rules)
self.max_cache_size = max_cache_size
property token_match:
def __get__(self):
return self._token_match
@property
def token_match(self):
return self._token_match
def __set__(self, token_match):
self._token_match = token_match
self._reload_special_cases()
@token_match.setter
def token_match(self, token_match):
self._token_match = token_match
self._reload_special_cases()
property url_match:
def __get__(self):
return self._url_match
@property
def url_match(self):
return self._url_match
def __set__(self, url_match):
self._url_match = url_match
self._reload_special_cases()
@url_match.setter
def url_match(self, url_match):
self._url_match = url_match
self._reload_special_cases()
property prefix_search:
def __get__(self):
return self._prefix_search
@property
def prefix_search(self):
return self._prefix_search
def __set__(self, prefix_search):
self._prefix_search = prefix_search
self._reload_special_cases()
@prefix_search.setter
def prefix_search(self, prefix_search):
self._prefix_search = prefix_search
self._reload_special_cases()
property suffix_search:
def __get__(self):
return self._suffix_search
@property
def suffix_search(self):
return self._suffix_search
def __set__(self, suffix_search):
self._suffix_search = suffix_search
self._reload_special_cases()
@suffix_search.setter
def suffix_search(self, suffix_search):
self._suffix_search = suffix_search
self._reload_special_cases()
property infix_finditer:
def __get__(self):
return self._infix_finditer
@property
def infix_finditer(self):
return self._infix_finditer
def __set__(self, infix_finditer):
self._infix_finditer = infix_finditer
self._reload_special_cases()
@infix_finditer.setter
def infix_finditer(self, infix_finditer):
self._infix_finditer = infix_finditer
self._reload_special_cases()
property rules:
def __get__(self):
return self._rules
@property
def rules(self):
return self._rules
def __set__(self, rules):
self._rules = {}
self._flush_cache()
self._flush_specials()
self._cache = PreshMap()
self._specials = PreshMap()
self._load_special_cases(rules)
@rules.setter
def rules(self, rules):
self._rules = {}
self._flush_cache()
self._flush_specials()
self._cache = PreshMap()
self._specials = PreshMap()
self._load_special_cases(rules)
property faster_heuristics:
def __get__(self):
return bool(self._faster_heuristics)
@property
def faster_heuristics(self):
return bool(self._faster_heuristics)
def __set__(self, faster_heuristics):
self._faster_heuristics = bool(faster_heuristics)
self._reload_special_cases()
@faster_heuristics.setter
def faster_heuristics(self, faster_heuristics):
self._faster_heuristics = bool(faster_heuristics)
self._reload_special_cases()
def __reduce__(self):
args = (self.vocab,
@ -390,8 +399,9 @@ cdef class Tokenizer:
has_special, with_special_cases)
self._attach_tokens(tokens, span, &prefixes, &suffixes, has_special,
with_special_cases)
self._save_cached(&tokens.c[orig_size], orig_key, has_special,
tokens.length - orig_size)
if len(self._cache) < self.max_cache_size:
self._save_cached(&tokens.c[orig_size], orig_key, has_special,
tokens.length - orig_size)
cdef str _split_affixes(
self,
@ -507,9 +517,8 @@ cdef class Tokenizer:
if n <= 0:
# avoid mem alloc of zero length
return 0
for i in range(n):
if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
return 0
if self.vocab.in_memory_zone:
return 0
# See #1250
if has_special[0]:
return 0
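These hunks add a `max_cache_size` argument to `Tokenizer.__init__` and stop populating the chunk cache once it reaches that size. A small usage sketch; the cap of 500 is an arbitrary illustrative value, not a recommendation from this diff.

```python
# Sketch: build a tokenizer with a capped tokenization-chunk cache.
# Affix rules are omitted here, so this only does simple splitting.
import spacy
from spacy.tokenizer import Tokenizer

nlp = spacy.blank("en")
tokenizer = Tokenizer(nlp.vocab, max_cache_size=500)
doc = tokenizer("Text tokenized with a bounded cache.")
print([t.text for t in doc])
```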

View File

@ -667,7 +667,8 @@ cdef class Doc:
else:
return False
property vector:
@property
def vector(self):
"""A real-valued meaning representation. Defaults to an average of the
token vectors.
@ -676,48 +677,49 @@ cdef class Doc:
DOCS: https://spacy.io/api/doc#vector
"""
def __get__(self):
if "vector" in self.user_hooks:
return self.user_hooks["vector"](self)
if self._vector is not None:
return self._vector
xp = get_array_module(self.vocab.vectors.data)
if not len(self):
self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f")
return self._vector
elif self.vocab.vectors.size > 0:
self._vector = sum(t.vector for t in self) / len(self)
return self._vector
elif self.tensor.size > 0:
self._vector = self.tensor.mean(axis=0)
return self._vector
else:
return xp.zeros((self.vocab.vectors_length,), dtype="float32")
if "vector" in self.user_hooks:
return self.user_hooks["vector"](self)
if self._vector is not None:
return self._vector
xp = get_array_module(self.vocab.vectors.data)
if not len(self):
self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f")
return self._vector
elif self.vocab.vectors.size > 0:
self._vector = sum(t.vector for t in self) / len(self)
return self._vector
elif self.tensor.size > 0:
self._vector = self.tensor.mean(axis=0)
return self._vector
else:
return xp.zeros((self.vocab.vectors_length,), dtype="float32")
def __set__(self, value):
self._vector = value
@vector.setter
def vector(self, value):
self._vector = value
property vector_norm:
@property
def vector_norm(self):
"""The L2 norm of the document's vector representation.
RETURNS (float): The L2 norm of the vector representation.
DOCS: https://spacy.io/api/doc#vector_norm
"""
def __get__(self):
if "vector_norm" in self.user_hooks:
return self.user_hooks["vector_norm"](self)
cdef float value
cdef double norm = 0
if self._vector_norm is None:
norm = 0.0
for value in self.vector:
norm += value * value
self._vector_norm = sqrt(norm) if norm != 0 else 0
return self._vector_norm
if "vector_norm" in self.user_hooks:
return self.user_hooks["vector_norm"](self)
cdef float value
cdef double norm = 0
if self._vector_norm is None:
norm = 0.0
for value in self.vector:
norm += value * value
self._vector_norm = sqrt(norm) if norm != 0 else 0
return self._vector_norm
def __set__(self, value):
self._vector_norm = value
@vector_norm.setter
def vector_norm(self, value):
self._vector_norm = value
@property
def text(self):
@ -736,7 +738,8 @@ cdef class Doc:
"""
return self.text
property ents:
@property
def ents(self):
"""The named entities in the document. Returns a tuple of named entity
`Span` objects, if the entity recognizer has been applied.
@ -744,55 +747,55 @@ cdef class Doc:
DOCS: https://spacy.io/api/doc#ents
"""
def __get__(self):
cdef int i
cdef const TokenC* token
cdef int start = -1
cdef attr_t label = 0
cdef attr_t kb_id = 0
cdef attr_t ent_id = 0
output = []
for i in range(self.length):
token = &self.c[i]
if token.ent_iob == 1:
if start == -1:
seq = [f"{t.text}|{t.ent_iob_}" for t in self[i-5:i+5]]
raise ValueError(Errors.E093.format(seq=" ".join(seq)))
elif token.ent_iob == 2 or token.ent_iob == 0 or \
(token.ent_iob == 3 and token.ent_type == 0):
if start != -1:
output.append(Span(self, start, i, label=label, kb_id=kb_id, span_id=ent_id))
start = -1
label = 0
kb_id = 0
ent_id = 0
elif token.ent_iob == 3:
if start != -1:
output.append(Span(self, start, i, label=label, kb_id=kb_id, span_id=ent_id))
start = i
label = token.ent_type
kb_id = token.ent_kb_id
ent_id = token.ent_id
if start != -1:
output.append(Span(self, start, self.length, label=label, kb_id=kb_id, span_id=ent_id))
# remove empty-label spans
output = [o for o in output if o.label_ != ""]
return tuple(output)
cdef int i
cdef const TokenC* token
cdef int start = -1
cdef attr_t label = 0
cdef attr_t kb_id = 0
cdef attr_t ent_id = 0
output = []
for i in range(self.length):
token = &self.c[i]
if token.ent_iob == 1:
if start == -1:
seq = [f"{t.text}|{t.ent_iob_}" for t in self[i-5:i+5]]
raise ValueError(Errors.E093.format(seq=" ".join(seq)))
elif token.ent_iob == 2 or token.ent_iob == 0 or \
(token.ent_iob == 3 and token.ent_type == 0):
if start != -1:
output.append(Span(self, start, i, label=label, kb_id=kb_id, span_id=ent_id))
start = -1
label = 0
kb_id = 0
ent_id = 0
elif token.ent_iob == 3:
if start != -1:
output.append(Span(self, start, i, label=label, kb_id=kb_id, span_id=ent_id))
start = i
label = token.ent_type
kb_id = token.ent_kb_id
ent_id = token.ent_id
if start != -1:
output.append(Span(self, start, self.length, label=label, kb_id=kb_id, span_id=ent_id))
# remove empty-label spans
output = [o for o in output if o.label_ != ""]
return tuple(output)
def __set__(self, ents):
# TODO:
# 1. Test basic data-driven ORTH gazetteer
# 2. Test more nuanced date and currency regex
cdef attr_t kb_id, ent_id
cdef int ent_start, ent_end
ent_spans = []
for ent_info in ents:
entity_type_, kb_id, ent_start, ent_end, ent_id = get_entity_info(ent_info)
if isinstance(entity_type_, str):
self.vocab.strings.add(entity_type_)
span = Span(self, ent_start, ent_end, label=entity_type_, kb_id=kb_id, span_id=ent_id)
ent_spans.append(span)
self.set_ents(ent_spans, default=SetEntsDefault.outside)
@ents.setter
def ents(self, ents):
# TODO:
# 1. Test basic data-driven ORTH gazetteer
# 2. Test more nuanced date and currency regex
cdef attr_t kb_id, ent_id
cdef int ent_start, ent_end
ent_spans = []
for ent_info in ents:
entity_type_, kb_id, ent_start, ent_end, ent_id = get_entity_info(ent_info)
if isinstance(entity_type_, str):
self.vocab.strings.add(entity_type_)
span = Span(self, ent_start, ent_end, label=entity_type_, kb_id=kb_id, span_id=ent_id)
ent_spans.append(span)
self.set_ents(ent_spans, default=SetEntsDefault.outside)
def set_ents(self, entities, *, blocked=None, missing=None, outside=None, default=SetEntsDefault.outside):
"""Set entity annotation.

View File

@ -757,78 +757,87 @@ cdef class Span:
for word in self.rights:
yield from word.subtree
property start:
def __get__(self):
return self.c.start
@property
def start(self):
return self.c.start
def __set__(self, int start):
if start < 0:
raise IndexError(Errors.E1032.format(var="start", forbidden="< 0", value=start))
self.c.start = start
@start.setter
def start(self, int start):
if start < 0:
raise IndexError(Errors.E1032.format(var="start", forbidden="< 0", value=start))
self.c.start = start
property end:
def __get__(self):
return self.c.end
@property
def end(self):
return self.c.end
def __set__(self, int end):
if end < 0:
raise IndexError(Errors.E1032.format(var="end", forbidden="< 0", value=end))
self.c.end = end
@end.setter
def end(self, int end):
if end < 0:
raise IndexError(Errors.E1032.format(var="end", forbidden="< 0", value=end))
self.c.end = end
property start_char:
def __get__(self):
return self.c.start_char
@property
def start_char(self):
return self.c.start_char
def __set__(self, int start_char):
if start_char < 0:
raise IndexError(Errors.E1032.format(var="start_char", forbidden="< 0", value=start_char))
self.c.start_char = start_char
@start_char.setter
def start_char(self, int start_char):
if start_char < 0:
raise IndexError(Errors.E1032.format(var="start_char", forbidden="< 0", value=start_char))
self.c.start_char = start_char
property end_char:
def __get__(self):
return self.c.end_char
@property
def end_char(self):
return self.c.end_char
def __set__(self, int end_char):
if end_char < 0:
raise IndexError(Errors.E1032.format(var="end_char", forbidden="< 0", value=end_char))
self.c.end_char = end_char
@end_char.setter
def end_char(self, int end_char):
if end_char < 0:
raise IndexError(Errors.E1032.format(var="end_char", forbidden="< 0", value=end_char))
self.c.end_char = end_char
property label:
def __get__(self):
return self.c.label
@property
def label(self):
return self.c.label
def __set__(self, attr_t label):
self.c.label = label
@label.setter
def label(self, attr_t label):
self.c.label = label
property kb_id:
def __get__(self):
return self.c.kb_id
@property
def kb_id(self):
return self.c.kb_id
def __set__(self, attr_t kb_id):
self.c.kb_id = kb_id
@kb_id.setter
def kb_id(self, attr_t kb_id):
self.c.kb_id = kb_id
property id:
def __get__(self):
return self.c.id
@property
def id(self):
return self.c.id
def __set__(self, attr_t id):
self.c.id = id
@id.setter
def id(self, attr_t id):
self.c.id = id
property ent_id:
@property
def ent_id(self):
"""RETURNS (uint64): The entity ID."""
def __get__(self):
return self.root.ent_id
return self.root.ent_id
def __set__(self, hash_t key):
raise NotImplementedError(Errors.E200.format(attr="ent_id"))
@ent_id.setter
def ent_id(self, hash_t key):
raise NotImplementedError(Errors.E200.format(attr="ent_id"))
property ent_id_:
@property
def ent_id_(self):
"""RETURNS (str): The (string) entity ID."""
def __get__(self):
return self.root.ent_id_
return self.root.ent_id_
def __set__(self, str key):
raise NotImplementedError(Errors.E200.format(attr="ent_id_"))
@ent_id_.setter
def ent_id_(self, str key):
raise NotImplementedError(Errors.E200.format(attr="ent_id_"))
@property
def orth_(self):
@ -843,29 +852,32 @@ cdef class Span:
"""RETURNS (str): The span's lemma."""
return "".join([t.lemma_ + t.whitespace_ for t in self]).strip()
property label_:
@property
def label_(self):
"""RETURNS (str): The span's label."""
def __get__(self):
return self.doc.vocab.strings[self.label]
return self.doc.vocab.strings[self.label]
def __set__(self, str label_):
self.label = self.doc.vocab.strings.add(label_)
@label_.setter
def label_(self, str label_):
self.label = self.doc.vocab.strings.add(label_)
property kb_id_:
@property
def kb_id_(self):
"""RETURNS (str): The span's KB ID."""
def __get__(self):
return self.doc.vocab.strings[self.kb_id]
return self.doc.vocab.strings[self.kb_id]
def __set__(self, str kb_id_):
self.kb_id = self.doc.vocab.strings.add(kb_id_)
@kb_id_.setter
def kb_id_(self, str kb_id_):
self.kb_id = self.doc.vocab.strings.add(kb_id_)
property id_:
@property
def id_(self):
"""RETURNS (str): The span's ID."""
def __get__(self):
return self.doc.vocab.strings[self.id]
return self.doc.vocab.strings[self.id]
def __set__(self, str id_):
self.id = self.doc.vocab.strings.add(id_)
@id_.setter
def id_(self, str id_):
self.id = self.doc.vocab.strings.add(id_)
cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
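Every hunk in this file (and in the neighbouring `doc.pyx` and `token.pyx` changes) is the same mechanical migration: the legacy Cython `property name:` block with nested `__get__`/`__set__` becomes an `@property` getter plus an `@name.setter`, which is also plain-Python syntax. A self-contained sketch of the pattern on a hypothetical class, not actual spaCy code:

```python
# The legacy Cython form being removed looked roughly like:
#
#     property start:
#         def __get__(self):
#             return self.c.start
#         def __set__(self, int start):
#             ...
#
# and is rewritten in the decorator style shown below.
class SpanLike:
    def __init__(self) -> None:
        self._start = 0

    @property
    def start(self) -> int:
        return self._start

    @start.setter
    def start(self, value: int) -> None:
        if value < 0:
            raise IndexError("start must not be negative")
        self._start = value
```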

View File

@ -249,15 +249,16 @@ cdef class Token:
"""
return not self.c.morph == 0
property morph:
def __get__(self):
return MorphAnalysis.from_id(self.vocab, self.c.morph)
@property
def morph(self):
return MorphAnalysis.from_id(self.vocab, self.c.morph)
def __set__(self, MorphAnalysis morph):
# Check that the morph has the same vocab
if self.vocab != morph.vocab:
raise ValueError(Errors.E1013)
self.c.morph = morph.c.key
@morph.setter
def morph(self, MorphAnalysis morph):
# Check that the morph has the same vocab
if self.vocab != morph.vocab:
raise ValueError(Errors.E1013)
self.c.morph = morph.c.key
def set_morph(self, features):
cdef hash_t key
@ -377,39 +378,43 @@ cdef class Token:
"""
return self.c.lex.suffix
property lemma:
@property
def lemma(self):
"""RETURNS (uint64): ID of the base form of the word, with no
inflectional suffixes.
"""
def __get__(self):
return self.c.lemma
return self.c.lemma
def __set__(self, attr_t lemma):
self.c.lemma = lemma
@lemma.setter
def lemma(self, attr_t lemma):
self.c.lemma = lemma
property pos:
@property
def pos(self):
"""RETURNS (uint64): ID of coarse-grained part-of-speech tag."""
def __get__(self):
return self.c.pos
return self.c.pos
def __set__(self, pos):
self.c.pos = pos
@pos.setter
def pos(self, pos):
self.c.pos = pos
property tag:
@property
def tag(self):
"""RETURNS (uint64): ID of fine-grained part-of-speech tag."""
def __get__(self):
return self.c.tag
return self.c.tag
def __set__(self, attr_t tag):
self.c.tag = tag
@tag.setter
def tag(self, attr_t tag):
self.c.tag = tag
property dep:
@property
def dep(self):
"""RETURNS (uint64): ID of syntactic dependency label."""
def __get__(self):
return self.c.dep
return self.c.dep
def __set__(self, attr_t label):
self.c.dep = label
@dep.setter
def dep(self, attr_t label):
self.c.dep = label
@property
def has_vector(self):
@ -494,48 +499,51 @@ cdef class Token:
return self.doc.user_token_hooks["sent"](self)
return self.doc[self.i : self.i+1].sent
property sent_start:
def __get__(self):
"""Deprecated: use Token.is_sent_start instead."""
# Raising a deprecation warning here causes errors for autocomplete
# Handle broken backwards compatibility case: doc[0].sent_start
# was False.
if self.i == 0:
return False
else:
return self.c.sent_start
@property
def sent_start(self):
"""Deprecated: use Token.is_sent_start instead."""
# Raising a deprecation warning here causes errors for autocomplete
# Handle broken backwards compatibility case: doc[0].sent_start
# was False.
if self.i == 0:
return False
else:
return self.c.sent_start
def __set__(self, value):
self.is_sent_start = value
@sent_start.setter
def sent_start(self, value):
self.is_sent_start = value
property is_sent_start:
@property
def is_sent_start(self):
"""A boolean value indicating whether the token starts a sentence.
`None` if unknown. Defaults to `True` for the first token in the `Doc`.
RETURNS (bool / None): Whether the token starts a sentence.
None if unknown.
"""
def __get__(self):
if self.c.sent_start == 0:
return None
elif self.c.sent_start < 0:
return False
else:
return True
if self.c.sent_start == 0:
return None
elif self.c.sent_start < 0:
return False
else:
return True
def __set__(self, value):
if self.doc.has_annotation("DEP"):
raise ValueError(Errors.E043)
if value is None:
self.c.sent_start = 0
elif value is True:
self.c.sent_start = 1
elif value is False:
self.c.sent_start = -1
else:
raise ValueError(Errors.E044.format(value=value))
@is_sent_start.setter
def is_sent_start(self, value):
if self.doc.has_annotation("DEP"):
raise ValueError(Errors.E043)
if value is None:
self.c.sent_start = 0
elif value is True:
self.c.sent_start = 1
elif value is False:
self.c.sent_start = -1
else:
raise ValueError(Errors.E044.format(value=value))
property is_sent_end:
@property
def is_sent_end(self):
"""A boolean value indicating whether the token ends a sentence.
`None` if unknown. Defaults to `True` for the last token in the `Doc`.
@ -544,18 +552,18 @@ cdef class Token:
DOCS: https://spacy.io/api/token#is_sent_end
"""
def __get__(self):
if self.i + 1 == len(self.doc):
return True
elif self.doc[self.i+1].is_sent_start is None:
return None
elif self.doc[self.i+1].is_sent_start is True:
return True
else:
return False
if self.i + 1 == len(self.doc):
return True
elif self.doc[self.i+1].is_sent_start is None:
return None
elif self.doc[self.i+1].is_sent_start is True:
return True
else:
return False
def __set__(self, value):
raise ValueError(Errors.E196)
@is_sent_end.setter
def is_sent_end(self, value):
raise ValueError(Errors.E196)
@property
def lefts(self):
@ -682,41 +690,42 @@ cdef class Token:
"""
return not Token.missing_head(self.c)
property head:
@property
def head(self):
"""The syntactic parent, or "governor", of this token.
If token.has_head() is `False`, this method will return itself.
RETURNS (Token): The token predicted by the parser to be the head of
the current token.
"""
def __get__(self):
if not self.has_head():
return self
else:
return self.doc[self.i + self.c.head]
if not self.has_head():
return self
else:
return self.doc[self.i + self.c.head]
def __set__(self, Token new_head):
# This function sets the head of self to new_head and updates the
# counters for left/right dependents and left/right corner for the
# new and the old head
# Check that token is from the same document
if self.doc != new_head.doc:
raise ValueError(Errors.E191)
# Do nothing if old head is new head
if self.i + self.c.head == new_head.i:
return
# Find the widest l/r_edges of the roots of the two tokens involved
# to limit the number of tokens for set_children_from_heads
cdef Token self_root, new_head_root
self_root = ([self] + list(self.ancestors))[-1]
new_head_ancestors = list(new_head.ancestors)
new_head_root = new_head_ancestors[-1] if new_head_ancestors else new_head
start = self_root.c.l_edge if self_root.c.l_edge < new_head_root.c.l_edge else new_head_root.c.l_edge
end = self_root.c.r_edge if self_root.c.r_edge > new_head_root.c.r_edge else new_head_root.c.r_edge
# Set new head
self.c.head = new_head.i - self.i
# Adjust parse properties and sentence starts
set_children_from_heads(self.doc.c, start, end + 1)
@head.setter
def head(self, Token new_head):
# This function sets the head of self to new_head and updates the
# counters for left/right dependents and left/right corner for the
# new and the old head
# Check that token is from the same document
if self.doc != new_head.doc:
raise ValueError(Errors.E191)
# Do nothing if old head is new head
if self.i + self.c.head == new_head.i:
return
# Find the widest l/r_edges of the roots of the two tokens involved
# to limit the number of tokens for set_children_from_heads
cdef Token self_root, new_head_root
self_root = ([self] + list(self.ancestors))[-1]
new_head_ancestors = list(new_head.ancestors)
new_head_root = new_head_ancestors[-1] if new_head_ancestors else new_head
start = self_root.c.l_edge if self_root.c.l_edge < new_head_root.c.l_edge else new_head_root.c.l_edge
end = self_root.c.r_edge if self_root.c.r_edge > new_head_root.c.r_edge else new_head_root.c.r_edge
# Set new head
self.c.head = new_head.i - self.i
# Adjust parse properties and sentence starts
set_children_from_heads(self.doc.c, start, end + 1)
@property
def conjuncts(self):
@ -744,21 +753,23 @@ cdef class Token:
queue.append(child)
return tuple([w for w in output if w.i != self.i])
property ent_type:
@property
def ent_type(self):
"""RETURNS (uint64): Named entity type."""
def __get__(self):
return self.c.ent_type
return self.c.ent_type
def __set__(self, ent_type):
self.c.ent_type = ent_type
@ent_type.setter
def ent_type(self, ent_type):
self.c.ent_type = ent_type
property ent_type_:
@property
def ent_type_(self):
"""RETURNS (str): Named entity type."""
def __get__(self):
return self.vocab.strings[self.c.ent_type]
return self.vocab.strings[self.c.ent_type]
def __set__(self, ent_type):
self.c.ent_type = self.vocab.strings.add(ent_type)
@ent_type_.setter
def ent_type_(self, ent_type):
self.c.ent_type = self.vocab.strings.add(ent_type)
@property
def ent_iob(self):
@ -784,41 +795,45 @@ cdef class Token:
"""
return self.iob_strings()[self.c.ent_iob]
property ent_id:
@property
def ent_id(self):
"""RETURNS (uint64): ID of the entity the token is an instance of,
if any.
"""
def __get__(self):
return self.c.ent_id
return self.c.ent_id
def __set__(self, hash_t key):
self.c.ent_id = key
@ent_id.setter
def ent_id(self, hash_t key):
self.c.ent_id = key
property ent_id_:
@property
def ent_id_(self):
"""RETURNS (str): ID of the entity the token is an instance of,
if any.
"""
def __get__(self):
return self.vocab.strings[self.c.ent_id]
return self.vocab.strings[self.c.ent_id]
def __set__(self, name):
self.c.ent_id = self.vocab.strings.add(name)
@ent_id_.setter
def ent_id_(self, name):
self.c.ent_id = self.vocab.strings.add(name)
property ent_kb_id:
@property
def ent_kb_id(self):
"""RETURNS (uint64): Named entity KB ID."""
def __get__(self):
return self.c.ent_kb_id
return self.c.ent_kb_id
def __set__(self, attr_t ent_kb_id):
self.c.ent_kb_id = ent_kb_id
@ent_kb_id.setter
def ent_kb_id(self, attr_t ent_kb_id):
self.c.ent_kb_id = ent_kb_id
property ent_kb_id_:
@property
def ent_kb_id_(self):
"""RETURNS (str): Named entity KB ID."""
def __get__(self):
return self.vocab.strings[self.c.ent_kb_id]
return self.vocab.strings[self.c.ent_kb_id]
def __set__(self, ent_kb_id):
self.c.ent_kb_id = self.vocab.strings.add(ent_kb_id)
@ent_kb_id_.setter
def ent_kb_id_(self, ent_kb_id):
self.c.ent_kb_id = self.vocab.strings.add(ent_kb_id)
@property
def whitespace_(self):
@ -840,16 +855,17 @@ cdef class Token:
"""
return self.vocab.strings[self.c.lex.lower]
property norm_:
@property
def norm_(self):
"""RETURNS (str): The token's norm, i.e. a normalised form of the
token text. Usually set in the language's tokenizer exceptions or
norm exceptions.
"""
def __get__(self):
return self.vocab.strings[self.norm]
return self.vocab.strings[self.norm]
def __set__(self, str norm_):
self.c.norm = self.vocab.strings.add(norm_)
@norm_.setter
def norm_(self, str norm_):
self.c.norm = self.vocab.strings.add(norm_)
@property
def shape_(self):
@ -879,33 +895,36 @@ cdef class Token:
"""
return self.vocab.strings[self.c.lex.lang]
property lemma_:
@property
def lemma_(self):
"""RETURNS (str): The token lemma, i.e. the base form of the word,
with no inflectional suffixes.
"""
def __get__(self):
return self.vocab.strings[self.c.lemma]
return self.vocab.strings[self.c.lemma]
def __set__(self, str lemma_):
self.c.lemma = self.vocab.strings.add(lemma_)
@lemma_.setter
def lemma_(self, str lemma_):
self.c.lemma = self.vocab.strings.add(lemma_)
property pos_:
@property
def pos_(self):
"""RETURNS (str): Coarse-grained part-of-speech tag."""
def __get__(self):
return parts_of_speech.NAMES[self.c.pos]
return parts_of_speech.NAMES[self.c.pos]
def __set__(self, pos_name):
if pos_name not in parts_of_speech.IDS:
raise ValueError(Errors.E1021.format(pp=pos_name))
self.c.pos = parts_of_speech.IDS[pos_name]
@pos_.setter
def pos_(self, pos_name):
if pos_name not in parts_of_speech.IDS:
raise ValueError(Errors.E1021.format(pp=pos_name))
self.c.pos = parts_of_speech.IDS[pos_name]
property tag_:
@property
def tag_(self):
"""RETURNS (str): Fine-grained part-of-speech tag."""
def __get__(self):
return self.vocab.strings[self.c.tag]
return self.vocab.strings[self.c.tag]
def __set__(self, tag):
self.tag = self.vocab.strings.add(tag)
@tag_.setter
def tag_(self, tag):
self.tag = self.vocab.strings.add(tag)
def has_dep(self):
"""Check whether the token has annotated dep information.
@ -915,13 +934,14 @@ cdef class Token:
"""
return not Token.missing_dep(self.c)
property dep_:
@property
def dep_(self):
"""RETURNS (str): The syntactic dependency label."""
def __get__(self):
return self.vocab.strings[self.c.dep]
return self.vocab.strings[self.c.dep]
def __set__(self, str label):
self.c.dep = self.vocab.strings.add(label)
@dep_.setter
def dep_(self, str label):
self.c.dep = self.vocab.strings.add(label)
@property
def is_oov(self):
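Of the properties converted above, the `head` setter does the most work: it checks that both tokens belong to the same `Doc`, finds the affected edge range, and calls `set_children_from_heads` so the parse stays consistent. A sketch of what that enables, on a hand-built parse rather than a trained model; the sentence, heads and labels are invented for illustration.

```python
# Sketch: re-parent a token via the Token.head setter on a manually built parse.
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
words = ["She", "ate", "the", "cake"]
heads = [1, 1, 3, 1]                      # "ate" is the root
deps = ["nsubj", "ROOT", "det", "dobj"]
doc = Doc(nlp.vocab, words=words, heads=heads, deps=deps)

doc[2].head = doc[1]                      # move "the" from "cake" onto "ate"
assert doc[2].head.text == "ate"
```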

View File

@ -88,23 +88,25 @@ cdef class Example:
def __len__(self):
return len(self.predicted)
property predicted:
def __get__(self):
return self.x
@property
def predicted(self):
return self.x
def __set__(self, doc):
self.x = doc
self._cached_alignment = None
self._cached_words_x = [t.text for t in doc]
@predicted.setter
def predicted(self, doc):
self.x = doc
self._cached_alignment = None
self._cached_words_x = [t.text for t in doc]
property reference:
def __get__(self):
return self.y
@property
def reference(self):
return self.y
def __set__(self, doc):
self.y = doc
self._cached_alignment = None
self._cached_words_y = [t.text for t in doc]
@reference.setter
def reference(self, doc):
self.y = doc
self._cached_alignment = None
self._cached_words_y = [t.text for t in doc]
def copy(self):
return Example(
@ -420,9 +422,9 @@ cdef class Example:
seen_indices.update(indices)
return output
property text:
def __get__(self):
return self.x.text
@property
def text(self):
return self.x.text
def __str__(self):
return str(self.to_dict())
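The `predicted`, `reference` and `text` properties above are the read side of the alignment caching. For orientation, a tiny sketch of how they relate when an `Example` is built from a dict; the text and entity span are placeholders.

```python
# Sketch: the predicted doc holds the model-side tokens, the reference doc the
# gold annotation, and Example.text is the predicted doc's text.
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
doc = nlp.make_doc("Russ Cochran won the title.")
example = Example.from_dict(doc, {"entities": [(0, 12, "PERSON")]})

assert example.text == example.predicted.text == example.reference.text
assert [e.label_ for e in example.reference.ents] == ["PERSON"]
```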

View File

@ -41,7 +41,9 @@ cdef class Vocab:
cdef const TokenC* make_fused_token(self, substrings) except NULL
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
cdef PreshMap _by_orth
cdef Pool _non_temp_mem
cdef vector[attr_t] _transient_orths

View File

@ -1,6 +1,8 @@
from contextlib import contextmanager
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Union
from cymem.cymem import Pool
from thinc.types import Floats1d, FloatsXd
from . import Language
@ -67,6 +69,8 @@ class Vocab:
def from_bytes(
self, bytes_data: bytes, *, exclude: Iterable[str] = ...
) -> Vocab: ...
@contextmanager
def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]: ...
def pickle_vocab(vocab: Vocab) -> Any: ...
def unpickle_vocab(

View File

@ -1,8 +1,11 @@
import functools
from contextlib import ExitStack, contextmanager
from typing import Iterator, Optional
import numpy
import srsly
from thinc.api import get_array_module, get_current_ops
from preshed.maps cimport map_clear
from .attrs cimport LANG, ORTH
from .lexeme cimport EMPTY_LEXEME, OOV_RANK, Lexeme
@ -87,17 +90,24 @@ cdef class Vocab:
self.lookups = lookups
self.writing_system = writing_system
self.get_noun_chunks = get_noun_chunks
# During a memory_zone we replace our mem object with one
# that's passed to us. We keep a reference to our non-temporary
# memory here, in case we need to make an allocation we want to
# guarantee is not temporary. This is also how we check whether
# we're in a memory zone: we check whether self.mem is self._non_temp_mem
self._non_temp_mem = self.mem
property vectors:
def __get__(self):
return self._vectors
@property
def vectors(self):
return self._vectors
def __set__(self, vectors):
if hasattr(vectors, "strings"):
for s in vectors.strings:
self.strings.add(s)
self._vectors = vectors
self._vectors.strings = self.strings
@vectors.setter
def vectors(self, vectors):
if hasattr(vectors, "strings"):
for s in vectors.strings:
self.strings.add(s, allow_transient=False)
self._vectors = vectors
self._vectors.strings = self.strings
@property
def lang(self):
@ -106,6 +116,10 @@ cdef class Vocab:
langfunc = self.lex_attr_getters.get(LANG, None)
return langfunc("_") if langfunc else ""
@property
def in_memory_zone(self) -> bool:
return self.mem is not self._non_temp_mem
def __len__(self):
"""The current number of lexemes stored.
@ -113,6 +127,33 @@ cdef class Vocab:
"""
return self.length
@contextmanager
def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]:
"""Begin a block where resources allocated during the block will
be freed at the end of it. If a resource was created within the
memory zone block, accessing it outside the block is invalid.
Behaviour of this invalid access is undefined. Memory zones should
not be nested.
The memory zone is helpful for services that need to process large
volumes of text with a defined memory budget.
"""
if mem is None:
mem = Pool()
# The ExitStack allows programmatic nested context managers.
# We don't know how many we need, so it would be awkward to have
# them as nested blocks.
with ExitStack() as stack:
contexts = [stack.enter_context(self.strings.memory_zone(mem))]
if hasattr(self.morphology, "memory_zone"):
contexts.append(stack.enter_context(self.morphology.memory_zone(mem)))
if hasattr(self._vectors, "memory_zone"):
contexts.append(stack.enter_context(self._vectors.memory_zone(mem)))
self.mem = mem
yield mem
self._clear_transient_orths()
self.mem = self._non_temp_mem
def add_flag(self, flag_getter, int flag_id=-1):
"""Set a new boolean flag to words in the vocabulary.
@ -147,8 +188,7 @@ cdef class Vocab:
cdef const LexemeC* get(self, Pool mem, str string) except NULL:
"""Get a pointer to a `LexemeC` from the lexicon, creating a new
`Lexeme` if necessary using memory acquired from the given pool. If the
pool is the lexicon's own memory, the lexeme is saved in the lexicon.
`Lexeme` if necessary.
"""
if string == "":
return &EMPTY_LEXEME
@ -179,19 +219,11 @@ cdef class Vocab:
return self._new_lexeme(mem, self.strings[orth])
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
# I think this heuristic is bad, and the Vocab should always
# own the lexemes. It avoids weird bugs this way, as it's how the thing
# was originally supposed to work. The best solution to the growing
# memory use is to periodically reset the vocab, which is an action
# that should be up to the user to do (so we don't need to keep track
# of the doc ownership).
# TODO: Change the C API so that the mem isn't passed in here.
# The mem argument is deprecated, replaced by memory zones. Same with
# this size heuristic.
mem = self.mem
# if len(string) < 3 or self.length < 10000:
# mem = self.mem
cdef bint is_oov = mem is not self.mem
lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
lex.orth = self.strings.add(string)
lex.orth = self.strings.add(string, allow_transient=True)
lex.length = len(string)
if self.vectors is not None and hasattr(self.vectors, "key2row"):
lex.id = self.vectors.key2row.get(lex.orth, OOV_RANK)
@ -201,18 +233,25 @@ cdef class Vocab:
for attr, func in self.lex_attr_getters.items():
value = func(string)
if isinstance(value, str):
value = self.strings.add(value)
value = self.strings.add(value, allow_transient=True)
if value is not None:
Lexeme.set_struct_attr(lex, attr, value)
if not is_oov:
self._add_lex_to_vocab(lex.orth, lex)
self._add_lex_to_vocab(lex.orth, lex, self.mem is not self._non_temp_mem)
if lex == NULL:
raise ValueError(Errors.E085.format(string=string))
return lex
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1:
self._by_orth.set(lex.orth, <void*>lex)
self.length += 1
if is_transient and self.in_memory_zone:
self._transient_orths.push_back(lex.orth)
def _clear_transient_orths(self):
"""Remove transient lexemes from the index (generally at the end of the memory zone)"""
for orth in self._transient_orths:
map_clear(self._by_orth.c_map, orth)
self._transient_orths.clear()
def __contains__(self, key):
"""Check whether the string or int key has an entry in the vocabulary.
@ -264,7 +303,7 @@ cdef class Vocab:
"""
cdef attr_t orth
if isinstance(id_or_string, str):
orth = self.strings.add(id_or_string)
orth = self.strings.add(id_or_string, allow_transient=True)
else:
orth = id_or_string
return Lexeme(self, orth)
@ -416,7 +455,7 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab#get_vector
"""
if isinstance(orth, str):
orth = self.strings.add(orth)
orth = self.strings.add(orth, allow_transient=True)
cdef Lexeme lex = self[orth]
key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
if self.has_vector(key):
@ -435,7 +474,7 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab#set_vector
"""
if isinstance(orth, str):
orth = self.strings.add(orth)
orth = self.strings.add(orth, allow_transient=False)
cdef Lexeme lex = self[orth]
key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
if self.vectors.is_full and key not in self.vectors:
@ -459,22 +498,23 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab#has_vector
"""
if isinstance(orth, str):
orth = self.strings.add(orth)
orth = self.strings.add(orth, allow_transient=True)
cdef Lexeme lex = self[orth]
key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
return key in self.vectors
property lookups:
def __get__(self):
return self._lookups
@property
def lookups(self):
return self._lookups
def __set__(self, lookups):
self._lookups = lookups
if lookups.has_table("lexeme_norm"):
self.lex_attr_getters[NORM] = util.add_lookups(
self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]),
self.lookups.get_table("lexeme_norm"),
)
@lookups.setter
def lookups(self, lookups):
self._lookups = lookups
if lookups.has_table("lexeme_norm"):
self.lex_attr_getters[NORM] = util.add_lookups(
self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]),
self.lookups.get_table("lexeme_norm"),
)
def to_disk(self, path, *, exclude=tuple()):
"""Save the current state to a directory.

View File

@ -45,33 +45,33 @@ For attributes that represent string values, the internal integer ID is accessed
as `Token.attr`, e.g. `token.dep`, while the string value can be retrieved by
appending `_` as in `token.dep_`.
| Attribute | Description |
| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `DEP` | The token's dependency label. ~~str~~ |
| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ |
| `ENT_IOB` | The IOB part of the token's entity tag. Uses custom integer vaues rather than the string store: unset is `0`, `I` is `1`, `O` is `2`, and `B` is `3`. ~~str~~ |
| `ENT_KB_ID` | The token's entity knowledge base ID. ~~str~~ |
| `ENT_TYPE` | The token's entity label. ~~str~~ |
| `IS_ALPHA` | Token text consists of alphabetic characters. ~~bool~~ |
| `IS_ASCII` | Token text consists of ASCII characters. ~~bool~~ |
| `IS_DIGIT` | Token text consists of digits. ~~bool~~ |
| `IS_LOWER` | Token text is in lowercase. ~~bool~~ |
| `IS_PUNCT` | Token is punctuation. ~~bool~~ |
| `IS_SPACE` | Token is whitespace. ~~bool~~ |
| `IS_STOP` | Token is a stop word. ~~bool~~ |
| `IS_TITLE` | Token text is in titlecase. ~~bool~~ |
| `IS_UPPER` | Token text is in uppercase. ~~bool~~ |
| `LEMMA` | The token's lemma. ~~str~~ |
| `LENGTH` | The length of the token text. ~~int~~ |
| `LIKE_EMAIL` | Token text resembles an email address. ~~bool~~ |
| `LIKE_NUM` | Token text resembles a number. ~~bool~~ |
| `LIKE_URL` | Token text resembles a URL. ~~bool~~ |
| `LOWER` | The lowercase form of the token text. ~~str~~ |
| `MORPH` | The token's morphological analysis. ~~MorphAnalysis~~ |
| `NORM` | The normalized form of the token text. ~~str~~ |
| `ORTH` | The exact verbatim text of a token. ~~str~~ |
| `POS` | The token's universal part of speech (UPOS). ~~str~~ |
| `SENT_START` | Token is start of sentence. ~~bool~~ |
| `SHAPE` | The token's shape. ~~str~~ |
| `SPACY` | Token has a trailing space. ~~bool~~ |
| `TAG` | The token's fine-grained part of speech. ~~str~~ |
| Attribute | Description |
| ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `DEP` | The token's dependency label. ~~str~~ |
| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ |
| `ENT_IOB` | The IOB part of the token's entity tag. Uses custom integer values rather than the string store: unset is `0`, `I` is `1`, `O` is `2`, and `B` is `3`. ~~str~~ |
| `ENT_KB_ID` | The token's entity knowledge base ID. ~~str~~ |
| `ENT_TYPE` | The token's entity label. ~~str~~ |
| `IS_ALPHA` | Token text consists of alphabetic characters. ~~bool~~ |
| `IS_ASCII` | Token text consists of ASCII characters. ~~bool~~ |
| `IS_DIGIT` | Token text consists of digits. ~~bool~~ |
| `IS_LOWER` | Token text is in lowercase. ~~bool~~ |
| `IS_PUNCT` | Token is punctuation. ~~bool~~ |
| `IS_SPACE` | Token is whitespace. ~~bool~~ |
| `IS_STOP` | Token is a stop word. ~~bool~~ |
| `IS_TITLE` | Token text is in titlecase. ~~bool~~ |
| `IS_UPPER` | Token text is in uppercase. ~~bool~~ |
| `LEMMA` | The token's lemma. ~~str~~ |
| `LENGTH` | The length of the token text. ~~int~~ |
| `LIKE_EMAIL` | Token text resembles an email address. ~~bool~~ |
| `LIKE_NUM` | Token text resembles a number. ~~bool~~ |
| `LIKE_URL` | Token text resembles a URL. ~~bool~~ |
| `LOWER` | The lowercase form of the token text. ~~str~~ |
| `MORPH` | The token's morphological analysis. ~~MorphAnalysis~~ |
| `NORM` | The normalized form of the token text. ~~str~~ |
| `ORTH` | The exact verbatim text of a token. ~~str~~ |
| `POS` | The token's universal part of speech (UPOS). ~~str~~ |
| `SENT_START` | Token is start of sentence. ~~bool~~ |
| `SHAPE` | The token's shape. ~~str~~ |
| `SPACY` | Token has a trailing space. ~~bool~~ |
| `TAG` | The token's fine-grained part of speech. ~~str~~ |
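The reflowed table above pairs each attribute with its type; as the intro notes, the integer ID is exposed as `Token.attr` while the readable string is the underscore variant. A quick sketch of that convention on a blank pipeline; the sentence is made up.

```python
# Sketch: integer vs. string attribute access on a Token.
import spacy

nlp = spacy.blank("en")
doc = nlp("spaCy stores strings as hashes")
token = doc[0]

assert token.orth == nlp.vocab.strings["spaCy"]  # integer hash ID
assert token.orth_ == "spaCy"                    # readable string form
```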

View File

@ -567,7 +567,7 @@ New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL'
'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC'
(1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338)
✔ Good amount of examples for all labels
✔ Examples without occurences available for all labels
✔ Examples without occurrences available for all labels
✔ No entities consisting of or starting/ending with whitespace
=========================== Part-of-speech Tagging ===========================
@ -1320,7 +1320,7 @@ $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key]
## find-threshold {id="find-threshold",version="3.5",tag="command"}
Runs prediction trials for a trained model with varying tresholds to maximize
Runs prediction trials for a trained model with varying thresholds to maximize
the specified metric. The search space for the threshold is traversed linearly
from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout`
(the corresponding API call to `spacy.cli.find_threshold.find_threshold()`

View File

@ -61,13 +61,13 @@ architectures and their arguments and hyperparameters.
| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ |
| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ |
| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~ |
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
| `get_candidates_batch` <Tag variant="new">3.5</Tag> | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ |
| `generate_empty_kb` <Tag variant="new">3.5.1</Tag> | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ |
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
```python
%%GITHUB_SPACY/spacy/pipeline/entity_linker.py
@ -100,21 +100,21 @@ custom knowledge base, you should either call
[`set_kb`](/api/entitylinker#set_kb) or provide a `kb_loader` in the
[`initialize`](/api/entitylinker#initialize) call.
| Name | Description |
| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ |
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| _keyword-only_ | |
| `entity_vector_length` | Size of encoding vectors in the KB. ~~int~~ |
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
| `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~ |
| `n_sents` | The number of neighbouring sentences to take into account. ~~int~~ |
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. ~~bool~~ |
| `incl_context` | Whether or not to include the local context in the model. ~~bool~~ |
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
| Name | Description |
| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ |
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| _keyword-only_ | |
| `entity_vector_length` | Size of encoding vectors in the KB. ~~int~~ |
| `get_candidates` | Function that generates plausible candidates for a given `Span` object. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ |
| `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~ |
| `n_sents` | The number of neighbouring sentences to take into account. ~~int~~ |
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. ~~bool~~ |
| `incl_context` | Whether or not to include the local context in the model. ~~bool~~ |
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
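For orientation, a hedged sketch of overriding a few of the settings documented above when adding the component; the values are illustrative only, and in practice an entity recognizer plus a knowledge base (via `set_kb` or a `kb_loader` passed to `initialize`) are still required before the linker can do anything useful.

```python
# Sketch: pass entity_linker settings through the add_pipe config.
import spacy

nlp = spacy.blank("en")
entity_linker = nlp.add_pipe(
    "entity_linker",
    config={"use_gold_ents": False, "threshold": 0.3, "n_sents": 1},
)
print(nlp.config["components"]["entity_linker"])  # the stored component config
```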
## EntityLinker.\_\_call\_\_ {id="call",tag="method"}

View File

@ -58,7 +58,7 @@ how the component should be configured. You can override its settings via the
| Setting | Description |
| ---------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `phrase_matcher_attr` | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ |
| `matcher_fuzzy_compare` <Tag variant="new">3.5</Tag> | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~ |
| `matcher_fuzzy_compare` <Tag variant="new">3.5</Tag> | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~ |
| `validate` | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). Defaults to `False`. ~~bool~~ |
| `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ |
| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ |
@ -92,7 +92,7 @@ be a token pattern (list) or a phrase pattern (string). For example:
| `name` <Tag variant="new">3</Tag> | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. ~~str~~ |
| _keyword-only_ | |
| `phrase_matcher_attr` | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ |
| `matcher_fuzzy_compare` <Tag variant="new">3.5</Tag> | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~ |
| `matcher_fuzzy_compare` <Tag variant="new">3.5</Tag> | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~ |
| `validate` | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~ |
| `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ |
| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ |
@ -173,7 +173,7 @@ happens automatically after the component has been added to the pipeline using
[`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized
with `overwrite_ents=True`, existing entities will be replaced if they overlap
with the matches. When matches overlap in a Doc, the entity ruler prioritizes
longer patterns over shorter, and if equal the match occuring first in the Doc
longer patterns over shorter, and if equal the match occurring first in the Doc
is chosen.
> #### Example

View File

@ -147,9 +147,10 @@ Whether a feature/value pair is in the analysis.
> assert "Feat1=Val1" in morph
> ```
| Name | Description |
| ----------- | --------------------------------------------- |
| **RETURNS** | A feature/value pair in the analysis. ~~str~~ |
| Name | Description |
| ------------ | --------------------------------------------------------------------- |
| `feature` | A feature/value pair. ~~str~~ |
| **RETURNS** | Whether the feature/value pair is contained in the analysis. ~~bool~~ |
### MorphAnalysis.\_\_iter\_\_ {id="morphanalysis-iter",tag="method"}

View File

@ -288,7 +288,7 @@ it so no NP-level coordination, no prepositional phrases, and no relative
clauses.
If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data)
has not been implemeted for the given language, a `NotImplementedError` is
has not been implemented for the given language, a `NotImplementedError` is
raised.
> #### Example

View File

@ -416,7 +416,7 @@ by this class. Instances of this class are typically assigned to the
| `align` | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ |
| `width` | The width of the last hidden layer. ~~int~~ |
### TransformerData.empty {id="transformerdata-emoty",tag="classmethod"}
### TransformerData.empty {id="transformerdata-empty",tag="classmethod"}
Create an empty `TransformerData` container.

View File

@ -832,7 +832,7 @@ retrieve and add to them.
After creation, the component needs to be
[initialized](/usage/training#initialization). This method can define the
relevant labels in two ways: explicitely by setting the `labels` argument in the
relevant labels in two ways: explicitly by setting the `labels` argument in the
[`initialize` block](/api/data-formats#config-initialize) of the config, or
implicitly by deducing them from the `get_examples` callback that generates the
full **training data set**, or a representative sample.

View File

@ -1899,7 +1899,7 @@ the two words.
"Shore": ("coast", 0.732257),
"Precautionary": ("caution", 0.490973),
"hopelessness": ("sadness", 0.742366),
"Continous": ("continuous", 0.732549),
"Continuous": ("continuous", 0.732549),
"Disemboweled": ("corpse", 0.499432),
"biostatistician": ("scientist", 0.339724),
"somewheres": ("somewheres", 0.402736),

View File

@ -173,7 +173,7 @@ detected, a corresponding warning is displayed. If you'd like to disable the
dependency check, set `check_requirements: false` in your project's
`project.yml`.
### 4. Run a workflow {id="run-workfow"}
### 4. Run a workflow {id="run-workflow"}
> #### project.yml
>
@ -286,7 +286,7 @@ pipelines.
| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). |
| `description` | An optional project description used in [auto-generated docs](#custom-docs). |
| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts and overriden on the CLI, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts and overridden on the CLI, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
| `env` | A dictionary of variables, mapped to the names of environment variables that will be read in when running the project. For example, `${env.name}` will use the value of the environment variable defined as `name`. |
| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |

View File

@ -306,7 +306,9 @@ installed in the same environment that's it.
### Loading probability tables into existing models
You can load a probability table from [spacy-lookups-data](https://github.com/explosion/spacy-lookups-data) into an existing spaCy model like `en_core_web_sm`.
You can load a probability table from
[spacy-lookups-data](https://github.com/explosion/spacy-lookups-data) into an
existing spaCy model like `en_core_web_sm`.
```python
# Requirements: pip install spacy-lookups-data
@ -317,7 +319,8 @@ lookups = load_lookups("en", ["lexeme_prob"])
nlp.vocab.lookups.add_table("lexeme_prob", lookups.get_table("lexeme_prob"))
```
When training a model from scratch you can also specify probability tables in the `config.cfg`.
When training a model from scratch you can also specify probability tables in
the `config.cfg`.
```ini {title="config.cfg (excerpt)"}
[initialize.lookups]
@ -346,8 +349,8 @@ them**!
To stick with the theme of
[this entry points blog post](https://amir.rachum.com/blog/2017/07/28/python-entry-points/),
consider the following custom spaCy
[pipeline component](/usage/processing-pipelines#custom-coponents) that prints a
snake when it's called:
[pipeline component](/usage/processing-pipelines#custom-components) that prints
a snake when it's called:
> #### Package directory structure
>

View File

@ -185,7 +185,7 @@ New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL'
'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC'
(1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338)
✔ Good amount of examples for all labels
✔ Examples without occurences available for all labels
✔ Examples without occurrences available for all labels
✔ No entities consisting of or starting/ending with whitespace
=========================== Part-of-speech Tagging ===========================

View File

@ -138,7 +138,7 @@ backwards compatibility, the tuple format remains available under
`TransformerData.tensors` and `FullTransformerBatch.tensors`. See more details
in the [transformer API docs](/api/architectures#TransformerModel).
`spacy-transfomers` v1.1 also adds support for `transformer_config` settings
`spacy-transformers` v1.1 also adds support for `transformer_config` settings
such as `output_attentions`. Additional output is stored under
`TransformerData.model_output`. More details are in the
[TransformerModel docs](/api/architectures#TransformerModel). The training speed

View File

@ -31,6 +31,12 @@
"name": "Bengali",
"has_examples": true
},
{
"code": "bo",
"name": "Tibetan",
"example": "འདི་ཚིག་གྲུབ་རེད།",
"has_examples": true
},
{
"code": "ca",
"name": "Catalan",
@ -480,6 +486,12 @@
],
"example": "这是一个用于示例的句子。",
"has_examples": true
},
{
"code": "kmr",
"name": "Kurdish Kurmanji",
"example": "Ev hevokek e",
"has_examples": true
}
],
"licenses": [

File diff suppressed because it is too large

View File

@ -58,8 +58,8 @@ const AlertSpace = ({ nightly, legacy }) => {
}
const navAlert = (
<Link to="https://form.typeform.com/to/WlflqP1b" noLinkLayout>
💥 Interested in <strong>Premium spaCy Models</strong>?
<Link to="https://explosion.ai/blog/sp-global-commodities" noLinkLayout>
💥 <strong>New:</strong> Case study with S&P Global
</Link>
)