Merge branch 'master' into universe-project-quelquhui

commit f6ee0dfab1
Ines Montani, 2024-09-10 14:17:12 +02:00 (committed by GitHub)
41 changed files with 3455 additions and 74 deletions

.github/workflows/cibuildwheel.yml (new file)

@ -0,0 +1,92 @@
name: Build
on:
push:
tags:
# ytf did they invent their own syntax that's almost regex?
# ** matches 'zero or more of any character'
- 'release-v[0-9]+.[0-9]+.[0-9]+**'
- 'prerelease-v[0-9]+.[0-9]+.[0-9]+**'
jobs:
build_wheels:
name: Build wheels on ${{ matrix.os }}
runs-on: ${{ matrix.os }}
strategy:
matrix:
# macos-13 is an intel runner, macos-14 is apple silicon
os: [ubuntu-latest, windows-latest, macos-13]
steps:
- uses: actions/checkout@v4
- name: Build wheels
uses: pypa/cibuildwheel@v2.19.1
env:
CIBW_SOME_OPTION: value
with:
package-dir: .
output-dir: wheelhouse
config-file: "{package}/pyproject.toml"
- uses: actions/upload-artifact@v4
with:
name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }}
path: ./wheelhouse/*.whl
build_sdist:
name: Build source distribution
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Build sdist
run: pipx run build --sdist
- uses: actions/upload-artifact@v4
with:
name: cibw-sdist
path: dist/*.tar.gz
create_release:
needs: [build_wheels, build_sdist]
runs-on: ubuntu-latest
permissions:
contents: write
checks: write
actions: read
issues: read
packages: write
pull-requests: read
repository-projects: read
statuses: read
steps:
- name: Get the tag name and determine if it's a prerelease
id: get_tag_info
run: |
FULL_TAG=${GITHUB_REF#refs/tags/}
if [[ $FULL_TAG == release-* ]]; then
TAG_NAME=${FULL_TAG#release-}
IS_PRERELEASE=false
elif [[ $FULL_TAG == prerelease-* ]]; then
TAG_NAME=${FULL_TAG#prerelease-}
IS_PRERELEASE=true
else
echo "Tag does not match expected patterns" >&2
exit 1
fi
echo "FULL_TAG=$TAG_NAME" >> $GITHUB_ENV
echo "TAG_NAME=$TAG_NAME" >> $GITHUB_ENV
echo "IS_PRERELEASE=$IS_PRERELEASE" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
# unpacks all CIBW artifacts into dist/
pattern: cibw-*
path: dist
merge-multiple: true
- name: Create Draft Release
id: create_release
uses: softprops/action-gh-release@v2
if: startsWith(github.ref, 'refs/tags/')
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
name: ${{ env.TAG_NAME }}
draft: true
prerelease: ${{ env.IS_PRERELEASE }}
files: "./dist/*"

.github/workflows/publish_pypi.yml (new file)

@ -0,0 +1,29 @@
# The cibuildwheel action triggers on creation of a release; this one
# triggers on publication.
# The expected workflow is to create a draft release, let the wheels
# upload, and then hit 'publish', which uploads to PyPI.
on:
release:
types:
- published
jobs:
upload_pypi:
runs-on: ubuntu-latest
environment:
name: pypi
url: https://pypi.org/p/spacy
permissions:
id-token: write
contents: read
if: github.event_name == 'release' && github.event.action == 'published'
# or, alternatively, upload to PyPI on every tag starting with 'v' (remove on: release above to use this)
# if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
steps:
- uses: robinraju/release-downloader@v1
with:
tag: ${{ github.event.release.tag_name }}
fileName: '*'
out-file-path: 'dist'
- uses: pypa/gh-action-pypi-publish@release/v1


@ -11,5 +11,58 @@ requires = [
]
build-backend = "setuptools.build_meta"
[tool.cibuildwheel]
build = "*"
skip = "pp* cp36* cp37* cp38* *-win32"
test-skip = ""
free-threaded-support = false
archs = ["native"]
build-frontend = "default"
config-settings = {}
dependency-versions = "pinned"
environment = { PIP_CONSTRAINT = "build-constraints.txt" }
environment-pass = []
build-verbosity = 0
before-all = "curl https://sh.rustup.rs -sSf | sh -s -- -y --profile minimal --default-toolchain stable"
before-build = "pip install -r requirements.txt && python setup.py clean"
repair-wheel-command = ""
test-command = ""
before-test = ""
test-requires = []
test-extras = []
container-engine = "docker"
manylinux-x86_64-image = "manylinux2014"
manylinux-i686-image = "manylinux2014"
manylinux-aarch64-image = "manylinux2014"
manylinux-ppc64le-image = "manylinux2014"
manylinux-s390x-image = "manylinux2014"
manylinux-pypy_x86_64-image = "manylinux2014"
manylinux-pypy_i686-image = "manylinux2014"
manylinux-pypy_aarch64-image = "manylinux2014"
musllinux-x86_64-image = "musllinux_1_2"
musllinux-i686-image = "musllinux_1_2"
musllinux-aarch64-image = "musllinux_1_2"
musllinux-ppc64le-image = "musllinux_1_2"
musllinux-s390x-image = "musllinux_1_2"
[tool.cibuildwheel.linux]
repair-wheel-command = "auditwheel repair -w {dest_dir} {wheel}"
[tool.cibuildwheel.macos]
repair-wheel-command = "delocate-wheel --require-archs {delocate_archs} -w {dest_dir} -v {wheel}"
[tool.cibuildwheel.windows]
[tool.cibuildwheel.pyodide]
[tool.isort]
profile = "black"


@ -22,7 +22,6 @@ langcodes>=3.2.0,<4.0.0
# Official Python utilities
setuptools
packaging>=20.0
typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
# Development dependencies
pre-commit>=2.13.0
cython>=0.25,<3.0


@ -66,7 +66,6 @@ install_requires =
# Official Python utilities
setuptools
packaging>=20.0
typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
langcodes>=3.2.0,<4.0.0
[options.entry_points]


@ -1,5 +1,5 @@
# fmt: off
__title__ = "spacy"
__version__ = "3.7.5"
__version__ = "3.8.0.dev0"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

spacy/lang/bo/__init__.py (new file)

@ -0,0 +1,16 @@
from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
class TibetanDefaults(BaseDefaults):
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Tibetan(Language):
lang = "bo"
Defaults = TibetanDefaults
__all__ = ["Tibetan"]

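A quick way to exercise the new Tibetan language class is to build a blank pipeline from it. This is a minimal sketch, assuming a spaCy build that includes this commit (so the "bo" code is registered); the sample sentence is taken from examples.py below.

import spacy

nlp = spacy.blank("bo")  # constructs a bare pipeline from TibetanDefaults
doc = nlp("ཏཱ་ལའི་ཞེས་པ་ནི་སོག་སྐད་ཡིན་པ་དེ་བོད་སྐད་དུ་རྒྱ་མཚོའི་དོན་དུ་འཇུག")
print([token.text for token in doc])
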
spacy/lang/bo/examples.py (new file)

@ -0,0 +1,16 @@
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.bo.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"དོན་དུ་རྒྱ་མཚོ་བླ་མ་ཞེས་བྱ་ཞིང༌།",
"ཏཱ་ལའི་ཞེས་པ་ནི་སོག་སྐད་ཡིན་པ་དེ་བོད་སྐད་དུ་རྒྱ་མཚོའི་དོན་དུ་འཇུག",
"སོག་པོ་ཨལ་ཐན་རྒྱལ་པོས་རྒྱལ་དབང་བསོད་ནམས་རྒྱ་མཚོར་ཆེ་བསྟོད་ཀྱི་མཚན་གསོལ་བ་ཞིག་ཡིན་ཞིང༌།",
"རྗེས་སུ་རྒྱལ་བ་དགེ་འདུན་གྲུབ་དང༌། དགེ་འདུན་རྒྱ་མཚོ་སོ་སོར་ཡང་ཏཱ་ལའི་བླ་མའི་སྐུ་ཕྲེང་དང་པོ་དང༌།",
"གཉིས་པའི་མཚན་དེ་གསོལ་ཞིང༌།༸རྒྱལ་དབང་སྐུ་ཕྲེང་ལྔ་པས་དགའ་ལྡན་ཕོ་བྲང་གི་སྲིད་དབང་བཙུགས་པ་ནས་ཏཱ་ལའི་བླ་མ་ནི་བོད་ཀྱི་ཆོས་སྲིད་གཉིས་ཀྱི་དབུ་ཁྲིད་དུ་གྱུར་ཞིང་།",
"ད་ལྟའི་བར་ཏཱ་ལའི་བླ་མ་སྐུ་ཕྲེང་བཅུ་བཞི་བྱོན་ཡོད།",
]


@ -0,0 +1,65 @@
from ...attrs import LIKE_NUM
# reference 1: https://en.wikipedia.org/wiki/Tibetan_numerals
_num_words = [
"ཀླད་ཀོར་",
"གཅིག་",
"གཉིས་",
"གསུམ་",
"བཞི་",
"ལྔ་",
"དྲུག་",
"བདུན་",
"བརྒྱད་",
"དགུ་",
"བཅུ་",
"བཅུ་གཅིག་",
"བཅུ་གཉིས་",
"བཅུ་གསུམ་",
"བཅུ་བཞི་",
"བཅུ་ལྔ་",
"བཅུ་དྲུག་",
"བཅུ་བདུན་",
"བཅུ་པརྒྱད",
"བཅུ་དགུ་",
"ཉི་ཤུ་",
"སུམ་ཅུ",
"བཞི་བཅུ",
"ལྔ་བཅུ",
"དྲུག་ཅུ",
"བདུན་ཅུ",
"བརྒྱད་ཅུ",
"དགུ་བཅུ",
"བརྒྱ་",
"སྟོང་",
"ཁྲི་",
"ས་ཡ་",
" བྱེ་བ་",
"དུང་ཕྱུར་",
"ཐེར་འབུམ་",
"ཐེར་འབུམ་ཆེན་པོ་",
"ཁྲག་ཁྲིག་",
"ཁྲག་ཁྲིག་ཆེན་པོ་",
]
def like_num(text):
"""
Check if text resembles a number
"""
if text.startswith(("+", "-", "±", "~")):
text = text[1:]
text = text.replace(",", "").replace(".", "")
if text.isdigit():
return True
if text.count("/") == 1:
num, denom = text.split("/")
if num.isdigit() and denom.isdigit():
return True
if text in _num_words:
return True
return False
LEX_ATTRS = {LIKE_NUM: like_num}

spacy/lang/bo/stop_words.py (new file)

@ -0,0 +1,198 @@
# Source: https://zenodo.org/records/10148636
STOP_WORDS = set(
"""
གས
མས
འད
པས
གཞན
དང
གས
བཅས
ངས
ལས
ཙམ
ཡང
མཐའདག
འད
རང
ངམ
དག
འང
ལགས
ཚང
ཐམསཅད
དམ
འམ
བས
ལགས
གས
མས
བམ
ནམ
ནམ
ངམ
འགའ
ཤས
གམ
ལགས
ཅང
འགའ
སམ
འང
ལས
འཕ
བར
དང
འག
སམ
ཟད
འམ
མམ
དམ
དག
ལམ
ནང
ཙམ
རམ
ཨང
གས
ལགས
པས
རབ
རམ
བས
གཞན
འབའ
གམ
བམ
ཙམ
མམ
ཏམ
ཏམ
ཤས
""".split()
)

spacy/lang/gd/__init__.py (new file)

@ -0,0 +1,18 @@
from typing import Optional
from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
class ScottishDefaults(BaseDefaults):
tokenizer_exceptions = TOKENIZER_EXCEPTIONS
stop_words = STOP_WORDS
class Scottish(Language):
lang = "gd"
Defaults = ScottishDefaults
__all__ = ["Scottish"]

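The Scottish Gaelic entry can be exercised the same way; a minimal sketch (again assuming a build that includes this commit) that checks one of the stop words listed in stop_words.py below:

import spacy

nlp = spacy.blank("gd")                    # ScottishDefaults: stop words + tokenizer exceptions
print("agus" in nlp.Defaults.stop_words)   # True
print(nlp("agus")[0].is_stop)              # the flag is also set on tokens
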
spacy/lang/gd/stop_words.py (new file)

@ -0,0 +1,388 @@
STOP_WORDS = set(
"""
'ad
'ar
'd # iad
'g # ag
'ga
'gam
'gan
'gar
'gur
'm # am
'n # an
'n seo
'na
'nad
'nam
'nan
'nar
'nuair
'nur
's
'sa
'san
'sann
'se
'sna
a
a'
a'd # agad
a'm # agam
a-chèile
a-seo
a-sin
a-siud
a chionn
a chionn 's
a chèile
a chéile
a dh'
a h-uile
a seo
ac' # aca
aca
aca-san
acasan
ach
ag
agad
agad-sa
agads'
agadsa
agaibh
agaibhse
againn
againne
agam
agam-sa
agams'
agamsa
agus
aice
aice-se
aicese
aig
aig' # aige
aige
aige-san
aigesan
air
air-san
air neo
airsan
am
an
an seo
an sin
an siud
an uair
ann
ann a
ann a'
ann a shin
ann am
ann an
annad
annam
annam-s'
annamsa
anns
anns an
annta
aon
ar
as
asad
asda
asta
b'
bho
bhon
bhuaidhe # bhuaithe
bhuainn
bhuaipe
bhuaithe
bhuapa
bhur
brì
bu
c'à
car son
carson
cha
chan
chionn
choir
chon
chun
chèile
chéile
chòir
cia mheud
ciamar
co-dhiubh
cuide
cuin
cuin'
cuine
'
càil
càit
càit'
càite
mheud
d'
da
de
dh'
dha
dhaibh
dhaibh-san
dhaibhsan
dhan
dhasan
dhe
dhen
dheth
dhi
dhiom
dhiot
dhith
dhiubh
dhomh
dhomh-s'
dhomhsa
dhu'sa # dhut-sa
dhuibh
dhuibhse
dhuinn
dhuinne
dhuit
dhut
dhutsa
dhut-sa
dhà
dhà-san
dhàsan
dhòmhsa
diubh
do
docha
don
mar
mar
dòch'
dòcha
e
eadar
eatarra
eatorra
eile
esan
fa
far
feud
fhad
fheudar
fhearr
fhein
fheudar
fheàrr
fhèin
fhéin
fhìn
fo
fodha
fodhainn
foipe
fon
fèin
ga
gach
gam
gan
ge brith
ged
gu
gu
gu ruige
gun
gur
gus
i
iad
iadsan
innte
is
ise
le
leam
leam-sa
leamsa
leat
leat-sa
leatha
leatsa
leibh
leis
leis-san
leoth'
leotha
leotha-san
linn
m'
m'a
ma
mac
man
mar
mas
mathaid
mi
mis'
mise
mo
mu
mu 'n
mun
mur
mura
mus
na
na b'
na bu
na iad
nach
nad
nam
nan
nar
nas
neo
no
nuair
o
o'n
oir
oirbh
oirbh-se
oirnn
oirnne
oirre
on
orm
orm-sa
ormsa
orra
orra-san
orrasan
ort
os
r'
ri
ribh
rinn
ris
rithe
rithe-se
rium
rium-sa
riums'
riumsa
riut
riuth'
riutha
riuthasan
ro
ro'n
roimh
roimhe
romhainn
romham
romhpa
ron
ruibh
ruinn
ruinne
sa
san
sann
se
seach
seo
seothach
shin
sibh
sibh-se
sibhse
sin
sineach
sinn
sinne
siod
siodach
siud
siudach
sna # ann an
t'
tarsaing
tarsainn
tarsuinn
thar
thoigh
thro
thu
thuc'
thuca
thugad
thugaibh
thugainn
thugam
thugamsa
thuice
thuige
thus'
thusa
timcheall
toigh
toil
tro
tro' # troimh
troimh
troimhe
tron
tu
tusa
uair
ud
ugaibh
ugam-s'
ugam-sa
uice
uige
uige-san
umad
unnta # ann an
ur
urrainn
à
às
àsan
á
ás
è
ì
ò
ó
""".split(
"\n"
)
)

File diff suppressed because it is too large.


@ -0,0 +1,16 @@
from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
class KurmanjiDefaults(BaseDefaults):
stop_words = STOP_WORDS
lex_attr_getters = LEX_ATTRS
class Kurmanji(Language):
lang = "kmr"
Defaults = KurmanjiDefaults
__all__ = ["Kurmanji"]


@ -0,0 +1,17 @@
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.kmr.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Berê mirovan her tim li geşedana pêşerojê ye", # People's gaze is always on the development of the future
"Kawa Nemir di 14 salan de Ulysses wergerand Kurmancî.", # Kawa Nemir translated Ulysses into Kurmanji in 14 years.
"Mem Ararat hunermendekî Kurd yê bi nav û deng e.", # Mem Ararat is a famous Kurdish artist
"Firat Cewerî 40 sal e pirtûkên Kurdî dinivîsîne.", # Firat Ceweri has been writing Kurdish books for 40 years
"Rojnamegerê ciwan nûçeyeke balkêş li ser rewşa aborî nivîsand", # The young journalist wrote an interesting news article about the economic situation
"Sektora çandiniyê beşeke giring a belavkirina gaza serayê li seranserê cîhanê pêk tîne", # The agricultural sector constitutes an important part of greenhouse gas emissions worldwide
"Xwendekarên jêhatî di pêşbaziya matematîkê de serkeftî bûn", # Talented students succeeded in the mathematics competition
"Ji ber ji tunebûnê bavê min xwişkeke min nedan xwendin ew ji min re bû derd û kulek.", # Because of poverty, my father didn't send my sister to school, which became a pain and sorrow for me
]

spacy/lang/kmr/lex_attrs.py (new file)

@ -0,0 +1,138 @@
from ...attrs import LIKE_NUM
_num_words = [
"sifir",
"yek",
"du",
"",
"çar",
"pênc",
"şeş",
"heft",
"heşt",
"neh",
"deh",
"yazde",
"dazde",
"sêzde",
"çarde",
"pazde",
"şazde",
"hevde",
"hejde",
"nozde",
"bîst",
"",
"çil",
"pêncî",
"şêst",
"heftê",
"heştê",
"nod",
"sed",
"hezar",
"milyon",
"milyar",
]
_ordinal_words = [
"yekem",
"yekemîn",
"duyem",
"duyemîn",
"sêyem",
"sêyemîn",
"çarem",
"çaremîn",
"pêncem",
"pêncemîn",
"şeşem",
"şeşemîn",
"heftem",
"heftemîn",
"heştem",
"heştemîn",
"nehem",
"nehemîn",
"dehem",
"dehemîn",
"yazdehem",
"yazdehemîn",
"dazdehem",
"dazdehemîn",
"sêzdehem",
"sêzdehemîn",
"çardehem",
"çardehemîn",
"pazdehem",
"pazdehemîn",
"şanzdehem",
"şanzdehemîn",
"hevdehem",
"hevdehemîn",
"hejdehem",
"hejdehemîn",
"nozdehem",
"nozdehemîn",
"bîstem",
"bîstemîn",
"sîyem",
"sîyemîn",
"çilem",
"çilemîn",
"pêncîyem",
"pênciyemîn",
"şêstem",
"şêstemîn",
"heftêyem",
"heftêyemîn",
"heştêyem",
"heştêyemîn",
"notem",
"notemîn",
"sedem",
"sedemîn",
"hezarem",
"hezaremîn",
"milyonem",
"milyonemîn",
"milyarem",
"milyaremîn",
]
def like_num(text):
if text.startswith(("+", "-", "±", "~")):
text = text[1:]
text = text.replace(",", "").replace(".", "")
if text.isdigit():
return True
if text.count("/") == 1:
num, denom = text.split("/")
if num.isdigit() and denom.isdigit():
return True
text_lower = text.lower()
if text_lower in _num_words:
return True
# Check ordinal number
if text_lower in _ordinal_words:
return True
if is_digit(text_lower):
return True
return False
def is_digit(text):
endings = ("em", "yem", "emîn", "yemîn")
for ending in endings:
to = len(ending)
if text.endswith(ending) and text[:-to].isdigit():
return True
return False
LEX_ATTRS = {LIKE_NUM: like_num}

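For reference, a short sketch of how like_num treats the Kurmanji ordinals above, including the digit-plus-suffix forms handled by is_digit (the import path matches the test file further down in this diff):

from spacy.lang.kmr.lex_attrs import like_num

print(like_num("sêzdehem"))  # True: listed in _ordinal_words
print(like_num("100em"))     # True: digits followed by an ordinal ending
print(like_num("100xyz"))    # False: unknown suffix
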

@ -0,0 +1,44 @@
STOP_WORDS = set(
"""
û
li
bi
di
da
de
ji
ku
ew
ez
tu
em
hûn
ew
ev
min
te
me
we
wan
va
çi
çawa
çima
kengî
li ku
çend
çiqas
her
hin
gelek
hemû
kes
tişt
""".split()
)


@ -24,13 +24,6 @@ class MacedonianDefaults(BaseDefaults):
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
@classmethod
def create_lemmatizer(cls, nlp=None, lookups=None):
if lookups is None:
lookups = Lookups()
return MacedonianLemmatizer(lookups)
class Macedonian(Language):
lang = "mk"
Defaults = MacedonianDefaults


@ -5,7 +5,7 @@ import multiprocessing as mp
import random
import traceback
import warnings
from contextlib import contextmanager
from contextlib import ExitStack, contextmanager
from copy import deepcopy
from dataclasses import dataclass
from itertools import chain, cycle
@ -31,6 +31,7 @@ from typing import (
)
import srsly
from cymem.cymem import Pool
from thinc.api import Config, CupyOps, Optimizer, get_current_ops
from . import about, ty, util
@ -2091,6 +2092,38 @@ class Language:
util.replace_model_node(pipe.model, listener, new_model) # type: ignore[attr-defined]
tok2vec.remove_listener(listener, pipe_name)
@contextmanager
def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]:
"""Begin a block where all resources allocated during the block will
be freed at the end of it. If a resource was created within the
memory zone block, accessing it outside the block is invalid.
Behaviour of this invalid access is undefined. Memory zones should
not be nested.
The memory zone is helpful for services that need to process large
volumes of text with a defined memory budget.
Example
-------
>>> with nlp.memory_zone():
... for doc in nlp.pipe(texts):
... process_my_doc(doc)
>>> # use_doc(doc) <-- Invalid: doc was allocated in the memory zone
"""
if mem is None:
mem = Pool()
# The ExitStack allows programmatic nested context managers.
# We don't know how many we need, so it would be awkward to have
# them as nested blocks.
with ExitStack() as stack:
contexts = [stack.enter_context(self.vocab.memory_zone(mem))]
if hasattr(self.tokenizer, "memory_zone"):
contexts.append(stack.enter_context(self.tokenizer.memory_zone(mem)))
for _, pipe in self.pipeline:
if hasattr(pipe, "memory_zone"):
contexts.append(stack.enter_context(pipe.memory_zone(mem)))
yield mem
def to_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
) -> None:

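The docstring's example extends naturally to the streaming use case memory zones are aimed at: allocate per batch, and let the zone free transient strings and lexemes when it exits. A minimal sketch; the blank "en" pipeline and the batches are illustrative placeholders, not part of this diff:

import spacy

nlp = spacy.blank("en")
batches = [["First text.", "Second text."], ["Another batch of text."]]
for batch in batches:
    with nlp.memory_zone():
        for doc in nlp.pipe(batch):
            _ = len(doc)  # use the Doc inside the zone only
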

@ -203,7 +203,7 @@ cdef class ArcEagerGold:
def __init__(self, ArcEager moves, StateClass stcls, Example example):
self.mem = Pool()
heads, labels = example.get_aligned_parse(projectivize=True)
labels = [example.x.vocab.strings.add(label) if label is not None else MISSING_DEP for label in labels]
labels = [example.x.vocab.strings.add(label, allow_transient=False) if label is not None else MISSING_DEP for label in labels]
sent_starts = _get_aligned_sent_starts(example)
assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts))
self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts)


@ -183,7 +183,7 @@ cpdef deprojectivize(Doc doc):
new_label, head_label = label.split(DELIMITER)
new_head = _find_new_head(doc[i], head_label)
doc.c[i].head = new_head.i - i
doc.c[i].dep = doc.vocab.strings.add(new_label)
doc.c[i].dep = doc.vocab.strings.add(new_label, allow_transient=False)
set_children_from_heads(doc.c, 0, doc.length)
return doc


@ -25,5 +25,7 @@ cdef class StringStore:
cdef vector[hash_t] keys
cdef public PreshMap _map
cdef const Utf8Str* intern_unicode(self, str py_string)
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)
cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient)
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient)
cdef vector[hash_t] _transient_keys
cdef Pool _non_temp_mem


@ -1,9 +1,14 @@
# cython: infer_types=True
# cython: profile=False
cimport cython
from contextlib import contextmanager
from typing import Iterator, List, Optional
from libc.stdint cimport uint32_t
from libc.string cimport memcpy
from murmurhash.mrmr cimport hash32, hash64
from preshed.maps cimport map_clear
import srsly
@ -31,7 +36,7 @@ def get_string_id(key):
This function optimises for convenience over performance, so shouldn't be
used in tight loops.
"""
cdef hash_t str_hash
cdef hash_t str_hash
if isinstance(key, str):
if len(key) == 0:
return 0
@ -45,8 +50,8 @@ def get_string_id(key):
elif _try_coerce_to_hash(key, &str_hash):
# Coerce the integral key to the expected primitive hash type.
# This ensures that custom/overloaded "primitive" data types
# such as those implemented by numpy are not inadvertently used
# downsteam (as these are internally implemented as custom PyObjects
# such as those implemented by numpy are not inadvertently used
# downsteam (as these are internally implemented as custom PyObjects
# whose comparison operators can incur a significant overhead).
return str_hash
else:
@ -119,10 +124,11 @@ cdef class StringStore:
strings (iterable): A sequence of unicode strings to add to the store.
"""
self.mem = Pool()
self._non_temp_mem = self.mem
self._map = PreshMap()
if strings is not None:
for string in strings:
self.add(string)
self.add(string, allow_transient=False)
def __getitem__(self, object string_or_id):
"""Retrieve a string from a given hash, or vice versa.
@ -152,14 +158,17 @@ cdef class StringStore:
return SYMBOLS_BY_INT[str_hash]
else:
utf8str = <Utf8Str*>self._map.get(str_hash)
if utf8str is NULL:
raise KeyError(Errors.E018.format(hash_value=string_or_id))
else:
return decode_Utf8Str(utf8str)
else:
# TODO: Raise an error instead
utf8str = <Utf8Str*>self._map.get(string_or_id)
if utf8str is NULL:
raise KeyError(Errors.E018.format(hash_value=string_or_id))
else:
return decode_Utf8Str(utf8str)
if utf8str is NULL:
raise KeyError(Errors.E018.format(hash_value=string_or_id))
else:
return decode_Utf8Str(utf8str)
def as_int(self, key):
"""If key is an int, return it; otherwise, get the int value."""
@ -175,12 +184,46 @@ cdef class StringStore:
else:
return self[key]
def add(self, string):
def __len__(self) -> int:
"""The number of strings in the store.
RETURNS (int): The number of strings in the store.
"""
return self.keys.size() + self._transient_keys.size()
@contextmanager
def memory_zone(self, mem: Optional[Pool] = None) -> Pool:
"""Begin a block where all resources allocated during the block will
be freed at the end of it. If a resource was created within the
memory zone block, accessing it outside the block is invalid.
Behaviour of this invalid access is undefined. Memory zones should
not be nested.
The memory zone is helpful for services that need to process large
volumes of text with a defined memory budget.
"""
if mem is None:
mem = Pool()
self.mem = mem
yield mem
for key in self._transient_keys:
map_clear(self._map.c_map, key)
self._transient_keys.clear()
self.mem = self._non_temp_mem
def add(self, string: str, allow_transient: Optional[bool] = None) -> int:
"""Add a string to the StringStore.
string (str): The string to add.
allow_transient (bool): Allow the string to be stored in the 'transient'
map, which will be flushed at the end of the memory zone. Strings
encountered during arbitrary text processing should be added
with allow_transient=True, while labels and other strings used
internally should not.
RETURNS (uint64): The string's hash value.
"""
if allow_transient is None:
allow_transient = self.mem is not self._non_temp_mem
cdef hash_t str_hash
if isinstance(string, str):
if string in SYMBOLS_BY_STR:
@ -188,22 +231,26 @@ cdef class StringStore:
string = string.encode("utf8")
str_hash = hash_utf8(string, len(string))
self._intern_utf8(string, len(string), &str_hash)
self._intern_utf8(string, len(string), &str_hash, allow_transient)
elif isinstance(string, bytes):
if string in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string]
str_hash = hash_utf8(string, len(string))
self._intern_utf8(string, len(string), &str_hash)
self._intern_utf8(string, len(string), &str_hash, allow_transient)
else:
raise TypeError(Errors.E017.format(value_type=type(string)))
return str_hash
def __len__(self):
"""The number of strings in the store.
if string in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string]
else:
return self._intern_str(string, allow_transient)
RETURNS (int): The number of strings in the store.
"""
return self.keys.size()
return self.keys.size() + self._transient_keys.size()
def __contains__(self, string_or_id not None):
"""Check whether a string or ID is in the store.
@ -222,12 +269,17 @@ cdef class StringStore:
pass
else:
# TODO: Raise an error instead
return self._map.get(string_or_id) is not NULL
if self._map.get(string_or_id) is not NULL:
return True
else:
return False
if str_hash < len(SYMBOLS_BY_INT):
return True
else:
return self._map.get(str_hash) is not NULL
if self._map.get(str_hash) is not NULL:
return True
else:
return False
def __iter__(self):
"""Iterate over the strings in the store, in order.
@ -240,12 +292,29 @@ cdef class StringStore:
key = self.keys[i]
utf8str = <Utf8Str*>self._map.get(key)
yield decode_Utf8Str(utf8str)
# TODO: Iterate OOV here?
for i in range(self._transient_keys.size()):
key = self._transient_keys[i]
utf8str = <Utf8Str*>self._map.get(key)
yield decode_Utf8Str(utf8str)
def __reduce__(self):
strings = list(self)
return (StringStore, (strings,), None, None, None)
def values(self) -> List[int]:
"""Iterate over the stored strings hashes in insertion order.
RETURNS: A list of string hashes.
"""
cdef int i
hashes = [None] * self.keys.size()
for i in range(self.keys.size()):
hashes[i] = self.keys[i]
transient_hashes = [None] * self._transient_keys.size()
for i in range(self._transient_keys.size()):
transient_hashes[i] = self._transient_keys[i]
return hashes + transient_hashes
def to_disk(self, path):
"""Save the current state to a directory.
@ -269,7 +338,7 @@ cdef class StringStore:
prev = list(self)
self._reset_and_load(strings)
for word in prev:
self.add(word)
self.add(word, allow_transient=False)
return self
def to_bytes(self, **kwargs):
@ -289,23 +358,25 @@ cdef class StringStore:
prev = list(self)
self._reset_and_load(strings)
for word in prev:
self.add(word)
self.add(word, allow_transient=False)
return self
def _reset_and_load(self, strings):
self.mem = Pool()
self._non_temp_mem = self.mem
self._map = PreshMap()
self.keys.clear()
self._transient_keys.clear()
for string in strings:
self.add(string)
self.add(string, allow_transient=False)
cdef const Utf8Str* intern_unicode(self, str py_string):
cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient):
# 0 means missing, but we don't bother offsetting the index.
cdef bytes byte_string = py_string.encode("utf8")
return self._intern_utf8(byte_string, len(byte_string), NULL)
return self._intern_utf8(byte_string, len(byte_string), NULL, allow_transient)
@cython.final
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash):
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient):
# TODO: This function's API/behaviour is an unholy mess...
# 0 means missing, but we don't bother offsetting the index.
cdef hash_t key = precalculated_hash[0] if precalculated_hash is not NULL else hash_utf8(utf8_string, length)
@ -314,5 +385,8 @@ cdef class StringStore:
return value
value = _allocate(self.mem, <unsigned char*>utf8_string, length)
self._map.set(key, value)
self.keys.push_back(key)
if allow_transient and self.mem is not self._non_temp_mem:
self._transient_keys.push_back(key)
else:
self.keys.push_back(key)
return value

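Taken together, add(..., allow_transient=...) and memory_zone() give the StringStore a two-tier lifetime. A minimal sketch of the intended behaviour, assuming the API lands as shown in this diff; the strings themselves are arbitrary examples:

from spacy.strings import StringStore

store = StringStore()
store.add("persistent-label", allow_transient=False)  # survives memory zones
with store.memory_zone():
    store.add("one-off token", allow_transient=True)
    assert "one-off token" in store                   # visible inside the zone
assert "persistent-label" in store                    # still interned
assert "one-off token" not in store                   # flushed when the zone closed
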

@ -81,6 +81,11 @@ def bn_tokenizer():
return get_lang_class("bn")().tokenizer
@pytest.fixture(scope="session")
def bo_tokenizer():
return get_lang_class("bo")().tokenizer
@pytest.fixture(scope="session")
def ca_tokenizer():
return get_lang_class("ca")().tokenizer

@ -0,0 +1,21 @@
import pytest
@pytest.mark.parametrize(
"text,match",
[
("10", True),
("1", True),
("999.0", True),
("གཅིག་", True),
("གཉིས་", True),
("ཀླད་ཀོར་", True),
("བཅུ་གཅིག་", True),
("ཁྱི་", False),
(",", False),
],
)
def test_lex_attrs_like_number(bo_tokenizer, text, match):
tokens = bo_tokenizer(text)
assert len(tokens) == 1
assert tokens[0].like_num == match


@ -0,0 +1,27 @@
import pytest
from spacy.lang.kmr.lex_attrs import like_num
@pytest.mark.parametrize(
"word",
[
"yekem",
"duyemîn",
"100em",
"dehem",
"sedemîn",
"34em",
"30yem",
"20emîn",
"50yemîn",
],
)
def test_kmr_lex_attrs_like_number_for_ordinal(word):
assert like_num(word)
@pytest.mark.parametrize("word", ["deh"])
def test_kmr_lex_attrs_capitals(word):
assert like_num(word)
assert like_num(word.upper())


@ -10,7 +10,7 @@ LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el",
"hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv",
"mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
"si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn",
"tr", "tt", "uk", "ur", "xx", "yo"]
"tr", "tt", "uk", "ur", "xx", "yo", "kmr"]
# fmt: on


@ -18,6 +18,7 @@ LANGUAGES = [
pytest.param("ar", marks=pytest.mark.slow()),
pytest.param("bg", marks=pytest.mark.slow()),
"bn",
pytest.param("bo", marks=pytest.mark.slow()),
pytest.param("ca", marks=pytest.mark.slow()),
pytest.param("cs", marks=pytest.mark.slow()),
pytest.param("da", marks=pytest.mark.slow()),
@ -57,6 +58,7 @@ LANGUAGES = [
pytest.param("tr", marks=pytest.mark.slow()),
pytest.param("tt", marks=pytest.mark.slow()),
pytest.param("ur", marks=pytest.mark.slow()),
pytest.param("kmr", marks=pytest.mark.slow()),
]


@ -0,0 +1,36 @@
from spacy.vocab import Vocab
def test_memory_zone_no_insertion():
vocab = Vocab()
with vocab.memory_zone():
pass
lex = vocab["horse"]
assert lex.text == "horse"
def test_memory_zone_insertion():
vocab = Vocab()
_ = vocab["dog"]
assert "dog" in vocab
assert "horse" not in vocab
with vocab.memory_zone():
lex = vocab["horse"]
assert lex.text == "horse"
assert "dog" in vocab
assert "horse" not in vocab
def test_memory_zone_redundant_insertion():
"""Test that if we insert an already-existing word while
in the memory zone, it stays persistent"""
vocab = Vocab()
_ = vocab["dog"]
assert "dog" in vocab
assert "horse" not in vocab
with vocab.memory_zone():
lex = vocab["horse"]
assert lex.text == "horse"
_ = vocab["dog"]
assert "dog" in vocab
assert "horse" not in vocab


@ -25,9 +25,7 @@ cdef class Tokenizer:
cdef PhraseMatcher _special_matcher
# TODO convert to bool in v4
cdef int _faster_heuristics
# TODO next one is unused and should be removed in v4
# https://github.com/explosion/spaCy/pull/9150
cdef int _unused_int2
cdef public int max_cache_size
cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
cdef int _apply_special_cases(self, Doc doc) except -1


@ -30,7 +30,7 @@ cdef class Tokenizer:
"""
def __init__(self, Vocab vocab, rules=None, prefix_search=None,
suffix_search=None, infix_finditer=None, token_match=None,
url_match=None, faster_heuristics=True):
url_match=None, faster_heuristics=True, max_cache_size=10000):
"""Create a `Tokenizer`, to create `Doc` objects given unicode text.
vocab (Vocab): A storage container for lexical types.
@ -50,6 +50,7 @@ cdef class Tokenizer:
faster_heuristics (bool): Whether to restrict the final
Matcher-based pass for rules to those containing affixes or space.
Defaults to True.
max_cache_size (int): Maximum number of tokenization chunks to cache.
EXAMPLE:
>>> tokenizer = Tokenizer(nlp.vocab)
@ -69,6 +70,7 @@ cdef class Tokenizer:
self._rules = {}
self._special_matcher = PhraseMatcher(self.vocab)
self._load_special_cases(rules)
self.max_cache_size = max_cache_size
@property
def token_match(self):
@ -397,8 +399,9 @@ cdef class Tokenizer:
has_special, with_special_cases)
self._attach_tokens(tokens, span, &prefixes, &suffixes, has_special,
with_special_cases)
self._save_cached(&tokens.c[orig_size], orig_key, has_special,
tokens.length - orig_size)
if len(self._cache) < self.max_cache_size:
self._save_cached(&tokens.c[orig_size], orig_key, has_special,
tokens.length - orig_size)
cdef str _split_affixes(
self,
@ -514,9 +517,8 @@ cdef class Tokenizer:
if n <= 0:
# avoid mem alloc of zero length
return 0
for i in range(n):
if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
return 0
if self.vocab.in_memory_zone:
return 0
# See #1250
if has_special[0]:
return 0

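The new max_cache_size argument can be passed when constructing a tokenizer directly. A minimal sketch; the value 5000 and the blank "en" vocab are illustrative choices, not from this diff, and since no affix rules are passed this tokenizer splits on whitespace only:

import spacy
from spacy.tokenizer import Tokenizer

nlp = spacy.blank("en")
tokenizer = Tokenizer(nlp.vocab, max_cache_size=5000)  # cap on cached tokenization chunks
doc = tokenizer("A short sentence to tokenize.")
print([t.text for t in doc])
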

@ -41,7 +41,9 @@ cdef class Vocab:
cdef const TokenC* make_fused_token(self, substrings) except NULL
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
cdef PreshMap _by_orth
cdef Pool _non_temp_mem
cdef vector[attr_t] _transient_orths


@ -1,6 +1,8 @@
from contextlib import contextmanager
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Union
from cymem.cymem import Pool
from thinc.types import Floats1d, FloatsXd
from . import Language
@ -67,6 +69,8 @@ class Vocab:
def from_bytes(
self, bytes_data: bytes, *, exclude: Iterable[str] = ...
) -> Vocab: ...
@contextmanager
def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]: ...
def pickle_vocab(vocab: Vocab) -> Any: ...
def unpickle_vocab(


@ -1,8 +1,11 @@
import functools
from contextlib import ExitStack, contextmanager
from typing import Iterator, Optional
import numpy
import srsly
from thinc.api import get_array_module, get_current_ops
from preshed.maps cimport map_clear
from .attrs cimport LANG, ORTH
from .lexeme cimport EMPTY_LEXEME, OOV_RANK, Lexeme
@ -87,6 +90,12 @@ cdef class Vocab:
self.lookups = lookups
self.writing_system = writing_system
self.get_noun_chunks = get_noun_chunks
# During a memory_zone we replace our mem object with one
# that's passed to us. We keep a reference to our non-temporary
# memory here, in case we need to make an allocation we want to
# guarantee is not temporary. This is also how we check whether
# we're in a memory zone: we check whether self.mem is self._non_temp_mem
self._non_temp_mem = self.mem
@property
def vectors(self):
@ -96,7 +105,7 @@ cdef class Vocab:
def vectors(self, vectors):
if hasattr(vectors, "strings"):
for s in vectors.strings:
self.strings.add(s)
self.strings.add(s, allow_transient=False)
self._vectors = vectors
self._vectors.strings = self.strings
@ -107,6 +116,10 @@ cdef class Vocab:
langfunc = self.lex_attr_getters.get(LANG, None)
return langfunc("_") if langfunc else ""
@property
def in_memory_zone(self) -> bool:
return self.mem is not self._non_temp_mem
def __len__(self):
"""The current number of lexemes stored.
@ -114,6 +127,33 @@ cdef class Vocab:
"""
return self.length
@contextmanager
def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]:
"""Begin a block where resources allocated during the block will
be freed at the end of it. If a resource was created within the
memory zone block, accessing it outside the block is invalid.
Behaviour of this invalid access is undefined. Memory zones should
not be nested.
The memory zone is helpful for services that need to process large
volumes of text with a defined memory budget.
"""
if mem is None:
mem = Pool()
# The ExitStack allows programmatic nested context managers.
# We don't know how many we need, so it would be awkward to have
# them as nested blocks.
with ExitStack() as stack:
contexts = [stack.enter_context(self.strings.memory_zone(mem))]
if hasattr(self.morphology, "memory_zone"):
contexts.append(stack.enter_context(self.morphology.memory_zone(mem)))
if hasattr(self._vectors, "memory_zone"):
contexts.append(stack.enter_context(self._vectors.memory_zone(mem)))
self.mem = mem
yield mem
self._clear_transient_orths()
self.mem = self._non_temp_mem
def add_flag(self, flag_getter, int flag_id=-1):
"""Set a new boolean flag to words in the vocabulary.
@ -148,8 +188,7 @@ cdef class Vocab:
cdef const LexemeC* get(self, Pool mem, str string) except NULL:
"""Get a pointer to a `LexemeC` from the lexicon, creating a new
`Lexeme` if necessary using memory acquired from the given pool. If the
pool is the lexicon's own memory, the lexeme is saved in the lexicon.
`Lexeme` if necessary.
"""
if string == "":
return &EMPTY_LEXEME
@ -180,19 +219,11 @@ cdef class Vocab:
return self._new_lexeme(mem, self.strings[orth])
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
# I think this heuristic is bad, and the Vocab should always
# own the lexemes. It avoids weird bugs this way, as it's how the thing
# was originally supposed to work. The best solution to the growing
# memory use is to periodically reset the vocab, which is an action
# that should be up to the user to do (so we don't need to keep track
# of the doc ownership).
# TODO: Change the C API so that the mem isn't passed in here.
# The mem argument is deprecated, replaced by memory zones. Same with
# this size heuristic.
mem = self.mem
# if len(string) < 3 or self.length < 10000:
# mem = self.mem
cdef bint is_oov = mem is not self.mem
lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
lex.orth = self.strings.add(string)
lex.orth = self.strings.add(string, allow_transient=True)
lex.length = len(string)
if self.vectors is not None and hasattr(self.vectors, "key2row"):
lex.id = self.vectors.key2row.get(lex.orth, OOV_RANK)
@ -202,18 +233,25 @@ cdef class Vocab:
for attr, func in self.lex_attr_getters.items():
value = func(string)
if isinstance(value, str):
value = self.strings.add(value)
value = self.strings.add(value, allow_transient=True)
if value is not None:
Lexeme.set_struct_attr(lex, attr, value)
if not is_oov:
self._add_lex_to_vocab(lex.orth, lex)
self._add_lex_to_vocab(lex.orth, lex, self.mem is not self._non_temp_mem)
if lex == NULL:
raise ValueError(Errors.E085.format(string=string))
return lex
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1:
self._by_orth.set(lex.orth, <void*>lex)
self.length += 1
if is_transient and self.in_memory_zone:
self._transient_orths.push_back(lex.orth)
def _clear_transient_orths(self):
"""Remove transient lexemes from the index (generally at the end of the memory zone)"""
for orth in self._transient_orths:
map_clear(self._by_orth.c_map, orth)
self._transient_orths.clear()
def __contains__(self, key):
"""Check whether the string or int key has an entry in the vocabulary.
@ -265,7 +303,7 @@ cdef class Vocab:
"""
cdef attr_t orth
if isinstance(id_or_string, str):
orth = self.strings.add(id_or_string)
orth = self.strings.add(id_or_string, allow_transient=True)
else:
orth = id_or_string
return Lexeme(self, orth)
@ -417,7 +455,7 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab#get_vector
"""
if isinstance(orth, str):
orth = self.strings.add(orth)
orth = self.strings.add(orth, allow_transient=True)
cdef Lexeme lex = self[orth]
key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
if self.has_vector(key):
@ -436,7 +474,7 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab#set_vector
"""
if isinstance(orth, str):
orth = self.strings.add(orth)
orth = self.strings.add(orth, allow_transient=False)
cdef Lexeme lex = self[orth]
key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
if self.vectors.is_full and key not in self.vectors:
@ -460,7 +498,7 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab#has_vector
"""
if isinstance(orth, str):
orth = self.strings.add(orth)
orth = self.strings.add(orth, allow_transient=True)
cdef Lexeme lex = self[orth]
key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
return key in self.vectors

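The comments above describe detecting an active zone by comparing self.mem with self._non_temp_mem, which the new in_memory_zone property exposes. A small sketch of the expected behaviour, mirroring the test file earlier in this diff:

from spacy.vocab import Vocab

vocab = Vocab()
assert not vocab.in_memory_zone
with vocab.memory_zone():
    assert vocab.in_memory_zone
    _ = vocab["horse"]         # transient lexeme and string
assert "horse" not in vocab    # cleared by _clear_transient_orths()
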

@ -31,6 +31,12 @@
"name": "Bengali",
"has_examples": true
},
{
"code": "bo",
"name": "Tibetan",
"example": "འདི་ཚིག་གྲུབ་རེད།",
"has_examples": true
},
{
"code": "ca",
"name": "Catalan",
@ -480,6 +486,12 @@
],
"example": "这是一个用于示例的句子。",
"has_examples": true
},
{
"code": "kmr",
"name": "Kurdish Kurmanji",
"example": "Ev hevokek e",
"has_examples": true
}
],
"licenses": [


@ -16,6 +16,40 @@
},
"category": ["extension"],
"tags": []
},
{
"id": "constituent_treelib",
"title": "Constituent Treelib",
"slogan": "Extract constituents with ease!",
"description": "Constituent Treelib (CTL) is a lightweight Python library built on top of benepar (Berkeley Neural Parser) as well as the two well-known NLP frameworks spaCy and NLTK. CTL offers you a convenient way to parse sentences into constituent trees, modify them according to their structure, as well as visualize and export them into various file formats. In addition, you can extract phrases according to their phrasal categories (which can be used e.g., as features for various NLP tasks), validate already parsed sentences in bracket notation or convert them back into sentences.",
"github": "Halvani/Constituent-Treelib",
"pip": "constituent-treelib",
"code_example": [
"from constituent_treelib import ConstituentTree, Language",
"# Define the language for the sentence as well as for the spaCy and benepar models",
"language = Language.English",
"# Define which specific SpaCy model should be used (default is Medium)",
"spacy_model_size = ConstituentTree.SpacyModelSize.Medium",
"# Create the pipeline (note, the required models will be downloaded and installed automatically)",
"nlp = ConstituentTree.create_pipeline(language, spacy_model_size)",
"# Your sentence",
"sentence = 'We try to explicitly describe the geometry of the edges of the images.'",
"# Create the tree from where we are going to extract the desired noun phrases",
"tree = ConstituentTree(sentence, nlp)",
"all_phrases = tree.extract_all_phrases(min_words_in_phrases=1)",
"print(all_phrases)",
"# {'PP': ['of the edges of the images', 'of the images'], 'NP': ['We', 'the geometry of the edges of the images', 'the geometry', 'the edges of the images', 'the edges', 'the images'], 'S': ['We try to explicitly describe the geometry of the edges of the images .', 'to explicitly describe the geometry of the edges of the images'], 'VP': ['try to explicitly describe the geometry of the edges of the images', 'to explicitly describe the geometry of the edges of the images', 'describe the geometry of the edges of the images'], 'ADVP': ['explicitly']}"
],
"code_language": "python",
"url": "https://github.com/Halvani/Constituent-Treelib",
"thumb": "https://github.com/Halvani/Constituent-Treelib/blob/main/assets/images/promo_tree.svg",
"author": "Oren Halvani",
"author_links": {
"github": "Halvani",
"website": "https://www.linkedin.com/in/orenhalvani"
},
"category": ["apis", "standalone", "visualizers"],
"tags": ["apis", "deployment", "constituency ", "parsing"]
},
{
"id": "sayswho",
@ -4537,6 +4571,33 @@
},
"category": ["pipeline"],
"tags": ["tokenizer", "french"]
},
{
"id": "gliner-spacy",
"title": "GLiNER spaCy Wrapper",
"slogan": "Integrating GLiNER's Advanced NER with spaCy",
"description": "GLiNER SpaCy Wrapper is a project that brings together GLiNER, a zero-shot Named Entity Recognition (NER) model, with spaCy's NLP capabilities. It provides an easy way to integrate GLiNER within the spaCy environment, thus enhancing NER tasks with GLiNER's features.",
"github": "theirstory/gliner-spacy",
"pip": "gliner-spacy",
"code_example": [
"import spacy",
"",
"nlp = spacy.blank('en')",
"nlp.add_pipe('gliner_spacy')",
"text = 'This is a text about Bill Gates and Microsoft.'",
"doc = nlp(text)",
"",
"for ent in doc.ents:",
" print(ent.text, ent.label_)"
],
"code_language": "python",
"url": "https://github.com/theirstory/gliner-spacy",
"author": "TheirStory",
"author_links": {
"website": "https://theirstory.io"
},
"category": ["pipeline"],
"tags": ["NER"]
}
],


@ -58,8 +58,8 @@ const AlertSpace = ({ nightly, legacy }) => {
}
const navAlert = (
<Link to="https://form.typeform.com/to/WlflqP1b" noLinkLayout>
💥 Interested in <strong>Premium spaCy Models</strong>?
<Link to="https://explosion.ai/blog/sp-global-commodities" noLinkLayout>
💥 <strong>New:</strong> Case study with S&P Global
</Link>
)