Mirror of https://github.com/explosion/spaCy.git

Commit 5cbe621cde: Merge branch 'master' into pr/13515
.github/workflows/cibuildwheel.yml (new file, vendored, 92 lines)
@@ -0,0 +1,92 @@
name: Build

on:
  push:
    tags:
      # ytf did they invent their own syntax that's almost regex?
      # ** matches 'zero or more of any character'
      - 'release-v[0-9]+.[0-9]+.[0-9]+**'
      - 'prerelease-v[0-9]+.[0-9]+.[0-9]+**'

jobs:
  build_wheels:
    name: Build wheels on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        # macos-13 is an intel runner, macos-14 is apple silicon
        os: [ubuntu-latest, windows-latest, macos-13]

    steps:
      - uses: actions/checkout@v4

      - name: Build wheels
        uses: pypa/cibuildwheel@v2.19.1
        env:
          CIBW_SOME_OPTION: value
        with:
          package-dir: .
          output-dir: wheelhouse
          config-file: "{package}/pyproject.toml"

      - uses: actions/upload-artifact@v4
        with:
          name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }}
          path: ./wheelhouse/*.whl

  build_sdist:
    name: Build source distribution
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Build sdist
        run: pipx run build --sdist

      - uses: actions/upload-artifact@v4
        with:
          name: cibw-sdist
          path: dist/*.tar.gz

  create_release:
    needs: [build_wheels, build_sdist]
    runs-on: ubuntu-latest
    permissions:
      contents: write
      checks: write
      actions: read
      issues: read
      packages: write
      pull-requests: read
      repository-projects: read
      statuses: read
    steps:
      - name: Get the tag name and determine if it's a prerelease
        id: get_tag_info
        run: |
          FULL_TAG=${GITHUB_REF#refs/tags/}
          if [[ $FULL_TAG == release-* ]]; then
            TAG_NAME=${FULL_TAG#release-}
            IS_PRERELEASE=false
          elif [[ $FULL_TAG == prerelease-* ]]; then
            TAG_NAME=${FULL_TAG#prerelease-}
            IS_PRERELEASE=true
          else
            echo "Tag does not match expected patterns" >&2
            exit 1
          fi
          echo "FULL_TAG=$TAG_NAME" >> $GITHUB_ENV
          echo "TAG_NAME=$TAG_NAME" >> $GITHUB_ENV
          echo "IS_PRERELEASE=$IS_PRERELEASE" >> $GITHUB_ENV

      - uses: actions/download-artifact@v4
        with:
          # unpacks all CIBW artifacts into dist/
          pattern: cibw-*
          path: dist
          merge-multiple: true

      - name: Create Draft Release
        id: create_release
        uses: softprops/action-gh-release@v2
        if: startsWith(github.ref, 'refs/tags/')
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          name: ${{ env.TAG_NAME }}
          draft: true
          prerelease: ${{ env.IS_PRERELEASE }}
          files: "./dist/*"
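The create_release step derives the release name and prerelease flag purely from the tag prefix. A minimal Python sketch of the same mapping (a hypothetical helper for checking tag handling locally, not part of the workflow):

    def parse_release_tag(full_tag: str) -> tuple[str, bool]:
        """Mirror the workflow's bash logic: strip the prefix, flag prereleases."""
        if full_tag.startswith("release-"):
            return full_tag[len("release-"):], False
        if full_tag.startswith("prerelease-"):
            return full_tag[len("prerelease-"):], True
        raise ValueError("Tag does not match expected patterns")

    assert parse_release_tag("release-v3.8.0") == ("v3.8.0", False)
    assert parse_release_tag("prerelease-v3.8.0.dev0") == ("v3.8.0.dev0", True)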
.github/workflows/publish_pypi.yml (new file, vendored, 29 lines)
@@ -0,0 +1,29 @@
# The cibuildwheel action triggers on creation of a release; this one
# triggers on publication.
# The expected workflow is to create a draft release, let the wheels
# upload, and then hit 'publish', which uploads to PyPI.

on:
  release:
    types:
      - published

jobs:
  upload_pypi:
    runs-on: ubuntu-latest
    environment:
      name: pypi
      url: https://pypi.org/p/spacy
    permissions:
      id-token: write
      contents: read
    if: github.event_name == 'release' && github.event.action == 'published'
    # or, alternatively, upload to PyPI on every tag starting with 'v' (remove on: release above to use this)
    # if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
    steps:
      - uses: robinraju/release-downloader@v1
        with:
          tag: ${{ github.event.release.tag_name }}
          fileName: '*'
          out-file-path: 'dist'

      - uses: pypa/gh-action-pypi-publish@release/v1
pyproject.toml
@@ -11,5 +11,58 @@ requires = [
 ]
 build-backend = "setuptools.build_meta"
 
+[tool.cibuildwheel]
+build = "*"
+skip = "pp* cp36* cp37* cp38* *-win32"
+test-skip = ""
+free-threaded-support = false
+
+archs = ["native"]
+
+build-frontend = "default"
+config-settings = {}
+dependency-versions = "pinned"
+environment = { PIP_CONSTRAINT = "build-constraints.txt" }
+
+environment-pass = []
+build-verbosity = 0
+
+before-all = "curl https://sh.rustup.rs -sSf | sh -s -- -y --profile minimal --default-toolchain stable"
+before-build = "pip install -r requirements.txt && python setup.py clean"
+repair-wheel-command = ""
+
+test-command = ""
+before-test = ""
+test-requires = []
+test-extras = []
+
+container-engine = "docker"
+
+manylinux-x86_64-image = "manylinux2014"
+manylinux-i686-image = "manylinux2014"
+manylinux-aarch64-image = "manylinux2014"
+manylinux-ppc64le-image = "manylinux2014"
+manylinux-s390x-image = "manylinux2014"
+manylinux-pypy_x86_64-image = "manylinux2014"
+manylinux-pypy_i686-image = "manylinux2014"
+manylinux-pypy_aarch64-image = "manylinux2014"
+
+musllinux-x86_64-image = "musllinux_1_2"
+musllinux-i686-image = "musllinux_1_2"
+musllinux-aarch64-image = "musllinux_1_2"
+musllinux-ppc64le-image = "musllinux_1_2"
+musllinux-s390x-image = "musllinux_1_2"
+
+[tool.cibuildwheel.linux]
+repair-wheel-command = "auditwheel repair -w {dest_dir} {wheel}"
+
+[tool.cibuildwheel.macos]
+repair-wheel-command = "delocate-wheel --require-archs {delocate_archs} -w {dest_dir} -v {wheel}"
+
+[tool.cibuildwheel.windows]
+
+[tool.cibuildwheel.pyodide]
+
+
 [tool.isort]
 profile = "black"
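The skip list uses cibuildwheel's fnmatch-style build identifiers. A small illustrative check (assuming standard identifier names such as "cp39-manylinux_x86_64"; this helper is not part of the repo):

    from fnmatch import fnmatch

    SKIP = "pp* cp36* cp37* cp38* *-win32".split()

    def is_skipped(identifier: str) -> bool:
        """True if any skip pattern matches the cibuildwheel build identifier."""
        return any(fnmatch(identifier, pat) for pat in SKIP)

    assert is_skipped("cp37-manylinux_x86_64")   # CPython 3.7: skipped
    assert is_skipped("pp310-manylinux_x86_64")  # PyPy: skipped
    assert is_skipped("cp39-win32")              # 32-bit Windows: skipped
    assert not is_skipped("cp312-win_amd64")     # 64-bit Windows: built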
requirements.txt
@@ -22,7 +22,6 @@ langcodes>=3.2.0,<4.0.0
 # Official Python utilities
 setuptools
 packaging>=20.0
-typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
 # Development dependencies
 pre-commit>=2.13.0
 cython>=0.25,<3.0
setup.cfg
@@ -66,7 +66,6 @@ install_requires =
     # Official Python utilities
     setuptools
     packaging>=20.0
-    typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
    langcodes>=3.2.0,<4.0.0
 
 [options.entry_points]
spacy/about.py
@@ -1,5 +1,5 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.7.5"
+__version__ = "3.8.0.dev0"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
spacy/lang/bo/__init__.py (new file, 16 lines)
@@ -0,0 +1,16 @@
from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS


class TibetanDefaults(BaseDefaults):
    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS


class Tibetan(Language):
    lang = "bo"
    Defaults = TibetanDefaults


__all__ = ["Tibetan"]
spacy/lang/bo/examples.py (new file, 16 lines)
@@ -0,0 +1,16 @@
"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.bo.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "དོན་དུ་རྒྱ་མཚོ་བླ་མ་ཞེས་བྱ་ཞིང༌།",
    "ཏཱ་ལའི་ཞེས་པ་ནི་སོག་སྐད་ཡིན་པ་དེ་བོད་སྐད་དུ་རྒྱ་མཚོའི་དོན་དུ་འཇུག",
    "སོག་པོ་ཨལ་ཐན་རྒྱལ་པོས་རྒྱལ་དབང་བསོད་ནམས་རྒྱ་མཚོར་ཆེ་བསྟོད་ཀྱི་མཚན་གསོལ་བ་ཞིག་ཡིན་ཞིང༌།",
    "རྗེས་སུ་རྒྱལ་བ་དགེ་འདུན་གྲུབ་དང༌། དགེ་འདུན་རྒྱ་མཚོ་སོ་སོར་ཡང་ཏཱ་ལའི་བླ་མའི་སྐུ་ཕྲེང་དང་པོ་དང༌།",
    "གཉིས་པའི་མཚན་དེ་གསོལ་ཞིང༌།༸རྒྱལ་དབང་སྐུ་ཕྲེང་ལྔ་པས་དགའ་ལྡན་ཕོ་བྲང་གི་སྲིད་དབང་བཙུགས་པ་ནས་ཏཱ་ལའི་བླ་མ་ནི་བོད་ཀྱི་ཆོས་སྲིད་གཉིས་ཀྱི་དབུ་ཁྲིད་དུ་གྱུར་ཞིང་།",
    "ད་ལྟའི་བར་ཏཱ་ལའི་བླ་མ་སྐུ་ཕྲེང་བཅུ་བཞི་བྱོན་ཡོད།",
]
spacy/lang/bo/lex_attrs.py (new file, 65 lines)
@@ -0,0 +1,65 @@
from ...attrs import LIKE_NUM

# reference 1: https://en.wikipedia.org/wiki/Tibetan_numerals

_num_words = [
    "ཀླད་ཀོར་",
    "གཅིག་",
    "གཉིས་",
    "གསུམ་",
    "བཞི་",
    "ལྔ་",
    "དྲུག་",
    "བདུན་",
    "བརྒྱད་",
    "དགུ་",
    "བཅུ་",
    "བཅུ་གཅིག་",
    "བཅུ་གཉིས་",
    "བཅུ་གསུམ་",
    "བཅུ་བཞི་",
    "བཅུ་ལྔ་",
    "བཅུ་དྲུག་",
    "བཅུ་བདུན་",
    "བཅུ་པརྒྱད",
    "བཅུ་དགུ་",
    "ཉི་ཤུ་",
    "སུམ་ཅུ",
    "བཞི་བཅུ",
    "ལྔ་བཅུ",
    "དྲུག་ཅུ",
    "བདུན་ཅུ",
    "བརྒྱད་ཅུ",
    "དགུ་བཅུ",
    "བརྒྱ་",
    "སྟོང་",
    "ཁྲི་",
    "ས་ཡ་",
    "བྱེ་བ་",
    "དུང་ཕྱུར་",
    "ཐེར་འབུམ་",
    "ཐེར་འབུམ་ཆེན་པོ་",
    "ཁྲག་ཁྲིག་",
    "ཁྲག་ཁྲིག་ཆེན་པོ་",
]


def like_num(text):
    """
    Check if text resembles a number
    """
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    if text in _num_words:
        return True
    return False


LEX_ATTRS = {LIKE_NUM: like_num}
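A quick illustration of the new Tibetan like_num behaviour (values taken from the test file added later in this diff):

    from spacy.lang.bo.lex_attrs import like_num

    assert like_num("10")
    assert like_num("བཅུ་གཅིག་")   # "eleven", listed in _num_words
    assert not like_num("ཁྱི་")     # "dog" is not number-like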
spacy/lang/bo/stop_words.py (new file, 198 lines)
@@ -0,0 +1,198 @@
# Source: https://zenodo.org/records/10148636

STOP_WORDS = set(
    """
འི་
།
དུ་
གིས་
སོགས་
ཏེ
གི་
རྣམས་
ནི
ཀུན་
ཡི་
འདི
ཀྱི་
སྙེད་
པས་
གཞན་
ཀྱིས་
ཡི
ལ
ནི་
དང་
སོགས
ཅིང་
ར
དུ
མི་
སུ་
བཅས་
ཡོངས་
ལས
ཙམ་
གྱིས་
དེ་
ཡང་
མཐའ་དག་
ཏུ་
ཉིད་
ས
ཏེ་
གྱི་
སྤྱི
དེ
ཀ་
ཡིན་
ཞིང་
འདི་
རུང་
རང་
ཞིག་
སྟེ
སྟེ་
ན་རེ
ངམ
ཤིང་
དག་
ཏོ
རེ་
འང་
ཀྱང་
ལགས་པ
ཚུ
དོ
ཡིན་པ
རེ
ན་རེ་
ཨེ་
ཚང་མ
ཐམས་ཅད་
དམ་
འོ་
ཅིག་
གྱིན་
ཡིན
ན
ཁོ་ན་
འམ་
ཀྱིན་
ལོ
ཀྱིས
བས་
ལགས་
ཤིག
གིས
ཀི་
སྣ་ཚོགས་
རྣམས
སྙེད་པ
ཡིས་
གྱི
གི
བམ་
ཤིག་
རེ་རེ་
ནམ
མིན་
ནམ་
ངམ་
རུ་
འགའ་
ཀུན
ཤས་
ཏུ
ཡིས
གིན་
གམ་
འོ
ཡིན་པ་
མིན
ལགས
གྱིས
ཅང་
འགའ
སམ་
ཞིག
འང
ལས་ཆེ་
འཕྲལ་
བར་
རུ
དང
ཡ
འག
སམ
ཀ
ཅུང་ཟད་
ཅིག
ཉིད
དུ་མ
མ
ཡིན་བ
འམ
མམ
དམ
དག
ཁོ་ན
ཀྱི
ལམ
ཕྱི་
ནང་
ཙམ
ནོ་
སོ་
རམ་
བོ་
ཨང་
ཕྱི
ཏོ་
ཚོ
ལ་ལ་
ཚོ་
ཅིང
མ་གི་
གེ
གོ
ཡིན་ལུགས་
རོ་
བོ
ལགས་པ་
པས
རབ་
འི
རམ
བས
གཞན
སྙེད་པ་
འབའ་
མཾ་
པོ
ག་
ག
གམ
སྤྱི་
བམ
མོ་
ཙམ་པ་
ཤ་སྟག་
མམ་
རེ་རེ
སྙེད
ཏམ་
ངོ
གྲང་
ཏ་རེ
ཏམ
ཁ་
ངེ་
ཅོག་
རིལ་
ཉུང་ཤས་
གིང་
ཚ་
ཀྱང
""".split()
)
spacy/lang/gd/__init__.py (new file, 18 lines)
@@ -0,0 +1,18 @@
from typing import Optional

from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS


class ScottishDefaults(BaseDefaults):
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    stop_words = STOP_WORDS


class Scottish(Language):
    lang = "gd"
    Defaults = ScottishDefaults


__all__ = ["Scottish"]
spacy/lang/gd/stop_words.py (new file, 388 lines)
@@ -0,0 +1,388 @@
STOP_WORDS = set(
    """
'ad
'ar
'd # iad
'g # ag
'ga
'gam
'gan
'gar
'gur
'm # am
'n # an
'n seo
'na
'nad
'nam
'nan
'nar
'nuair
'nur
's
'sa
'san
'sann
'se
'sna
a
a'
a'd # agad
a'm # agam
a-chèile
a-seo
a-sin
a-siud
a chionn
a chionn 's
a chèile
a chéile
a dh'
a h-uile
a seo
ac' # aca
aca
aca-san
acasan
ach
ag
agad
agad-sa
agads'
agadsa
agaibh
agaibhse
againn
againne
agam
agam-sa
agams'
agamsa
agus
aice
aice-se
aicese
aig
aig' # aige
aige
aige-san
aigesan
air
air-san
air neo
airsan
am
an
an seo
an sin
an siud
an uair
ann
ann a
ann a'
ann a shin
ann am
ann an
annad
annam
annam-s'
annamsa
anns
anns an
annta
aon
ar
as
asad
asda
asta
b'
bho
bhon
bhuaidhe # bhuaithe
bhuainn
bhuaipe
bhuaithe
bhuapa
bhur
brì
bu
c'à
car son
carson
cha
chan
chionn
choir
chon
chun
chèile
chéile
chòir
cia mheud
ciamar
co-dhiubh
cuide
cuin
cuin'
cuine
cà
cà'
càil
càit
càit'
càite
cò
cò mheud
có
d'
da
de
dh'
dha
dhaibh
dhaibh-san
dhaibhsan
dhan
dhasan
dhe
dhen
dheth
dhi
dhiom
dhiot
dhith
dhiubh
dhomh
dhomh-s'
dhomhsa
dhu'sa # dhut-sa
dhuibh
dhuibhse
dhuinn
dhuinne
dhuit
dhut
dhutsa
dhut-sa
dhà
dhà-san
dhàsan
dhòmhsa
diubh
do
docha
don
dà
dè
dè mar
dé
dé mar
dòch'
dòcha
e
eadar
eatarra
eatorra
eile
esan
fa
far
feud
fhad
fheudar
fhearr
fhein
fheudar
fheàrr
fhèin
fhéin
fhìn
fo
fodha
fodhainn
foipe
fon
fèin
ga
gach
gam
gan
ge brith
ged
gu
gu dè
gu ruige
gun
gur
gus
i
iad
iadsan
innte
is
ise
le
leam
leam-sa
leamsa
leat
leat-sa
leatha
leatsa
leibh
leis
leis-san
leoth'
leotha
leotha-san
linn
m'
m'a
ma
mac
man
mar
mas
mathaid
mi
mis'
mise
mo
mu
mu 'n
mun
mur
mura
mus
na
na b'
na bu
na iad
nach
nad
nam
nan
nar
nas
neo
no
nuair
o
o'n
oir
oirbh
oirbh-se
oirnn
oirnne
oirre
on
orm
orm-sa
ormsa
orra
orra-san
orrasan
ort
os
r'
ri
ribh
rinn
ris
rithe
rithe-se
rium
rium-sa
riums'
riumsa
riut
riuth'
riutha
riuthasan
ro
ro'n
roimh
roimhe
romhainn
romham
romhpa
ron
ruibh
ruinn
ruinne
sa
san
sann
se
seach
seo
seothach
shin
sibh
sibh-se
sibhse
sin
sineach
sinn
sinne
siod
siodach
siud
siudach
sna # ann an
sè
t'
tarsaing
tarsainn
tarsuinn
thar
thoigh
thro
thu
thuc'
thuca
thugad
thugaibh
thugainn
thugam
thugamsa
thuice
thuige
thus'
thusa
timcheall
toigh
toil
tro
tro' # troimh
troimh
troimhe
tron
tu
tusa
uair
ud
ugaibh
ugam-s'
ugam-sa
uice
uige
uige-san
umad
unnta # ann an
ur
urrainn
à
às
àsan
á
ás
è
ì
ò
ó
""".split(
        "\n"
    )
)
spacy/lang/gd/tokenizer_exceptions.py (new file, 1983 lines)
(File diff suppressed because it is too large.)
spacy/lang/kmr/__init__.py (new file, 16 lines)
@@ -0,0 +1,16 @@
from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS


class KurmanjiDefaults(BaseDefaults):
    stop_words = STOP_WORDS
    lex_attr_getters = LEX_ATTRS


class Kurmanji(Language):
    lang = "kmr"
    Defaults = KurmanjiDefaults


__all__ = ["Kurmanji"]
spacy/lang/kmr/examples.py (new file, 17 lines)
@@ -0,0 +1,17 @@
"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.kmr.examples import sentences
>>> docs = nlp.pipe(sentences)
"""

sentences = [
    "Berê mirovan her tim li geşedana pêşerojê ye",  # People's gaze is always on the development of the future
    "Kawa Nemir di 14 salan de Ulysses wergerand Kurmancî.",  # Kawa Nemir translated Ulysses into Kurmanji in 14 years.
    "Mem Ararat hunermendekî Kurd yê bi nav û deng e.",  # Mem Ararat is a famous Kurdish artist
    "Firat Cewerî 40 sal e pirtûkên Kurdî dinivîsîne.",  # Firat Ceweri has been writing Kurdish books for 40 years
    "Rojnamegerê ciwan nûçeyeke balkêş li ser rewşa aborî nivîsand",  # The young journalist wrote an interesting news article about the economic situation
    "Sektora çandiniyê beşeke giring a belavkirina gaza serayê li seranserê cîhanê pêk tîne",  # The agricultural sector constitutes an important part of greenhouse gas emissions worldwide
    "Xwendekarên jêhatî di pêşbaziya matematîkê de serkeftî bûn",  # Talented students succeeded in the mathematics competition
    "Ji ber ji tunebûnê bavê min xwişkeke min nedan xwendin ew ji min re bû derd û kulek.",  # Because of poverty, my father didn't send my sister to school, which became a pain and sorrow for me
]
spacy/lang/kmr/lex_attrs.py (new file, 138 lines)
@@ -0,0 +1,138 @@
from ...attrs import LIKE_NUM

_num_words = [
    "sifir",
    "yek",
    "du",
    "sê",
    "çar",
    "pênc",
    "şeş",
    "heft",
    "heşt",
    "neh",
    "deh",
    "yazde",
    "dazde",
    "sêzde",
    "çarde",
    "pazde",
    "şazde",
    "hevde",
    "hejde",
    "nozde",
    "bîst",
    "sî",
    "çil",
    "pêncî",
    "şêst",
    "heftê",
    "heştê",
    "nod",
    "sed",
    "hezar",
    "milyon",
    "milyar",
]

_ordinal_words = [
    "yekem",
    "yekemîn",
    "duyem",
    "duyemîn",
    "sêyem",
    "sêyemîn",
    "çarem",
    "çaremîn",
    "pêncem",
    "pêncemîn",
    "şeşem",
    "şeşemîn",
    "heftem",
    "heftemîn",
    "heştem",
    "heştemîn",
    "nehem",
    "nehemîn",
    "dehem",
    "dehemîn",
    "yazdehem",
    "yazdehemîn",
    "dazdehem",
    "dazdehemîn",
    "sêzdehem",
    "sêzdehemîn",
    "çardehem",
    "çardehemîn",
    "pazdehem",
    "pazdehemîn",
    "şanzdehem",
    "şanzdehemîn",
    "hevdehem",
    "hevdehemîn",
    "hejdehem",
    "hejdehemîn",
    "nozdehem",
    "nozdehemîn",
    "bîstem",
    "bîstemîn",
    "sîyem",
    "sîyemîn",
    "çilem",
    "çilemîn",
    "pêncîyem",
    "pênciyemîn",
    "şêstem",
    "şêstemîn",
    "heftêyem",
    "heftêyemîn",
    "heştêyem",
    "heştêyemîn",
    "notem",
    "notemîn",
    "sedem",
    "sedemîn",
    "hezarem",
    "hezaremîn",
    "milyonem",
    "milyonemîn",
    "milyarem",
    "milyaremîn",
]


def like_num(text):
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    text_lower = text.lower()
    if text_lower in _num_words:
        return True

    # Check ordinal number
    if text_lower in _ordinal_words:
        return True

    if is_digit(text_lower):
        return True

    return False


def is_digit(text):
    endings = ("em", "yem", "emîn", "yemîn")
    for ending in endings:
        to = len(ending)
        if text.endswith(ending) and text[:-to].isdigit():
            return True

    return False


LEX_ATTRS = {LIKE_NUM: like_num}
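The Kurmanji like_num also recognises digit-plus-suffix ordinals via is_digit (examples drawn from the test file later in this diff):

    from spacy.lang.kmr.lex_attrs import like_num

    assert like_num("duyemîn")     # ordinal word from _ordinal_words
    assert like_num("100em")       # digits plus ordinal ending, handled by is_digit
    assert not like_num("pirtûk")  # ordinary noun, not number-like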
spacy/lang/kmr/stop_words.py (new file, 44 lines)
@@ -0,0 +1,44 @@
STOP_WORDS = set(
    """
û
li
bi
di
da
de
ji
ku
ew
ez
tu
em
hûn
ew
ev
min
te
wî
wê
me
we
wan
vê
vî
va
çi
kî
kê
çawa
çima
kengî
li ku
çend
çiqas
her
hin
gelek
hemû
kes
tişt
""".split()
)
spacy/lang/mk/__init__.py
@@ -24,13 +24,6 @@ class MacedonianDefaults(BaseDefaults):
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
 
-    @classmethod
-    def create_lemmatizer(cls, nlp=None, lookups=None):
-        if lookups is None:
-            lookups = Lookups()
-        return MacedonianLemmatizer(lookups)
-
 
 class Macedonian(Language):
     lang = "mk"
     Defaults = MacedonianDefaults
spacy/language.py
@@ -5,7 +5,7 @@ import multiprocessing as mp
 import random
 import traceback
 import warnings
-from contextlib import contextmanager
+from contextlib import ExitStack, contextmanager
 from copy import deepcopy
 from dataclasses import dataclass
 from itertools import chain, cycle
@@ -31,6 +31,7 @@ from typing import (
 )
 
 import srsly
+from cymem.cymem import Pool
 from thinc.api import Config, CupyOps, Optimizer, get_current_ops
 
 from . import about, ty, util
spacy/language.py
@@ -2091,6 +2092,38 @@ class Language:
         util.replace_model_node(pipe.model, listener, new_model)  # type: ignore[attr-defined]
         tok2vec.remove_listener(listener, pipe_name)
 
+    @contextmanager
+    def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]:
+        """Begin a block where all resources allocated during the block will
+        be freed at the end of it. If a resource was created within the
+        memory zone block, accessing it outside the block is invalid.
+        Behaviour of this invalid access is undefined. Memory zones should
+        not be nested.
+
+        The memory zone is helpful for services that need to process large
+        volumes of text with a defined memory budget.
+
+        Example
+        -------
+        >>> with nlp.memory_zone():
+        ...     for doc in nlp.pipe(texts):
+        ...         process_my_doc(doc)
+        >>> # use_doc(doc) <-- Invalid: doc was allocated in the memory zone
+        """
+        if mem is None:
+            mem = Pool()
+        # The ExitStack allows programmatic nested context managers.
+        # We don't know how many we need, so it would be awkward to have
+        # them as nested blocks.
+        with ExitStack() as stack:
+            contexts = [stack.enter_context(self.vocab.memory_zone(mem))]
+            if hasattr(self.tokenizer, "memory_zone"):
+                contexts.append(stack.enter_context(self.tokenizer.memory_zone(mem)))
+            for _, pipe in self.pipeline:
+                if hasattr(pipe, "memory_zone"):
+                    contexts.append(stack.enter_context(pipe.memory_zone(mem)))
+            yield mem
+
     def to_disk(
         self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
     ) -> None:
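A runnable sketch of the new pipeline-level API, following the docstring's own example (assuming a blank English pipeline; any pipeline works):

    import spacy

    nlp = spacy.blank("en")
    texts = ["First text to process.", "Second text."]
    with nlp.memory_zone():
        for doc in nlp.pipe(texts):
            print(doc[0].text)  # Docs are valid inside the zone
    # Strings and lexemes interned while the zone was open are freed here;
    # keeping a Doc from inside the zone and using it now would be invalid.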
spacy/pipeline/_parser_internals/arc_eager.pyx
@@ -203,7 +203,7 @@ cdef class ArcEagerGold:
     def __init__(self, ArcEager moves, StateClass stcls, Example example):
         self.mem = Pool()
         heads, labels = example.get_aligned_parse(projectivize=True)
-        labels = [example.x.vocab.strings.add(label) if label is not None else MISSING_DEP for label in labels]
+        labels = [example.x.vocab.strings.add(label, allow_transient=False) if label is not None else MISSING_DEP for label in labels]
         sent_starts = _get_aligned_sent_starts(example)
         assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts))
         self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts)
spacy/pipeline/_parser_internals/nonproj.pyx
@@ -183,7 +183,7 @@ cpdef deprojectivize(Doc doc):
             new_label, head_label = label.split(DELIMITER)
             new_head = _find_new_head(doc[i], head_label)
             doc.c[i].head = new_head.i - i
-            doc.c[i].dep = doc.vocab.strings.add(new_label)
+            doc.c[i].dep = doc.vocab.strings.add(new_label, allow_transient=False)
     set_children_from_heads(doc.c, 0, doc.length)
     return doc
spacy/strings.pxd
@@ -25,5 +25,7 @@ cdef class StringStore:
     cdef vector[hash_t] keys
     cdef public PreshMap _map
 
-    cdef const Utf8Str* intern_unicode(self, str py_string)
-    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)
+    cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient)
+    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient)
+    cdef vector[hash_t] _transient_keys
+    cdef Pool _non_temp_mem
spacy/strings.pyx
@@ -1,9 +1,14 @@
 # cython: infer_types=True
 # cython: profile=False
 cimport cython
 
+from contextlib import contextmanager
+from typing import Iterator, List, Optional
+
 from libc.stdint cimport uint32_t
 from libc.string cimport memcpy
 from murmurhash.mrmr cimport hash32, hash64
+from preshed.maps cimport map_clear
 
 import srsly
spacy/strings.pyx
@@ -31,7 +36,7 @@ def get_string_id(key):
     This function optimises for convenience over performance, so shouldn't be
     used in tight loops.
     """
-    cdef hash_t str_hash
+    cdef hash_t str_hash
     if isinstance(key, str):
         if len(key) == 0:
             return 0
@@ -45,8 +50,8 @@ def get_string_id(key):
     elif _try_coerce_to_hash(key, &str_hash):
         # Coerce the integral key to the expected primitive hash type.
         # This ensures that custom/overloaded "primitive" data types
-        # such as those implemented by numpy are not inadvertently used
-        # downsteam (as these are internally implemented as custom PyObjects
+        # such as those implemented by numpy are not inadvertently used
+        # downstream (as these are internally implemented as custom PyObjects
         # whose comparison operators can incur a significant overhead).
         return str_hash
     else:
spacy/strings.pyx
@@ -119,10 +124,11 @@ cdef class StringStore:
         strings (iterable): A sequence of unicode strings to add to the store.
         """
         self.mem = Pool()
+        self._non_temp_mem = self.mem
         self._map = PreshMap()
         if strings is not None:
             for string in strings:
-                self.add(string)
+                self.add(string, allow_transient=False)
 
     def __getitem__(self, object string_or_id):
         """Retrieve a string from a given hash, or vice versa.
spacy/strings.pyx
@@ -152,14 +158,17 @@ cdef class StringStore:
                 return SYMBOLS_BY_INT[str_hash]
             else:
                 utf8str = <Utf8Str*>self._map.get(str_hash)
+                if utf8str is NULL:
+                    raise KeyError(Errors.E018.format(hash_value=string_or_id))
+                else:
+                    return decode_Utf8Str(utf8str)
         else:
             # TODO: Raise an error instead
             utf8str = <Utf8Str*>self._map.get(string_or_id)
-
-        if utf8str is NULL:
-            raise KeyError(Errors.E018.format(hash_value=string_or_id))
-        else:
-            return decode_Utf8Str(utf8str)
+            if utf8str is NULL:
+                raise KeyError(Errors.E018.format(hash_value=string_or_id))
+            else:
+                return decode_Utf8Str(utf8str)
 
     def as_int(self, key):
         """If key is an int, return it; otherwise, get the int value."""
spacy/strings.pyx
@@ -175,12 +184,46 @@ cdef class StringStore:
         else:
             return self[key]
 
-    def add(self, string):
+    def __len__(self) -> int:
+        """The number of strings in the store.
+
+        RETURNS (int): The number of strings in the store.
+        """
+        return self.keys.size() + self._transient_keys.size()
+
+    @contextmanager
+    def memory_zone(self, mem: Optional[Pool] = None) -> Pool:
+        """Begin a block where all resources allocated during the block will
+        be freed at the end of it. If a resource was created within the
+        memory zone block, accessing it outside the block is invalid.
+        Behaviour of this invalid access is undefined. Memory zones should
+        not be nested.
+
+        The memory zone is helpful for services that need to process large
+        volumes of text with a defined memory budget.
+        """
+        if mem is None:
+            mem = Pool()
+        self.mem = mem
+        yield mem
+        for key in self._transient_keys:
+            map_clear(self._map.c_map, key)
+        self._transient_keys.clear()
+        self.mem = self._non_temp_mem
+
+    def add(self, string: str, allow_transient: Optional[bool] = None) -> int:
         """Add a string to the StringStore.
 
         string (str): The string to add.
+        allow_transient (bool): Allow the string to be stored in the 'transient'
+            map, which will be flushed at the end of the memory zone. Strings
+            encountered during arbitrary text processing should be added
+            with allow_transient=True, while labels and other strings used
+            internally should not.
         RETURNS (uint64): The string's hash value.
         """
+        if allow_transient is None:
+            allow_transient = self.mem is not self._non_temp_mem
         cdef hash_t str_hash
         if isinstance(string, str):
             if string in SYMBOLS_BY_STR:
spacy/strings.pyx
@@ -188,22 +231,26 @@ cdef class StringStore:
 
             string = string.encode("utf8")
             str_hash = hash_utf8(string, len(string))
-            self._intern_utf8(string, len(string), &str_hash)
+            self._intern_utf8(string, len(string), &str_hash, allow_transient)
         elif isinstance(string, bytes):
             if string in SYMBOLS_BY_STR:
                 return SYMBOLS_BY_STR[string]
             str_hash = hash_utf8(string, len(string))
-            self._intern_utf8(string, len(string), &str_hash)
+            self._intern_utf8(string, len(string), &str_hash, allow_transient)
         else:
             raise TypeError(Errors.E017.format(value_type=type(string)))
         return str_hash
 
-    def __len__(self):
-        """The number of strings in the store.
+            if string in SYMBOLS_BY_STR:
+                return SYMBOLS_BY_STR[string]
+            else:
+                return self._intern_str(string, allow_transient)
 
-        RETURNS (int): The number of strings in the store.
-        """
-        return self.keys.size()
+        return self.keys.size() + self._transient_keys.size()
 
     def __contains__(self, string_or_id not None):
         """Check whether a string or ID is in the store.
spacy/strings.pyx
@@ -222,12 +269,17 @@ cdef class StringStore:
             pass
         else:
             # TODO: Raise an error instead
-            return self._map.get(string_or_id) is not NULL
-
+            if self._map.get(string_or_id) is not NULL:
+                return True
+            else:
+                return False
         if str_hash < len(SYMBOLS_BY_INT):
             return True
         else:
-            return self._map.get(str_hash) is not NULL
+            if self._map.get(str_hash) is not NULL:
+                return True
+            else:
+                return False
 
     def __iter__(self):
         """Iterate over the strings in the store, in order.
spacy/strings.pyx
@@ -240,12 +292,29 @@ cdef class StringStore:
             key = self.keys[i]
             utf8str = <Utf8Str*>self._map.get(key)
             yield decode_Utf8Str(utf8str)
-        # TODO: Iterate OOV here?
+        for i in range(self._transient_keys.size()):
+            key = self._transient_keys[i]
+            utf8str = <Utf8Str*>self._map.get(key)
+            yield decode_Utf8Str(utf8str)
 
     def __reduce__(self):
         strings = list(self)
         return (StringStore, (strings,), None, None, None)
 
+    def values(self) -> List[int]:
+        """Iterate over the stored strings hashes in insertion order.
+
+        RETURNS: A list of string hashes.
+        """
+        cdef int i
+        hashes = [None] * self.keys.size()
+        for i in range(self.keys.size()):
+            hashes[i] = self.keys[i]
+        transient_hashes = [None] * self._transient_keys.size()
+        for i in range(self._transient_keys.size()):
+            transient_hashes[i] = self._transient_keys[i]
+        return hashes + transient_hashes
+
     def to_disk(self, path):
         """Save the current state to a directory.
spacy/strings.pyx
@@ -269,7 +338,7 @@ cdef class StringStore:
         prev = list(self)
         self._reset_and_load(strings)
         for word in prev:
-            self.add(word)
+            self.add(word, allow_transient=False)
         return self
 
     def to_bytes(self, **kwargs):
spacy/strings.pyx
@@ -289,23 +358,25 @@ cdef class StringStore:
         prev = list(self)
         self._reset_and_load(strings)
         for word in prev:
-            self.add(word)
+            self.add(word, allow_transient=False)
         return self
 
     def _reset_and_load(self, strings):
         self.mem = Pool()
+        self._non_temp_mem = self.mem
         self._map = PreshMap()
         self.keys.clear()
+        self._transient_keys.clear()
         for string in strings:
-            self.add(string)
+            self.add(string, allow_transient=False)
 
-    cdef const Utf8Str* intern_unicode(self, str py_string):
+    cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient):
         # 0 means missing, but we don't bother offsetting the index.
         cdef bytes byte_string = py_string.encode("utf8")
-        return self._intern_utf8(byte_string, len(byte_string), NULL)
+        return self._intern_utf8(byte_string, len(byte_string), NULL, allow_transient)
 
     @cython.final
-    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash):
+    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient):
         # TODO: This function's API/behaviour is an unholy mess...
         # 0 means missing, but we don't bother offsetting the index.
         cdef hash_t key = precalculated_hash[0] if precalculated_hash is not NULL else hash_utf8(utf8_string, length)
spacy/strings.pyx
@@ -314,5 +385,8 @@ cdef class StringStore:
             return value
         value = _allocate(self.mem, <unsigned char*>utf8_string, length)
         self._map.set(key, value)
-        self.keys.push_back(key)
+        if allow_transient and self.mem is not self._non_temp_mem:
+            self._transient_keys.push_back(key)
+        else:
+            self.keys.push_back(key)
         return value
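Taken together, the StringStore changes make interning reversible inside a zone. A short usage sketch (behaviour as implied by this diff):

    from spacy.strings import StringStore

    store = StringStore()
    store.add("label", allow_transient=False)  # permanent: goes into keys
    with store.memory_zone():
        h = store.add("some-token")            # transient by default inside a zone
        assert store[h] == "some-token"
    # leaving the zone flushed the transient key; the permanent one remains
    assert "label" in store
    assert "some-token" not in store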
spacy/tests/conftest.py
@@ -81,6 +81,11 @@ def bn_tokenizer():
     return get_lang_class("bn")().tokenizer
 
 
+@pytest.fixture(scope="session")
+def bo_tokenizer():
+    return get_lang_class("bo")().tokenizer
+
+
 @pytest.fixture(scope="session")
 def ca_tokenizer():
     return get_lang_class("ca")().tokenizer
spacy/tests/lang/bo/__init__.py (new empty file)
spacy/tests/lang/bo/test_text.py (new file, 21 lines)
@@ -0,0 +1,21 @@
import pytest


@pytest.mark.parametrize(
    "text,match",
    [
        ("10", True),
        ("1", True),
        ("999.0", True),
        ("གཅིག་", True),
        ("གཉིས་", True),
        ("ཀླད་ཀོར་", True),
        ("བཅུ་གཅིག་", True),
        ("ཁྱི་", False),
        (",", False),
    ],
)
def test_lex_attrs_like_number(bo_tokenizer, text, match):
    tokens = bo_tokenizer(text)
    assert len(tokens) == 1
    assert tokens[0].like_num == match
spacy/tests/lang/kmr/__init__.py (new empty file)
spacy/tests/lang/kmr/test_text.py (new file, 27 lines)
@@ -0,0 +1,27 @@
import pytest

from spacy.lang.kmr.lex_attrs import like_num


@pytest.mark.parametrize(
    "word",
    [
        "yekem",
        "duyemîn",
        "100em",
        "dehem",
        "sedemîn",
        "34em",
        "30yem",
        "20emîn",
        "50yemîn",
    ],
)
def test_kmr_lex_attrs_like_number_for_ordinal(word):
    assert like_num(word)


@pytest.mark.parametrize("word", ["deh"])
def test_kmr_lex_attrs_capitals(word):
    assert like_num(word)
    assert like_num(word.upper())
spacy/tests/lang/test_initialize.py
@@ -10,7 +10,7 @@ LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el",
              "hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv",
              "mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
              "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn",
-             "tr", "tt", "uk", "ur", "xx", "yo"]
+             "tr", "tt", "uk", "ur", "xx", "yo", "kmr"]
 # fmt: on
@@ -18,6 +18,7 @@ LANGUAGES = [
     pytest.param("ar", marks=pytest.mark.slow()),
     pytest.param("bg", marks=pytest.mark.slow()),
     "bn",
+    pytest.param("bo", marks=pytest.mark.slow()),
     pytest.param("ca", marks=pytest.mark.slow()),
     pytest.param("cs", marks=pytest.mark.slow()),
     pytest.param("da", marks=pytest.mark.slow()),
@@ -57,6 +58,7 @@ LANGUAGES = [
     pytest.param("tr", marks=pytest.mark.slow()),
     pytest.param("tt", marks=pytest.mark.slow()),
     pytest.param("ur", marks=pytest.mark.slow()),
+    pytest.param("kmr", marks=pytest.mark.slow()),
 ]
spacy/tests/vocab_vectors/test_memory_zone.py (new file, 36 lines)
@@ -0,0 +1,36 @@
from spacy.vocab import Vocab


def test_memory_zone_no_insertion():
    vocab = Vocab()
    with vocab.memory_zone():
        pass
    lex = vocab["horse"]
    assert lex.text == "horse"


def test_memory_zone_insertion():
    vocab = Vocab()
    _ = vocab["dog"]
    assert "dog" in vocab
    assert "horse" not in vocab
    with vocab.memory_zone():
        lex = vocab["horse"]
        assert lex.text == "horse"
    assert "dog" in vocab
    assert "horse" not in vocab


def test_memory_zone_redundant_insertion():
    """Test that if we insert an already-existing word while
    in the memory zone, it stays persistent"""
    vocab = Vocab()
    _ = vocab["dog"]
    assert "dog" in vocab
    assert "horse" not in vocab
    with vocab.memory_zone():
        lex = vocab["horse"]
        assert lex.text == "horse"
        _ = vocab["dog"]
    assert "dog" in vocab
    assert "horse" not in vocab
spacy/tokenizer.pxd
@@ -25,9 +25,7 @@ cdef class Tokenizer:
     cdef PhraseMatcher _special_matcher
     # TODO convert to bool in v4
     cdef int _faster_heuristics
-    # TODO next one is unused and should be removed in v4
-    # https://github.com/explosion/spaCy/pull/9150
-    cdef int _unused_int2
+    cdef public int max_cache_size
 
     cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
     cdef int _apply_special_cases(self, Doc doc) except -1
spacy/tokenizer.pyx
@@ -30,7 +30,7 @@ cdef class Tokenizer:
     """
     def __init__(self, Vocab vocab, rules=None, prefix_search=None,
                  suffix_search=None, infix_finditer=None, token_match=None,
-                 url_match=None, faster_heuristics=True):
+                 url_match=None, faster_heuristics=True, max_cache_size=10000):
         """Create a `Tokenizer`, to create `Doc` objects given unicode text.
 
         vocab (Vocab): A storage container for lexical types.
@@ -50,6 +50,7 @@ cdef class Tokenizer:
         faster_heuristics (bool): Whether to restrict the final
             Matcher-based pass for rules to those containing affixes or space.
             Defaults to True.
+        max_cache_size (int): Maximum number of tokenization chunks to cache.
 
         EXAMPLE:
             >>> tokenizer = Tokenizer(nlp.vocab)
@@ -69,6 +70,7 @@ cdef class Tokenizer:
         self._rules = {}
         self._special_matcher = PhraseMatcher(self.vocab)
         self._load_special_cases(rules)
+        self.max_cache_size = max_cache_size
 
     @property
     def token_match(self):
spacy/tokenizer.pyx
@@ -397,8 +399,9 @@ cdef class Tokenizer:
                                       has_special, with_special_cases)
             self._attach_tokens(tokens, span, &prefixes, &suffixes, has_special,
                                 with_special_cases)
-            self._save_cached(&tokens.c[orig_size], orig_key, has_special,
-                              tokens.length - orig_size)
+            if len(self._cache) < self.max_cache_size:
+                self._save_cached(&tokens.c[orig_size], orig_key, has_special,
+                                  tokens.length - orig_size)
 
     cdef str _split_affixes(
         self,
@@ -514,9 +517,8 @@ cdef class Tokenizer:
         if n <= 0:
             # avoid mem alloc of zero length
             return 0
-        for i in range(n):
-            if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
-                return 0
+        if self.vocab.in_memory_zone:
+            return 0
         # See #1250
         if has_special[0]:
             return 0
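The cache cap is exposed on the constructor, so a service can bound tokenizer memory directly. A short sketch (the 10000 default comes from the new signature above):

    from spacy.lang.en import English
    from spacy.tokenizer import Tokenizer

    nlp = English()
    tokenizer = Tokenizer(nlp.vocab, max_cache_size=1000)
    doc = tokenizer("The tokenizer stops caching once the cap is reached.")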
spacy/vocab.pxd
@@ -41,7 +41,9 @@ cdef class Vocab:
     cdef const TokenC* make_fused_token(self, substrings) except NULL
 
-    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
-    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
+    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1
+    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
 
     cdef PreshMap _by_orth
+    cdef Pool _non_temp_mem
+    cdef vector[attr_t] _transient_orths
spacy/vocab.pyi
@@ -1,6 +1,8 @@
+from contextlib import contextmanager
 from pathlib import Path
 from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Union
 
+from cymem.cymem import Pool
 from thinc.types import Floats1d, FloatsXd
 
 from . import Language
@@ -67,6 +69,8 @@ class Vocab:
     def from_bytes(
         self, bytes_data: bytes, *, exclude: Iterable[str] = ...
     ) -> Vocab: ...
+    @contextmanager
+    def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]: ...
 
 def pickle_vocab(vocab: Vocab) -> Any: ...
 def unpickle_vocab(
spacy/vocab.pyx
@@ -1,8 +1,11 @@
 import functools
+from contextlib import ExitStack, contextmanager
+from typing import Iterator, Optional
 
 import numpy
 import srsly
 from thinc.api import get_array_module, get_current_ops
+from preshed.maps cimport map_clear
 
 from .attrs cimport LANG, ORTH
 from .lexeme cimport EMPTY_LEXEME, OOV_RANK, Lexeme
spacy/vocab.pyx
@@ -87,6 +90,12 @@ cdef class Vocab:
         self.lookups = lookups
         self.writing_system = writing_system
         self.get_noun_chunks = get_noun_chunks
+        # During a memory_zone we replace our mem object with one
+        # that's passed to us. We keep a reference to our non-temporary
+        # memory here, in case we need to make an allocation we want to
+        # guarantee is not temporary. This is also how we check whether
+        # we're in a memory zone: we check whether self.mem is self._non_temp_mem
+        self._non_temp_mem = self.mem
 
     @property
     def vectors(self):
spacy/vocab.pyx
@@ -96,7 +105,7 @@ cdef class Vocab:
     def vectors(self, vectors):
         if hasattr(vectors, "strings"):
             for s in vectors.strings:
-                self.strings.add(s)
+                self.strings.add(s, allow_transient=False)
         self._vectors = vectors
         self._vectors.strings = self.strings
spacy/vocab.pyx
@@ -107,6 +116,10 @@ cdef class Vocab:
         langfunc = self.lex_attr_getters.get(LANG, None)
         return langfunc("_") if langfunc else ""
 
+    @property
+    def in_memory_zone(self) -> bool:
+        return self.mem is not self._non_temp_mem
+
     def __len__(self):
         """The current number of lexemes stored.
spacy/vocab.pyx
@@ -114,6 +127,33 @@ cdef class Vocab:
         """
         return self.length
 
+    @contextmanager
+    def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]:
+        """Begin a block where resources allocated during the block will
+        be freed at the end of it. If a resource was created within the
+        memory zone block, accessing it outside the block is invalid.
+        Behaviour of this invalid access is undefined. Memory zones should
+        not be nested.
+
+        The memory zone is helpful for services that need to process large
+        volumes of text with a defined memory budget.
+        """
+        if mem is None:
+            mem = Pool()
+        # The ExitStack allows programmatic nested context managers.
+        # We don't know how many we need, so it would be awkward to have
+        # them as nested blocks.
+        with ExitStack() as stack:
+            contexts = [stack.enter_context(self.strings.memory_zone(mem))]
+            if hasattr(self.morphology, "memory_zone"):
+                contexts.append(stack.enter_context(self.morphology.memory_zone(mem)))
+            if hasattr(self._vectors, "memory_zone"):
+                contexts.append(stack.enter_context(self._vectors.memory_zone(mem)))
+            self.mem = mem
+            yield mem
+            self._clear_transient_orths()
+            self.mem = self._non_temp_mem
+
     def add_flag(self, flag_getter, int flag_id=-1):
         """Set a new boolean flag to words in the vocabulary.
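The tests added earlier in this diff pin down the Vocab-level contract; in short:

    from spacy.vocab import Vocab

    vocab = Vocab()
    with vocab.memory_zone():
        lex = vocab["horse"]        # allocated from the zone's pool
        assert lex.text == "horse"  # valid while the zone is open
    assert "horse" not in vocab     # transient lexeme flushed on exit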
spacy/vocab.pyx
@@ -148,8 +188,7 @@ cdef class Vocab:
 
     cdef const LexemeC* get(self, Pool mem, str string) except NULL:
         """Get a pointer to a `LexemeC` from the lexicon, creating a new
-        `Lexeme` if necessary using memory acquired from the given pool. If the
-        pool is the lexicon's own memory, the lexeme is saved in the lexicon.
+        `Lexeme` if necessary.
         """
         if string == "":
             return &EMPTY_LEXEME
spacy/vocab.pyx
@@ -180,19 +219,11 @@ cdef class Vocab:
         return self._new_lexeme(mem, self.strings[orth])
 
     cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
-        # I think this heuristic is bad, and the Vocab should always
-        # own the lexemes. It avoids weird bugs this way, as it's how the thing
-        # was originally supposed to work. The best solution to the growing
-        # memory use is to periodically reset the vocab, which is an action
-        # that should be up to the user to do (so we don't need to keep track
-        # of the doc ownership).
-        # TODO: Change the C API so that the mem isn't passed in here.
+        # The mem argument is deprecated, replaced by memory zones. Same with
+        # this size heuristic.
         mem = self.mem
         # if len(string) < 3 or self.length < 10000:
         #     mem = self.mem
         cdef bint is_oov = mem is not self.mem
         lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
-        lex.orth = self.strings.add(string)
+        lex.orth = self.strings.add(string, allow_transient=True)
         lex.length = len(string)
         if self.vectors is not None and hasattr(self.vectors, "key2row"):
             lex.id = self.vectors.key2row.get(lex.orth, OOV_RANK)
spacy/vocab.pyx
@@ -202,18 +233,25 @@ cdef class Vocab:
         for attr, func in self.lex_attr_getters.items():
             value = func(string)
             if isinstance(value, str):
-                value = self.strings.add(value)
+                value = self.strings.add(value, allow_transient=True)
             if value is not None:
                 Lexeme.set_struct_attr(lex, attr, value)
         if not is_oov:
-            self._add_lex_to_vocab(lex.orth, lex)
+            self._add_lex_to_vocab(lex.orth, lex, self.mem is not self._non_temp_mem)
         if lex == NULL:
             raise ValueError(Errors.E085.format(string=string))
         return lex
 
-    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
+    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1:
         self._by_orth.set(lex.orth, <void*>lex)
         self.length += 1
+        if is_transient and self.in_memory_zone:
+            self._transient_orths.push_back(lex.orth)
+
+    def _clear_transient_orths(self):
+        """Remove transient lexemes from the index (generally at the end of the memory zone)"""
+        for orth in self._transient_orths:
+            map_clear(self._by_orth.c_map, orth)
+        self._transient_orths.clear()
 
     def __contains__(self, key):
         """Check whether the string or int key has an entry in the vocabulary.
spacy/vocab.pyx
@@ -265,7 +303,7 @@ cdef class Vocab:
         """
         cdef attr_t orth
         if isinstance(id_or_string, str):
-            orth = self.strings.add(id_or_string)
+            orth = self.strings.add(id_or_string, allow_transient=True)
         else:
            orth = id_or_string
         return Lexeme(self, orth)
spacy/vocab.pyx
@@ -417,7 +455,7 @@ cdef class Vocab:
         DOCS: https://spacy.io/api/vocab#get_vector
         """
         if isinstance(orth, str):
-            orth = self.strings.add(orth)
+            orth = self.strings.add(orth, allow_transient=True)
         cdef Lexeme lex = self[orth]
         key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
         if self.has_vector(key):
spacy/vocab.pyx
@@ -436,7 +474,7 @@ cdef class Vocab:
         DOCS: https://spacy.io/api/vocab#set_vector
         """
         if isinstance(orth, str):
-            orth = self.strings.add(orth)
+            orth = self.strings.add(orth, allow_transient=False)
         cdef Lexeme lex = self[orth]
         key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
         if self.vectors.is_full and key not in self.vectors:
spacy/vocab.pyx
@@ -460,7 +498,7 @@ cdef class Vocab:
         DOCS: https://spacy.io/api/vocab#has_vector
         """
         if isinstance(orth, str):
-            orth = self.strings.add(orth)
+            orth = self.strings.add(orth, allow_transient=True)
         cdef Lexeme lex = self[orth]
         key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
         return key in self.vectors
website/meta/languages.json
@@ -31,6 +31,12 @@
             "name": "Bengali",
             "has_examples": true
         },
+        {
+            "code": "bo",
+            "name": "Tibetan",
+            "example": "འདི་ཚིག་གྲུབ་རེད།",
+            "has_examples": true
+        },
         {
             "code": "ca",
             "name": "Catalan",
@@ -480,6 +486,12 @@
             ],
             "example": "这是一个用于示例的句子。",
             "has_examples": true
         },
+        {
+            "code": "kmr",
+            "name": "Kurdish Kurmanji",
+            "example": "Ev hevokek e",
+            "has_examples": true
+        }
     ],
     "licenses": [
(File diff suppressed because it is too large.)
@@ -58,8 +58,8 @@ const AlertSpace = ({ nightly, legacy }) => {
     }
 
     const navAlert = (
-        <Link to="https://form.typeform.com/to/WlflqP1b" noLinkLayout>
-            💥 Interested in <strong>Premium spaCy Models</strong>?
+        <Link to="https://explosion.ai/blog/sp-global-commodities" noLinkLayout>
+            💥 <strong>New:</strong> Case study with S&P Global
         </Link>
     )