diff --git a/.github/workflows/cibuildwheel.yml b/.github/workflows/cibuildwheel.yml new file mode 100644 index 000000000..c5676ce49 --- /dev/null +++ b/.github/workflows/cibuildwheel.yml @@ -0,0 +1,92 @@ +name: Build + +on: + push: + tags: + # Note: GitHub's tag filters use their own glob-like syntax, not regex. + # ** matches 'zero or more of any character' + - 'release-v[0-9]+.[0-9]+.[0-9]+**' + - 'prerelease-v[0-9]+.[0-9]+.[0-9]+**' +jobs: + build_wheels: + name: Build wheels on ${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + # macos-13 is an intel runner, macos-14 is apple silicon + os: [ubuntu-latest, windows-latest, macos-13] + + steps: + - uses: actions/checkout@v4 + - name: Build wheels + uses: pypa/cibuildwheel@v2.19.1 + env: + CIBW_SOME_OPTION: value + with: + package-dir: . + output-dir: wheelhouse + config-file: "{package}/pyproject.toml" + - uses: actions/upload-artifact@v4 + with: + name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }} + path: ./wheelhouse/*.whl + + build_sdist: + name: Build source distribution + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Build sdist + run: pipx run build --sdist + - uses: actions/upload-artifact@v4 + with: + name: cibw-sdist + path: dist/*.tar.gz + create_release: + needs: [build_wheels, build_sdist] + runs-on: ubuntu-latest + permissions: + contents: write + checks: write + actions: read + issues: read + packages: write + pull-requests: read + repository-projects: read + statuses: read + steps: + - name: Get the tag name and determine if it's a prerelease + id: get_tag_info + run: | + FULL_TAG=${GITHUB_REF#refs/tags/} + if [[ $FULL_TAG == release-* ]]; then + TAG_NAME=${FULL_TAG#release-} + IS_PRERELEASE=false + elif [[ $FULL_TAG == prerelease-* ]]; then + TAG_NAME=${FULL_TAG#prerelease-} + IS_PRERELEASE=true + else + echo "Tag does not match expected patterns" >&2 + exit 1 + fi + echo "FULL_TAG=$FULL_TAG" >> $GITHUB_ENV + echo "TAG_NAME=$TAG_NAME" >> $GITHUB_ENV + echo "IS_PRERELEASE=$IS_PRERELEASE" >> $GITHUB_ENV + - uses: actions/download-artifact@v4 + with: + # unpacks all CIBW artifacts into dist/ + pattern: cibw-* + path: dist + merge-multiple: true + - name: Create Draft Release + id: create_release + uses: softprops/action-gh-release@v2 + if: startsWith(github.ref, 'refs/tags/') + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + name: ${{ env.TAG_NAME }} + draft: true + prerelease: ${{ env.IS_PRERELEASE }} + files: "./dist/*" diff --git a/.github/workflows/gputests.yml b/.github/workflows/gputests.yml.disabled similarity index 100% rename from .github/workflows/gputests.yml rename to .github/workflows/gputests.yml.disabled diff --git a/.github/workflows/publish_pypi.yml b/.github/workflows/publish_pypi.yml new file mode 100644 index 000000000..9f432874c --- /dev/null +++ b/.github/workflows/publish_pypi.yml @@ -0,0 +1,29 @@ +# The cibuildwheel action triggers on creation of a release; this one +# triggers on publication. +# The expected workflow is to create a draft release and let the wheels +# upload, and then hit 'publish', which uploads to PyPI.
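As a quick sanity check, the `create_release` tag-parsing step above can be mirrored outside CI. A minimal Python sketch of the same logic (illustrative only, not part of the patch; the tag values are made-up examples):

```python
def parse_tag(full_tag: str):
    """Mirror the create_release shell step: strip the release-/prerelease-
    prefix and derive the prerelease flag."""
    if full_tag.startswith("release-"):
        return full_tag[len("release-"):], False
    if full_tag.startswith("prerelease-"):
        return full_tag[len("prerelease-"):], True
    raise ValueError("Tag does not match expected patterns")


assert parse_tag("release-v3.8.0") == ("v3.8.0", False)
assert parse_tag("prerelease-v3.8.0.dev0") == ("v3.8.0.dev0", True)
```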
+ +on: + release: + types: + - published + +jobs: + upload_pypi: + runs-on: ubuntu-latest + environment: + name: pypi + url: https://pypi.org/p/spacy + permissions: + id-token: write + contents: read + if: github.event_name == 'release' && github.event.action == 'published' + # or, alternatively, upload to PyPI on every tag starting with 'v' (remove on: release above to use this) + # if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + steps: + - uses: robinraju/release-downloader@v1 + with: + tag: ${{ github.event.release.tag_name }} + fileName: '*' + out-file-path: 'dist' + - uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/slowtests.yml b/.github/workflows/slowtests.yml.disabled similarity index 100% rename from .github/workflows/slowtests.yml rename to .github/workflows/slowtests.yml.disabled diff --git a/pyproject.toml b/pyproject.toml index bfd7e68d1..07ffe1677 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,5 +11,58 @@ requires = [ ] build-backend = "setuptools.build_meta" +[tool.cibuildwheel] +build = "*" +skip = "pp* cp36* cp37* cp38* *-win32" +test-skip = "" +free-threaded-support = false + +archs = ["native"] + +build-frontend = "default" +config-settings = {} +dependency-versions = "pinned" +environment = { PIP_CONSTRAINT = "build-constraints.txt" } + +environment-pass = [] +build-verbosity = 0 + +before-all = "curl https://sh.rustup.rs -sSf | sh -s -- -y --profile minimal --default-toolchain stable" +before-build = "pip install -r requirements.txt && python setup.py clean" +repair-wheel-command = "" + +test-command = "" +before-test = "" +test-requires = [] +test-extras = [] + +container-engine = "docker" + +manylinux-x86_64-image = "manylinux2014" +manylinux-i686-image = "manylinux2014" +manylinux-aarch64-image = "manylinux2014" +manylinux-ppc64le-image = "manylinux2014" +manylinux-s390x-image = "manylinux2014" +manylinux-pypy_x86_64-image = "manylinux2014" +manylinux-pypy_i686-image = "manylinux2014" +manylinux-pypy_aarch64-image = "manylinux2014" + +musllinux-x86_64-image = "musllinux_1_2" +musllinux-i686-image = "musllinux_1_2" +musllinux-aarch64-image = "musllinux_1_2" +musllinux-ppc64le-image = "musllinux_1_2" +musllinux-s390x-image = "musllinux_1_2" + +[tool.cibuildwheel.linux] +repair-wheel-command = "auditwheel repair -w {dest_dir} {wheel}" + +[tool.cibuildwheel.macos] +repair-wheel-command = "delocate-wheel --require-archs {delocate_archs} -w {dest_dir} -v {wheel}" + +[tool.cibuildwheel.windows] + +[tool.cibuildwheel.pyodide] + + [tool.isort] profile = "black" diff --git a/requirements.txt b/requirements.txt index 2ad92176d..7e7144d53 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,7 +22,6 @@ langcodes>=3.2.0,<4.0.0 # Official Python utilities setuptools packaging>=20.0 -typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8" # Development dependencies pre-commit>=2.13.0 cython>=0.25,<3.0 diff --git a/setup.cfg b/setup.cfg index ca8f64548..2917f67ed 100644 --- a/setup.cfg +++ b/setup.cfg @@ -66,7 +66,6 @@ install_requires = # Official Python utilities setuptools packaging>=20.0 - typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8" langcodes>=3.2.0,<4.0.0 [options.entry_points] diff --git a/spacy/about.py b/spacy/about.py index b7fd3751a..942a73194 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,5 +1,5 @@ # fmt: off __title__ = "spacy" -__version__ = "3.7.5" +__version__ = "3.8.0.dev0" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = 
"https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" diff --git a/spacy/lang/bo/__init__.py b/spacy/lang/bo/__init__.py new file mode 100644 index 000000000..84ef8c086 --- /dev/null +++ b/spacy/lang/bo/__init__.py @@ -0,0 +1,16 @@ +from ...language import BaseDefaults, Language +from .lex_attrs import LEX_ATTRS +from .stop_words import STOP_WORDS + + +class TibetanDefaults(BaseDefaults): + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS + + +class Tibetan(Language): + lang = "bo" + Defaults = TibetanDefaults + + +__all__ = ["Tibetan"] diff --git a/spacy/lang/bo/examples.py b/spacy/lang/bo/examples.py new file mode 100644 index 000000000..8ed9372ec --- /dev/null +++ b/spacy/lang/bo/examples.py @@ -0,0 +1,16 @@ +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.bo.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "དོན་དུ་རྒྱ་མཚོ་བླ་མ་ཞེས་བྱ་ཞིང༌།", + "ཏཱ་ལའི་ཞེས་པ་ནི་སོག་སྐད་ཡིན་པ་དེ་བོད་སྐད་དུ་རྒྱ་མཚོའི་དོན་དུ་འཇུག", + "སོག་པོ་ཨལ་ཐན་རྒྱལ་པོས་རྒྱལ་དབང་བསོད་ནམས་རྒྱ་མཚོར་ཆེ་བསྟོད་ཀྱི་མཚན་གསོལ་བ་ཞིག་ཡིན་ཞིང༌།", + "རྗེས་སུ་རྒྱལ་བ་དགེ་འདུན་གྲུབ་དང༌། དགེ་འདུན་རྒྱ་མཚོ་སོ་སོར་ཡང་ཏཱ་ལའི་བླ་མའི་སྐུ་ཕྲེང་དང་པོ་དང༌།", + "གཉིས་པའི་མཚན་དེ་གསོལ་ཞིང༌།༸རྒྱལ་དབང་སྐུ་ཕྲེང་ལྔ་པས་དགའ་ལྡན་ཕོ་བྲང་གི་སྲིད་དབང་བཙུགས་པ་ནས་ཏཱ་ལའི་བླ་མ་ནི་བོད་ཀྱི་ཆོས་སྲིད་གཉིས་ཀྱི་དབུ་ཁྲིད་དུ་གྱུར་ཞིང་།", + "ད་ལྟའི་བར་ཏཱ་ལའི་བླ་མ་སྐུ་ཕྲེང་བཅུ་བཞི་བྱོན་ཡོད།", +] diff --git a/spacy/lang/bo/lex_attrs.py b/spacy/lang/bo/lex_attrs.py new file mode 100644 index 000000000..5535934af --- /dev/null +++ b/spacy/lang/bo/lex_attrs.py @@ -0,0 +1,65 @@ +from ...attrs import LIKE_NUM + +# reference 1: https://en.wikipedia.org/wiki/Tibetan_numerals + +_num_words = [ + "ཀླད་ཀོར་", + "གཅིག་", + "གཉིས་", + "གསུམ་", + "བཞི་", + "ལྔ་", + "དྲུག་", + "བདུན་", + "བརྒྱད་", + "དགུ་", + "བཅུ་", + "བཅུ་གཅིག་", + "བཅུ་གཉིས་", + "བཅུ་གསུམ་", + "བཅུ་བཞི་", + "བཅུ་ལྔ་", + "བཅུ་དྲུག་", + "བཅུ་བདུན་", + "བཅུ་པརྒྱད", + "བཅུ་དགུ་", + "ཉི་ཤུ་", + "སུམ་ཅུ", + "བཞི་བཅུ", + "ལྔ་བཅུ", + "དྲུག་ཅུ", + "བདུན་ཅུ", + "བརྒྱད་ཅུ", + "དགུ་བཅུ", + "བརྒྱ་", + "སྟོང་", + "ཁྲི་", + "ས་ཡ་", + " བྱེ་བ་", + "དུང་ཕྱུར་", + "ཐེར་འབུམ་", + "ཐེར་འབུམ་ཆེན་པོ་", + "ཁྲག་ཁྲིག་", + "ཁྲག་ཁྲིག་ཆེན་པོ་", +] + + +def like_num(text): + """ + Check if text resembles a number + """ + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + if text in _num_words: + return True + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/bo/stop_words.py b/spacy/lang/bo/stop_words.py new file mode 100644 index 000000000..407242c84 --- /dev/null +++ b/spacy/lang/bo/stop_words.py @@ -0,0 +1,198 @@ +# Source: https://zenodo.org/records/10148636 + +STOP_WORDS = set( + """ +འི་ +། +དུ་ +གིས་ +སོགས་ +ཏེ +གི་ +རྣམས་ +ནི +ཀུན་ +ཡི་ +འདི +ཀྱི་ +སྙེད་ +པས་ +གཞན་ +ཀྱིས་ +ཡི +ལ +ནི་ +དང་ +སོགས +ཅིང་ +ར +དུ +མི་ +སུ་ +བཅས་ +ཡོངས་ +ལས +ཙམ་ +གྱིས་ +དེ་ +ཡང་ +མཐའ་དག་ +ཏུ་ +ཉིད་ +ས +ཏེ་ +གྱི་ +སྤྱི +དེ +ཀ་ +ཡིན་ +ཞིང་ +འདི་ +རུང་ +རང་ +ཞིག་ +སྟེ +སྟེ་ +ན་རེ +ངམ +ཤིང་ +དག་ +ཏོ +རེ་ +འང་ +ཀྱང་ +ལགས་པ +ཚུ +དོ +ཡིན་པ +རེ +ན་རེ་ +ཨེ་ +ཚང་མ +ཐམས་ཅད་ +དམ་ +འོ་ +ཅིག་ +གྱིན་ +ཡིན +ན +ཁོ་ན་ +འམ་ +ཀྱིན་ +ལོ +ཀྱིས +བས་ +ལགས་ +ཤིག +གིས +ཀི་ +སྣ་ཚོགས་ +རྣམས +སྙེད་པ +ཡིས་ +གྱི +གི +བམ་ +ཤིག་ +རེ་རེ་ +ནམ +མིན་ +ནམ་ +ངམ་ +རུ་ +འགའ་ +ཀུན +ཤས་ +ཏུ +ཡིས +གིན་ +གམ་ +འོ +ཡིན་པ་ +མིན +ལགས +གྱིས +ཅང་ +འགའ +སམ་ +ཞིག +འང +ལས་ཆེ་ 
+འཕྲལ་ +བར་ +རུ +དང +ཡ +འག +སམ +ཀ +ཅུང་ཟད་ +ཅིག +ཉིད +དུ་མ +མ +ཡིན་བ +འམ +མམ +དམ +དག +ཁོ་ན +ཀྱི +ལམ +ཕྱི་ +ནང་ +ཙམ +ནོ་ +སོ་ +རམ་ +བོ་ +ཨང་ +ཕྱི +ཏོ་ +ཚོ +ལ་ལ་ +ཚོ་ +ཅིང +མ་གི་ +གེ +གོ +ཡིན་ལུགས་ +རོ་ +བོ +ལགས་པ་ +པས +རབ་ +འི +རམ +བས +གཞན +སྙེད་པ་ +འབའ་ +མཾ་ +པོ +ག་ +ག +གམ +སྤྱི་ +བམ +མོ་ +ཙམ་པ་ +ཤ་སྟག་ +མམ་ +རེ་རེ +སྙེད +ཏམ་ +ངོ +གྲང་ +ཏ་རེ +ཏམ +ཁ་ +ངེ་ +ཅོག་ +རིལ་ +ཉུང་ཤས་ +གིང་ +ཚ་ +ཀྱང +""".split() +) diff --git a/spacy/lang/gd/__init__.py b/spacy/lang/gd/__init__.py new file mode 100644 index 000000000..048a3a071 --- /dev/null +++ b/spacy/lang/gd/__init__.py @@ -0,0 +1,18 @@ +from typing import Optional + +from ...language import BaseDefaults, Language +from .stop_words import STOP_WORDS +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS + + +class ScottishDefaults(BaseDefaults): + tokenizer_exceptions = TOKENIZER_EXCEPTIONS + stop_words = STOP_WORDS + + +class Scottish(Language): + lang = "gd" + Defaults = ScottishDefaults + + +__all__ = ["Scottish"] diff --git a/spacy/lang/gd/stop_words.py b/spacy/lang/gd/stop_words.py new file mode 100644 index 000000000..9f5a66cbc --- /dev/null +++ b/spacy/lang/gd/stop_words.py @@ -0,0 +1,388 @@ +STOP_WORDS = set( + """ +'ad +'ar +'d # iad +'g # ag +'ga +'gam +'gan +'gar +'gur +'m # am +'n # an +'n seo +'na +'nad +'nam +'nan +'nar +'nuair +'nur +'s +'sa +'san +'sann +'se +'sna +a +a' +a'd # agad +a'm # agam +a-chèile +a-seo +a-sin +a-siud +a chionn +a chionn 's +a chèile +a chéile +a dh' +a h-uile +a seo +ac' # aca +aca +aca-san +acasan +ach +ag +agad +agad-sa +agads' +agadsa +agaibh +agaibhse +againn +againne +agam +agam-sa +agams' +agamsa +agus +aice +aice-se +aicese +aig +aig' # aige +aige +aige-san +aigesan +air +air-san +air neo +airsan +am +an +an seo +an sin +an siud +an uair +ann +ann a +ann a' +ann a shin +ann am +ann an +annad +annam +annam-s' +annamsa +anns +anns an +annta +aon +ar +as +asad +asda +asta +b' +bho +bhon +bhuaidhe # bhuaithe +bhuainn +bhuaipe +bhuaithe +bhuapa +bhur +brì +bu +c'à +car son +carson +cha +chan +chionn +choir +chon +chun +chèile +chéile +chòir +cia mheud +ciamar +co-dhiubh +cuide +cuin +cuin' +cuine +cà +cà' +càil +càit +càit' +càite +cò +cò mheud +có +d' +da +de +dh' +dha +dhaibh +dhaibh-san +dhaibhsan +dhan +dhasan +dhe +dhen +dheth +dhi +dhiom +dhiot +dhith +dhiubh +dhomh +dhomh-s' +dhomhsa +dhu'sa # dhut-sa +dhuibh +dhuibhse +dhuinn +dhuinne +dhuit +dhut +dhutsa +dhut-sa +dhà +dhà-san +dhàsan +dhòmhsa +diubh +do +docha +don +dà +dè +dè mar +dé +dé mar +dòch' +dòcha +e +eadar +eatarra +eatorra +eile +esan +fa +far +feud +fhad +fheudar +fhearr +fhein +fheudar +fheàrr +fhèin +fhéin +fhìn +fo +fodha +fodhainn +foipe +fon +fèin +ga +gach +gam +gan +ge brith +ged +gu +gu dè +gu ruige +gun +gur +gus +i +iad +iadsan +innte +is +ise +le +leam +leam-sa +leamsa +leat +leat-sa +leatha +leatsa +leibh +leis +leis-san +leoth' +leotha +leotha-san +linn +m' +m'a +ma +mac +man +mar +mas +mathaid +mi +mis' +mise +mo +mu +mu 'n +mun +mur +mura +mus +na +na b' +na bu +na iad +nach +nad +nam +nan +nar +nas +neo +no +nuair +o +o'n +oir +oirbh +oirbh-se +oirnn +oirnne +oirre +on +orm +orm-sa +ormsa +orra +orra-san +orrasan +ort +os +r' +ri +ribh +rinn +ris +rithe +rithe-se +rium +rium-sa +riums' +riumsa +riut +riuth' +riutha +riuthasan +ro +ro'n +roimh +roimhe +romhainn +romham +romhpa +ron +ruibh +ruinn +ruinne +sa +san +sann +se +seach +seo +seothach +shin +sibh +sibh-se +sibhse +sin +sineach +sinn +sinne +siod +siodach +siud +siudach +sna # ann an +sè +t' +tarsaing +tarsainn +tarsuinn +thar +thoigh +thro +thu +thuc' 
thuca +thugad +thugaibh +thugainn +thugam +thugamsa +thuice +thuige +thus' +thusa +timcheall +toigh +toil +tro +tro' # troimh +troimh +troimhe +tron +tu +tusa +uair +ud +ugaibh +ugam-s' +ugam-sa +uice +uige +uige-san +umad +unnta # ann an +ur +urrainn +à +às +àsan +á +ás +è +ì +ò +ó +""".split( + "\n" + ) +) diff --git a/spacy/lang/gd/tokenizer_exceptions.py b/spacy/lang/gd/tokenizer_exceptions.py new file mode 100644 index 000000000..76e169d90 --- /dev/null +++ b/spacy/lang/gd/tokenizer_exceptions.py @@ -0,0 +1,1983 @@ +from ...symbols import NORM, ORTH +from ...util import update_exc +from ..tokenizer_exceptions import BASE_EXCEPTIONS + +""" + All rules and exceptions were taken from the "Gaelic Orthographic Conventions +of 2009" (GOC) and from the "Annotated Reference Corpus of Scottish Gaelic" (ARCOSG). I did +my best to ensure this tokenizer produces text as close as possible to the +tokenization of the ARCOSG and the conventions in the GOC. + + +ARCOSG: https://github.com/Gaelic-Algorithmic-Research-Group/ARCOSG +GOC: https://www.gaidhlig.scot/wp-content/uploads/2021/03/GOC-2009-English.pdf +""" + +# Compound words +_exc = { + "càil": [{ORTH: "cà", NORM: "càite"}, {ORTH: "il", NORM: "bheil"}], + "sna": [{ORTH: "s", NORM: "anns"}, {ORTH: "na", NORM: "na"}], + "orra": [{ORTH: "orr", NORM: "air"}, {ORTH: "a", NORM: "do"}], + "fiùs": [{ORTH: "fiù", NORM: "fiù"}, {ORTH: "s", NORM: "'s"}], +} + + +# Hyphenations that are alternative forms of words +for exc_data in [ + {ORTH: "fa-near", NORM: "fainear"}, + {ORTH: "Fa-near", NORM: "Fainear"}, +]: + _exc[exc_data[ORTH]] = [exc_data] + + +# Abbreviations and shortened words +for exc_data in [ + {ORTH: "'", NORM: "a"}, + {ORTH: "'S", NORM: "Agus"}, + {ORTH: "'s", NORM: "agus"}, + {ORTH: "B'", NORM: "Bu"}, + {ORTH: "b'", NORM: "bu"}, + {ORTH: "D'", NORM: "Do"}, + {ORTH: "d'", NORM: "do"}, + {ORTH: "'M", NORM: "Am"}, + {ORTH: "'m", NORM: "am"}, + {ORTH: "M'", NORM: "Mo"}, + {ORTH: "m'", NORM: "mo"}, + {ORTH: "'n", NORM: "an"}, + {ORTH: "'N", NORM: "An"}, + {ORTH: "Th'", NORM: "Tha"}, + {ORTH: "th'", NORM: "tha"}, +]: + _exc[exc_data[ORTH]] = [exc_data] + + +# Words with a leading apostrophe +for orth in """ + 'ac + 'Ac + 'ad + 'Ad + 'ar + 'Ar + 'bhuannachd + 'Bhuannachd + 'd + 'D + 'eil + 'Eil + 'eug + 'Eug + 'g + 'G + 'ga + 'Ga + 'gad + 'Gad + 'gam + 'Gam + 'gan + 'Gan + 'gar + 'Gar + 'gur + 'Gur + 'ic + 'Ic + 'il + 'Il + 'ill' + 'Ill' + 'ille + 'Ille + 'illean + 'Illean + 'iodh + 'Iodh + 'l + 'L + 'm + 'M + 'n + 'N + 'na + 'Na + 'nad + 'Nad + 'nam + 'Nam + 'nan + 'Nan + 'nar + 'Nar + 'neil + 'Neil + 'nise + 'Nise + 'nuair + 'Nuair + 'nur + 'Nur + 's + 'S + 'sa + 'Sa + 'sa' + 'Sa' + 'san + 'San + 'sann + 'Sann + 'se + 'Se + 'sna + 'Sna + 'son + 'Son + 'urchaidh + 'Urchaidh + """.split(): + _exc[orth] = [{ORTH: orth}] + +# Words with a trailing or middling apostrophe +for orth in """ + a' + A' + a'd + A'd + a'm + A'm + a's + A's + ac' + Ac' + agads' + Agads' + agams' + Agams' + aig' + Aig' + annams' + Annams' + ars' + Ars' + b' + B' + ball' + Ball' + bioraicht' + Bioraicht' + bh' + Bh' + bhail' + Bhail' + bhall' + Bhall' + bheath' + Bheath' + bhliadhn' + Bhliadhn' + bliadhn' + Bliadhn' + bonnant' + Bonnant' + brist' + Brist' + bàt' + Bàt' + c'à + C'à + camp' + Camp' + chalp' + Chalp' + champ' + Champ' + chomhairl' + Chomhairl' + chual' + Chual' + chuimhn' + Chuimhn' + colaisd' + Colaisd' + comhl' + Comhl' + comhairl' + Comhairl' + creids' + Creids' + cual' + Cual' + cuimhn' + Cuimhn' + cuin' + Cuin' + cà' + Cà' + càit' + Càit' + d' + D' +
d'readh + D'readh + d'reaghadh + D'reaghadh + daoin' + Daoin' + deimhinn' + Deimhinn' + de'n + De'n + dh' + Dh' + dhaib' + Dhaib' + dhaoin' + Dhaoin' + dhòmhs' + Dhòmhs' + dhu'sa + Dhu'sa + dhuin' + Dhuin' + do'n + Do'n + duin' + Duin' + dòch' + Dòch' + dùint' + Dùint' + eil' + Eil' + f'a + F'a + fac' + Fac' + fad' + Fad' + fhac' + Fhac' + fhad' + Fhad' + fhaid' + Fhaid' + fhaisg' + Fhaisg' + fhiosd' + Fhiosd' + fàilt' + Fàilt' + g' + G' + gàir' + Gàir' + ghill' + Ghill' + gill' + Gill' + inns' + Inns' + innt' + Innt' + ionnsaicht' + Ionnsaicht' + leams' + Leams' + leoth' + Leoth' + lobht' + Lobht' + m' + M' + m'a + M'a + m's + M's + mhuth' + Mhuth' + mhòr' + Mhòr' + mis' + Mis' + mu'n + Mu'n + mòr' + Mòr' + oirr' + Oirr' + o'n + O'n + phàp' + Phàp' + pàp' + Pàp' + pòs' + Pòs' + prionns' + Prionns' + r' + R' + riums' + Riums' + riuth' + Riuth' + ro'n + Ro'n + sa' + Sa' + sgoil' + Sgoil' + sgìr' + Sgìr' + sheòrs' + Sheòrs' + sin' + Sin' + stall' + Stall' + sìod' + Sìod' + sònraicht' + Sònraicht' + t' + T' + taigh' + Taigh' + tein' + Tein' + teoth' + Teoth' + th' + Th' + thoilicht' + Thoilicht' + thuc' + Thuc' + thuigs' + Thuigs' + thus' + Thus' + thàna' + Thàna' + toilicht' + Toilicht' + tro' + Tro' + uisg' + Uisg' + àit' + Àit' + òg' + Òg' + """.split(): + _exc[orth] = [{ORTH: orth}] + + +# Hyphenations that should remain as single tokens +for orth in """ +'n-dràsda +'N-dràsda +-bhliadhn' +-bhliadhn' +a-bhos +A-bhos +a-bhòn-dè +A-bhòn-dè +a-cheart +A-cheart +a-chèile +A-chèile +a-deas +A-deas +a-mach +A-mach +a-mhàin +A-mhàin +a-muigh +A-muigh +a-màireach +A-màireach +a-nall +A-nall +a-neist +A-neist +a-ni +A-ni +a-nis +A-nis +a-nisd +A-nisd +a-nise +A-nise +a-nist +A-nist +a-niste +A-niste +a-nochd +A-nochd +a-nuas +A-nuas +a-null +A-null +a-raoir +A-raoir +a-riamh +A-riamh +a-rithist +A-rithist +a-rèiste +A-rèiste +a-rìs +A-rìs +a-seo +A-seo +a-sin +A-sin +a-sineach +A-sineach +a-siud +A-siud +a-staigh +A-staigh +a-steach +A-steach +a-tuath +A-tuath +aca-san +Aca-san +agad-sa +Agad-sa +agam-sa +Agam-sa +aghaidh-bhualaich +Aghaidh-bhualaich +aice-se +Aice-se +aige-san +Aige-san +ainmeannan-àite +Ainmeannan-àite +air-san +Air-san +am-bliadhna +Am-bliadhna +am-màireach +Am-màireach +amp-head +Amp-head +an-diugh +An-diugh +an-dràsd +An-dràsd +an-dràsda +An-dràsda +an-dràst +An-dràst +an-dràsta +An-dràsta +an-dè +An-dè +an-dé +An-dé +an-nise +An-nise +an-nochd +An-nochd +an-raoir +An-raoir +an-uiridh +An-uiridh +an-àbhaisteach +An-àbhaisteach +an-àird +An-àird +an-àirde +An-àirde +an-àrda +An-àrda +ana-ceartas +Ana-ceartas +ana-seo +Ana-seo +ana-sin +Ana-sin +ana-siud +Ana-siud +annam-s' +Annam-s' +ao-coltach +Ao-coltach +aobhar-sa +Aobhar-sa +aois-léinn +Aois-léinn +aona-ghnothaich +Aona-ghnothaich +ar-a-mach +Ar-a-mach +ard-easbaig +Ard-easbaig +ard-luchd-poilitics +Ard-luchd-poilitics +ath-bhaile +Ath-bhaile +ath-bheòthachadh +Ath-bheòthachadh +ath-bhliadhna +Ath-bhliadhna +ath-ghiollachd +Ath-ghiollachd +ath-nuadhais +Ath-nuadhais +ath-sgrùdadh +Ath-sgrùdadh +ath-thriop +Ath-thriop +athair-san +Athair-san +baile-ciùird +Baile-ciùird +ball-coise +Ball-coise +ball-pàrlamaid +Ball-pàrlamaid +ball-sampaill +Ball-sampaill +balla-mara +Balla-mara +ban-chompanach +Ban-chompanach +ban-fhuamhaire +Ban-fhuamhaire +ban-ghillìosach +Ban-ghillìosach +ban-righ'nn +Ban-righ'nn +ban-rìgh +Ban-rìgh +bana-bhàird +Bana-bhàird +bana-chompanaich +Bana-chompanaich +bana-phòsda +Bana-phòsda +banas-taighe +Banas-taighe +beairt-fhuaigheil +Beairt-fhuaigheil +beairt-fuaigheil +Beairt-fuaigheil +bean-gairm 
+Bean-gairm +bean-phòsta +Bean-phòsta +bean-taighe +Bean-taighe +beul-aithris +Beul-aithris +beò-shlàint +Beò-shlàint +beò-shlàint' +Beò-shlàint' +beò-shlàinte +Beò-shlàinte +bhaga-sgoil +Bhaga-sgoil +bhall-pàrlamaid +Bhall-pàrlamaid +bhan-chompanach +Bhan-chompanach +bhan-dòmhnallach +Bhan-dòmhnallach +bhan-phrionnsa +Bhan-phrionnsa +bhan-righinn +Bhan-righinn +bhan-sheinneadair +Bhan-sheinneadair +bharr-iall +Bharr-iall +bhata-làidir +Bhata-làidir +bhath-room +Bhath-room +bheachd-sa +Bheachd-sa +bheachd-san +Bheachd-san +bheairt-fhighe +Bheairt-fhighe +bheairtean-fuaigheil +Bheairtean-fuaigheil +bheinn-sheilg +Bheinn-sheilg +bheul-aithris +Bheul-aithris +bheò-ghlacadh +Bheò-ghlacadh +bhith-beò +Bhith-beò +bhithinn-sa +Bhithinn-sa +bhogsa-chiùil +Bhogsa-chiùil +bhonn-stéidh +Bhonn-stéidh +bhràithrean-sa +Bhràithrean-sa +bhuain-mhòine +Bhuain-mhòine +bhun-sheòrsa +Bhun-sheòrsa +bhàn-righinn +Bhàn-righinn +bhàn-rinn +Bhàn-rinn +bhàn-rìgh +Bhàn-rìgh +bhàta-aiseig +Bhàta-aiseig +bhàta-sa +Bhàta-sa +bird-watcher +Bird-watcher +bith-beò +Bith-beò +bithinn-sa +Bithinn-sa +bliadhna-sa +Bliadhna-sa +bogha-saighead +Bogha-saighead +boma-peatroil +Boma-peatroil +bristeadh-a-mach +Bristeadh-a-mach +buidhean-cathrannais +Buidhean-cathrannais +buille-a-mach +Buille-a-mach +buille-shaor +Buille-shaor +bun-adhbharan +Bun-adhbharan +bun-chomharraidhean +Bun-chomharraidhean +bun-fhiosrachadh +Bun-fhiosrachadh +bun-sgoil +Bun-sgoil +bun-stèidh +Bun-stèidh +bàt-aiseig +Bàt-aiseig +bàta-aiseig +Bàta-aiseig +bàta-bathair +Bàta-bathair +cainnt-san +Cainnt-san +cal-mac +Cal-mac +carraighean-cuimhne +Carraighean-cuimhne +cead-telebhisean +Cead-telebhisean +ceann-cinnidh +Ceann-cinnidh +ceann-suidhe +Ceann-suidhe +chanain-sa +Chanain-sa +chaolas-arcach +Chaolas-arcach +charge-adh +Charge-adh +cheala-deug +Cheala-deug +chealla-deug +Chealla-deug +cheann-cinnidh +Cheann-cinnidh +cheann-feadhna +Cheann-feadhna +cheann-suidhe +Cheann-suidhe +chearc-fhraoich +Chearc-fhraoich +chearcall-meadhain +Chearcall-meadhain +chearcall-mheadhain +Chearcall-mheadhain +chlann-nighean +Chlann-nighean +chlàr-ama +Chlàr-ama +chlò-bhuaileadh +Chlò-bhuaileadh +chlò-bhualadh +Chlò-bhualadh +cho-chreutairean +Cho-chreutairean +cho-dhùin +Cho-dhùin +cho-dhùnadh +Cho-dhùnadh +cho-dhùnaidhean +Cho-dhùnaidhean +cho-fhaireachdainn +Cho-fhaireachdainn +cho-labhairt +Cho-labhairt +cho-obraiche +Cho-obraiche +cho-roinn +Cho-roinn +chom-pàirt +Chom-pàirt +chorra-ghritheach +Chorra-ghritheach +chrann-snàth +Chrann-snàth +chreach-s' +Chreach-s' +chrith-thalmhainn +Chrith-thalmhainn +chàch-a-chéile +Chàch-a-chéile +cinn-chuspair +Cinn-chuspair +cinn-iùil +Cinn-iùil +cion-doighe +Cion-doighe +clachan-meallain +Clachan-meallain +clann-sgoile +Clann-sgoile +claon-fhaireachdainn +Claon-fhaireachdainn +claon-shamhail +Claon-shamhail +cluicheadairean-meadhain +Cluicheadairean-meadhain +clàran-ama +Clàran-ama +cléir-seanchain +Cléir-seanchain +clò-bhualadair +Clò-bhualadair +clò-bhualadh +Clò-bhualadh +co-aimsireach +Co-aimsireach +co-bhanntachd +Co-bhanntachd +co-bhuannachd +Co-bhuannachd +co-buannachd +Co-buannachd +co-cheangail +Co-cheangail +co-cheangailte +Co-cheangailte +co-cheangal +Co-cheangal +co-chreutairean +Co-chreutairean +co-chruinneachadh +Co-chruinneachadh +co-dhiu +Co-dhiu +co-dhiubh +Co-dhiubh +co-dhiù +Co-dhiù +co-dhiùbh +Co-dhiùbh +co-dhùnadh +Co-dhùnadh +co-dhùnaidhean +Co-dhùnaidhean +co-fhaireachadh +Co-fhaireachadh +co-fhaireachdainn +Co-fhaireachdainn +co-impirean +Co-impirean +co-ionad +Co-ionad +co-ionann +Co-ionann +co-labhairt 
+Co-labhairt +co-labhairtean +Co-labhairtean +co-obrachadh +Co-obrachadh +co-sheirm +Co-sheirm +co-theacs +Co-theacs +coimeas-meudachd +Coimeas-meudachd +cola-deug +Cola-deug +com-pàirt +Com-pàirt +cope-adh +Cope-adh +crann-aodaich +Crann-aodaich +crann-snàth +Crann-snàth +crann-tarsainn +Crann-tarsainn +craobh-sgaoileadh +Craobh-sgaoileadh +crith-thalmhainn +Crith-thalmhainn +cruth-rannsachadh +Cruth-rannsachadh +cuid-eigin +Cuid-eigin +cumail-san +Cumail-san +cur-gu-buil +Cur-gu-buil +cur-seachad +Cur-seachad +cur-seachadan +Cur-seachadan +cìs-comhairle +Cìs-comhairle +cò-dhunadh +Cò-dhunadh +còmhlan-ciùil +Còmhlan-ciùil +cùis-lagh +Cùis-lagh +cùl-chàineadh +Cùl-chàineadh +cùl-shleamhnach +Cùl-shleamhnach +cùl-taic +Cùl-taic +da-rìreabh +Da-rìreabh +da-rìreadh +Da-rìreadh +da-rìribh +Da-rìribh +deagh-ghean +Deagh-ghean +dearg-fhuileach +Dearg-fhuileach +deireadh-sheachdain +Deireadh-sheachdain +deoch-làidir +Deoch-làidir +dha-rìreabh +Dha-rìreabh +dha-rìribh +Dha-rìribh +dhaibh-san +Dhaibh-san +dhe-salin-adh +Dhe-salin-adh +dhe-salt-adh +Dhe-salt-adh +dheidhinn-sa +Dheidhinn-sa +dhol-sìos +Dhol-sìos +dhomh-s' +Dhomh-s' +dhuine-dubh +Dhuine-dubh +dhà-san +Dhà-san +dhòigh-beatha +Dhòigh-beatha +di-sathairne +Di-sathairne +dian-amharc +Dian-amharc +dlùth-cheangal +Dlùth-cheangal +do-chreidsinneach +Do-chreidsinneach +do-labhairt +Do-labhairt +do-sheachant' +Do-sheachant' +dol-a-mach +Dol-a-mach +dol-air-adhart +Dol-air-adhart +dubh-chàineadh +Dubh-chàineadh +dubh-ghorm +Dubh-ghorm +dà-chultarach +Dà-chultarach +dà-reug +Dà-reug +dàn-mòr +Dàn-mòr +dì-moladh +Dì-moladh +dòigh-beatha +Dòigh-beatha +dòighean-beatha +Dòighean-beatha +e-mail +E-mail +eadar-dhealachadh +Eadar-dhealachadh +eadar-dhealachaidhean +Eadar-dhealachaidhean +eadar-dhealaichte +Eadar-dhealaichte +eadar-nàiseanta +Eadar-nàiseanta +earbainn-s +Earbainn-s +eàrr-ràdh +Eàrr-ràdh +eòrp-innseanach +Eòrp-innseanach +fa-leth +Fa-leth +fa-near +Fa-near +fad-as +Fad-as +fad-thréimhseach +Fad-thréimhseach +feadaig-mhonaidh +Feadaig-mhonaidh +fealla-dhà +Fealla-dhà +fear-a-ropa +Fear-a-ropa +fear-ceasnachaidh +Fear-ceasnachaidh +fear-faire +Fear-faire +fear-gairm +Fear-gairm +fear-glèidhidh +Fear-glèidhidh +fear-labhairt +Fear-labhairt +fear-naidheachd +Fear-naidheachd +fear-pòsta +Fear-pòsta +fear-sgrùdaidh +Fear-sgrùdaidh +fear-teagaisg +Fear-teagaisg +fear-trèinidh +Fear-trèinidh +fear-éisteachd +Fear-éisteachd +feed-adh +Feed-adh +fhear-ghlèidhidh +Fhear-ghlèidhidh +fhear-gleidhidh +Fhear-gleidhidh +fhear-glèidhidh +Fhear-glèidhidh +fhear-labhairt +Fhear-labhairt +fhear-leughaidh +Fhear-leughaidh +fhear-sa +Fhear-sa +fhear-sgrùdaidh +Fhear-sgrùdaidh +fhir-cinnidh +Fhir-cinnidh +fhéin-ìomhaigh +Fhéin-ìomhaigh +fhìor-luachmhor +Fhìor-luachmhor +fois-fhòirneirt +Fois-fhòirneirt +fàs-bheairtean +Fàs-bheairtean +féin-mhisneachd +Féin-mhisneachd +féin-mholadh +Féin-mholadh +fìor-thàbhachdach +Fìor-thàbhachdach +ge-ta +Ge-ta +ge-tà +Ge-tà +ged-tà +Ged-tà +geàrr-chunntais +Geàrr-chunntais +geàrr-chunntas +Geàrr-chunntas +geàrr-thréimhseach +Geàrr-thréimhseach +ghuth-thàmh +Ghuth-thàmh +glain'-amhairc +Glain'-amhairc +glas-ghuib +Glas-ghuib +gnàth-bhriathrachas +Gnàth-bhriathrachas +gàrradh-crìche +Gàrradh-crìche +h- +H- +h-ana-miannaibh +H-ana-miannaibh +h-uile +H-uile +hó-ró +Hó-ró +iar-mhinistear +Iar-mhinistear +inneal-spreadhaidh +Inneal-spreadhaidh +ionad-còmhnaidh +Ionad-còmhnaidh +join-adh +Join-adh +latha-an-diugh +Latha-an-diugh +leam-sa +Leam-sa +leas-adh +Leas-adh +lease-adh +Lease-adh +leat-sa +Leat-sa +leotha-san +Leotha-san 
+leth-char +Leth-char +leth-cheud +Leth-cheud +leth-ghàidhealtachd +Leth-ghàidhealtachd +leth-pocannan +Leth-pocannan +leth-sgeulan +Leth-sgeulan +leth-uair +Leth-uair +leughadh-ne +Leughadh-ne +lighiche-sprèidh +Lighiche-sprèidh +linn-an-òir +Linn-an-òir +litir-aonta +Litir-aonta +loma-làn +Loma-làn +lost-s' +Lost-s' +luchd-altram +Luchd-altram +luchd-altruim +Luchd-altruim +luchd-amhairc +Luchd-amhairc +luchd-ciùil +Luchd-ciùil +luchd-cruinneachaidh +Luchd-cruinneachaidh +luchd-dìon +Luchd-dìon +luchd-ealain +Luchd-ealain +luchd-einnseanaraidh +Luchd-einnseanaraidh +luchd-glèidhteachais +Luchd-glèidhteachais +luchd-gnìomhachais +Luchd-gnìomhachais +luchd-iomairt +Luchd-iomairt +luchd-lagh +Luchd-lagh +luchd-lagha +Luchd-lagha +luchd-leanmhainn +Luchd-leanmhainn +luchd-litreachais +Luchd-litreachais +luchd-obrach +Luchd-obrach +luchd-reic +Luchd-reic +luchd-sgrùdaidh +Luchd-sgrùdaidh +luchd-teagaisg +Luchd-teagaisg +luchd-turais +Luchd-turais +luchd-éisdeachd +Luchd-éisdeachd +luchd-éisteachd +Luchd-éisteachd +là-an-diugh +Là-an-diugh +làmh-chuideachaidh +Làmh-chuideachaidh +làmh-sgrìobhainn +Làmh-sgrìobhainn +làmh-sgrìobhainnean +Làmh-sgrìobhainnean +làmh-sgrìobhta +Làmh-sgrìobhta +làn-bheachd +Làn-bheachd +làn-ghàidhealtachd +Làn-ghàidhealtachd +làn-thuigse +Làn-thuigse +làn-ùine +Làn-ùine +làrna-mhàireach +Làrna-mhàireach +lìn-bheaga +Lìn-bheaga +lùth-chleasan +Lùth-chleasan +ma-ta +Ma-ta +ma-tha +Ma-tha +ma-thà +Ma-thà +ma-tà +Ma-tà +mac-an-duine +Mac-an-duine +mac-léinn +Mac-léinn +mac-meanmna +Mac-meanmna +maighstir-sgoile +Maighstir-sgoile +maor-chladaich +Maor-chladaich +maor-fearainn +Maor-fearainn +mar-thà +Mar-thà +marbh-riaghailt +Marbh-riaghailt +meadhan-aoiseil +Meadhan-aoiseil +meadhan-latha +Meadhan-latha +meadhan-oidhche +Meadhan-oidhche +meal-an-naidheachd +Meal-an-naidheachd +mean-fhàs +Mean-fhàs +mhac-meanmna +Mhac-meanmna +mheadhain-latha +Mheadhain-latha +mheadhain-oidhche +Mheadhain-oidhche +mheadhan-oidhche +Mheadhan-oidhche +mheantraiginn-sa +Mheantraiginn-sa +mhi-rùn +Mhi-rùn +mhic-an-duine +Mhic-an-duine +mhoraltachd-sa +Mhoraltachd-sa +mhuir-làn +Mhuir-làn +mhuir-sgèin +Mhuir-sgèin +mhàthair-san +Mhàthair-san +mhì-chinnt +Mhì-chinnt +mhì-chneasda +Mhì-chneasda +mhì-chòrdadh +Mhì-chòrdadh +mhì-riaraichte +Mhì-riaraichte +mhì-shocair +Mhì-shocair +mhòr-chuid +Mhòr-chuid +mhòr-shluagh +Mhòr-shluagh +mhòr-shluaigh +Mhòr-shluaigh +mhór-amharas +Mhór-amharas +mhór-chuid +Mhór-chuid +mhór-shluaigh +Mhór-shluaigh +mi-chneasda +Mi-chneasda +mi-rùn +Mi-rùn +mic-léinn +Mic-léinn +mion-chànain +Mion-chànain +mion-fhios +Mion-fhios +mion-fhiosrach +Mion-fhiosrach +mion-sgrùdadh +Mion-sgrùdadh +muir-meadhon-thireach +Muir-meadhon-thireach +mèinnean-talmhainn +Mèinnean-talmhainn +mì-chinnt +Mì-chinnt +mì-choltach +Mì-choltach +mì-dhòigh +Mì-dhòigh +mì-fhair +Mì-fhair +mì-fhortanach +Mì-fhortanach +mì-laghail +Mì-laghail +mì-nàdarra +Mì-nàdarra +mì-nàdarrach +Mì-nàdarrach +mì-rùin +Mì-rùin +mì-shealbhach +Mì-shealbhach +mì-thlachd +Mì-thlachd +mòr-shluagh +Mòr-shluagh +mór-bhuannachd +Mór-bhuannachd +mór-chuid +Mór-chuid +mór-roinn +Mór-roinn +n- +N- +neach-casaid +Neach-casaid +neach-cathrach +Neach-cathrach +neach-gairm +Neach-gairm +neo-chiontach +Neo-chiontach +neo-eisimeileach +Neo-eisimeileach +neo-iomlan +Neo-iomlan +neo-àbhaisteach +Neo-àbhaisteach +nua-bhàrdachd +Nua-bhàrdachd +nì-eigin +Nì-eigin +obair-sa +Obair-sa +oifigear-stiùiridh +Oifigear-stiùiridh +oirbh-se +Oirbh-se +ola-thruis +Ola-thruis +orm-sa +Orm-sa +orra-san +Orra-san +phiuthar-chéile +Phiuthar-chéile 
+phort-adhair +Phort-adhair +phump-adh +Phump-adh +phàipeir-naidheachd +Phàipeir-naidheachd +phòcaid-thòine +Phòcaid-thòine +pole-aichean +Pole-aichean +port-adhair +Port-adhair +proove-adh +Proove-adh +pàipear-naidheachd +Pàipear-naidheachd +pàipearan-naidheachd +Pàipearan-naidheachd +radio-beò +Radio-beò +rithe-se +Rithe-se +rium-sa +Rium-sa +ro-chumhang +Ro-chumhang +ro-eòlach +Ro-eòlach +ro-innleachd +Ro-innleachd +ro-làimh +Ro-làimh +ro-shealladh +Ro-shealladh +roth-thoisich +Roth-thoisich +rèidio-beò +Rèidio-beò +rùm-cùil +Rùm-cùil +sadadh-a-steach +Sadadh-a-steach +samhradh-a-chaidh +Samhradh-a-chaidh +saor-làithean +Saor-làithean +sead-fhighe +Sead-fhighe +sean-ghnàthas +Sean-ghnàthas +seana-bhliadhn' +Seana-bhliadhn' +seirbhis-aisig +Seirbhis-aisig +seòl-mara +Seòl-mara +seòmar-cadail +Seòmar-cadail +sgeulachdan-gaisge +Sgeulachdan-gaisge +sgoil-marcaidheachd +Sgoil-marcaidheachd +sgìr-easbaig +Sgìr-easbaig +sgìre-easbaig +Sgìre-easbaig +sheann-fhasanta +Sheann-fhasanta +shlatan-connaidh +Shlatan-connaidh +shon-sa +Shon-sa +shàr-sgoilear +Shàr-sgoilear +sibh-se +Sibh-se +snodha-gàire +Snodha-gàire +so-labhairt +So-labhairt +soch-mhalairteach +Soch-mhalairteach +spor-gunna +Spor-gunna +sàr-bheachdan +Sàr-bheachdan +sìor-dhol +Sìor-dhol +sùil-air-ais +Sùil-air-ais +sùil-mhara +Sùil-mhara +t- +T- +taigh-cuibhle +Taigh-cuibhle +taigh-céilidh +Taigh-céilidh +taigh-sa +Taigh-sa +taigh-sheinnse +Taigh-sheinnse +taigh-tasgaidh +Taigh-tasgaidh +taigh-tughaidh +Taigh-tughaidh +taigh-òsda +Taigh-òsda +taigh-òsta +Taigh-òsta +taighean-aoigheachd +Taighean-aoigheachd +taobh-sa +Taobh-sa +teachd-an-tìr +Teachd-an-tìr +teaghlach-chànanan +Teaghlach-chànanan +thaicean-airgid +Thaicean-airgid +thaighean-altraim +Thaighean-altraim +thonn-gheal +Thonn-gheal +thuigse-san +Thuigse-san +tigh-croiteir +Tigh-croiteir +tigh-còmhnaidh +Tigh-còmhnaidh +tigh-seinnse +Tigh-seinnse +tigh-sheinnse +Tigh-sheinnse +tighearnan-fearainn +Tighearnan-fearainn +togail-cridhe +Togail-cridhe +travel-adh +Travel-adh +triob-sa +Triob-sa +tro-chèile +Tro-chèile +troimh-a-chéile +Troimh-a-chéile +troimh-chèile +Troimh-chèile +troimhe-chéile +Troimhe-chéile +tuathanas-éisg +Tuathanas-éisg +tè-labhairt +Tè-labhairt +tìr-mhóir +Tìr-mhóir +tìr-mòr +Tìr-mòr +ugam-s' +Ugam-s' +ugam-sa +Ugam-sa +uige-san +Uige-san +uile-gu-lèir +Uile-gu-lèir +uile-tuigseach +Uile-tuigseach +use-agadh +Use-agadh +watch-adh +Watch-adh +weld-adh +Weld-adh +àrd-cheannard +Àrd-cheannard +àrd-chomhairliche +Àrd-chomhairliche +àrd-chonstabal +Àrd-chonstabal +àrd-dhuine +Àrd-dhuine +àrd-ionmhair +Àrd-ionmhair +àrd-oifigear +Àrd-oifigear +àrd-oifigeir +Àrd-oifigeir +àrd-sgoil +Àrd-sgoil +àrd-ìre +Àrd-ìre +àrd-ùrlair +Àrd-ùrlair +àrd-ùrlar +Àrd-ùrlar +às-creideach +Às-creideach +àtha-cheilpe +Àtha-cheilpe +ìre-sa +Ìre-sa +ìre-se +Ìre-se +òg-mhios +Òg-mhios +òige-sa +Òige-sa +òrd-mhòr +Òrd-mhòr""".split(): + _exc[orth] = [{ORTH: orth}] + +# Multiple words that should remain as one token +for orth in """'n diugh +'N diugh +'n dà +'N dà +'n iar +'N iar +'n seo +'N seo +'n uairsin +'N uairsin +a a sineach +A a sineach +a b' +A b' +a bhos +A bhos +a bhàn +A bhàn +a bhòn raoir +A bhòn raoir +a bhòn uiridh +A bhòn uiridh +a bhòn-dè +A bhòn-dè +a bhòn-raoir +A bhòn-raoir +a bhòn-uiridh +A bhòn-uiridh +a bu' +A bu' +a chaoidh +A chaoidh +a cheana +A cheana +a chionn +A chionn +a chionn 's +A chionn 's +a chuile +A chuile +a chèil +A chèil +a chèile +A chèile +a chéile +A chéile +a deas +A deas +a dh' +A dh' +a h-uile +A h-uile +a mach +A mach +a muigh +A muigh +a 
màireach +A màireach +a nall +A nall +a neisd +A neisd +a nis +A nis +a nisd +A nisd +a nise +A nise +a niste +A niste +a nochd +A nochd +a nuas +A nuas +a null +A null +a raoir +A raoir +a riamh +A riamh +a rithist +A rithist +a s +A s +a seo +A seo +a seothach +A seothach +a shineach +A shineach +a sin +A sin +a sineach +A sineach +a staidh +A staidh +a staigh +A staigh +a steach +A steach +a stigh +A stigh +a tuath +A tuath +a uiridh +A uiridh +a' diugh +A' diugh +a' s +A' s +air bith +Air bith +air choireigin +Air choireigin +air choireigin-ach +Air choireigin-ach +air choreigin +Air choreigin +air dheireadh +Air dheireadh +air falbh +Air falbh +air neo +Air neo +air thùs +Air thùs +am a màireach muigh +Am a màireach muigh +am bliadhna +Am bliadhna +am muigh +Am muigh +an am +An am +an aodann bàn +An aodann bàn +an ath bhliadhna +An ath bhliadhna +an ath oidhch' +An ath oidhch' +an ath oidhche +An ath oidhche +an ath sheachdain +An ath sheachdain +an ath sheachdainn +An ath sheachdainn +an ath-bhliadhna +An ath-bhliadhna +an ath-oidhch' +An ath-oidhch' +an ath-oidhche +An ath-oidhche +an ath-sheachdain +An ath-sheachdain +an ath-sheachdainn +An ath-sheachdainn +an ceart-uair +An ceart-uair +an ceartuair +An ceartuair +an còmhnaidh +An còmhnaidh +an de +An de +an deas +An deas +an diugh +An diugh +an dràsda +An dràsda +an dràsta +An dràsta +an dè +An dè +an ear +An ear +an earair +An earair +an earar +An earar +an earras +An earras +an iar +An iar +an iaras +An iaras +an làrna-mhàireach +An làrna-mhàireach +an raoir +An raoir +an sean +An sean +an seo +An seo +an seothach +An seothach +an sin +An sin +an sineach +An sineach +an siod +An siod +an siud +An siud +an siudach +An siudach +an toiseach +An toiseach +an uair +An uair +an uair sin +An uair sin +an uairsin +An uairsin +an uirigh +An uirigh +an àird +An àird +an àirde +An àirde +an ìre mhath +An ìre mhath +ana nàdarra +Ana nàdarra +ann a +Ann a +ann a sheo +Ann a sheo +ann a sheothach +Ann a sheothach +ann a shin +Ann a shin +ann a shineach +Ann a shineach +ann a shiodach +Ann a shiodach +ann a shiud +Ann a shiud +ann a shiudach +Ann a shiudach +ann a' +Ann a' +ann a' shiudach +Ann a' shiudach +ann a-seo +Ann a-seo +ann a-seothach +Ann a-seothach +ann a-sin +Ann a-sin +ann a-sineach +Ann a-sineach +ann a-siud +Ann a-siud +ann am +Ann am +ann an +Ann an +ann an seo +Ann an seo +ann an shin +Ann an shin +ann an shiud +Ann an shiud +ann an sin +Ann an sin +ann an siud +Ann an siud +ann seo +Ann seo +anns a' bhad +Anns a' bhad +anns an +Anns an +ath-oidhch' +Ath-oidhch' +ban-righ 'nn +Ban-righ 'nn +bho thoiseach +Bho thoiseach +bhon 'n +Bhon 'n +bhon a' +Bhon a' +bhon an +Bhon an +bhrist ' +Bhrist ' +buille a-mach +Buille a-mach +bun os cionn +Bun os cionn +car son +Car son +ceann a tuath +Ceann a tuath +cia mheud +Cia mheud +coille chaoil +Coille chaoil +cò mheud +Cò mheud +có dhiubh +Có dhiubh +d' rachadh +D' rachadh +dhen an +Dhen an +do n +Do n +dè mar +Dè mar +dé mar +Dé mar +eilean tiridhe +Eilean tiridhe +fa leth +Fa leth +fad as +Fad as +fo dheireadh +Fo dheireadh +fon a' +Fon a' +fon an +Fon an +gar bith +Gar bith +gar bith có +Gar bith có +ge 's bith +Ge 's bith +ge b' e air bith +Ge b' e air bith +ge be +Ge be +ge brith +Ge brith +ge brì +Ge brì +gleann dail +Gleann dail +gleann ois +Gleann ois +gu bè gu dè +Gu bè gu dè +gu dè +Gu dè +gu dé +Gu dé +gu ruige +Gu ruige +ho ro gheallaidh +Ho ro gheallaidh +ma dheireadh +Ma dheireadh +ma dheireadh thall +Ma dheireadh thall +ma sgaoil +Ma sgaoil +ma tha +Ma tha +mar an ceudna 
+Mar an ceudna +mar bu trice +Mar bu trice +mar tha +Mar tha +meadhan aoiseil +Meadhan aoiseil +mu 'n +Mu 'n +mu chuairt +Mu chuairt +mu dheas +Mu dheas +mu dheireadh +Mu dheireadh +mu dheireadh thall +Mu dheireadh thall +mu n +Mu n +mu thràth +Mu thràth +mun a' +Mun a' +mun an +Mun an +na b' +Na b' +na bu +Na bu +na iad +Na iad +nach maireann +Nach maireann +o'n uairsin +O'n uairsin +oidhch ' +Oidhch ' +on a' +On a' +on an +On an +pholl a' ghrùthain +Pholl a' ghrùthain +roinn eorpa +Roinn eorpa +ron a' +Ron a' +ron an +Ron an +ruaidh mhònaidh +Ruaidh mhònaidh +ruith thairis +Ruith thairis +sa bhad +Sa bhad +sadadh a-mach +Sadadh a-mach +sadadh a-steach +Sadadh a-steach +sam bidh +Sam bidh +sam bith +Sam bith +srath chluaidh +Srath chluaidh +taobh a-muigh +Taobh a-muigh +taobh an ear +Taobh an ear +taobh an iar +Taobh an iar +tria san ngaoidhilcc nalbanaigh +Tria san ngaoidhilcc nalbanaigh +tron a' +Tron a' +tron an +Tron an +tuilleadh 's a chòir +Tuilleadh 's a chòir +tuilleadh sa chòir +Tuilleadh sa chòir""".split( + "\n" +): + _exc[orth] = [{ORTH: orth}] + + +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/kmr/__init__.py b/spacy/lang/kmr/__init__.py new file mode 100644 index 000000000..eee9e69d0 --- /dev/null +++ b/spacy/lang/kmr/__init__.py @@ -0,0 +1,16 @@ +from ...language import BaseDefaults, Language +from .lex_attrs import LEX_ATTRS +from .stop_words import STOP_WORDS + + +class KurmanjiDefaults(BaseDefaults): + stop_words = STOP_WORDS + lex_attr_getters = LEX_ATTRS + + +class Kurmanji(Language): + lang = "kmr" + Defaults = KurmanjiDefaults + + +__all__ = ["Kurmanji"] diff --git a/spacy/lang/kmr/examples.py b/spacy/lang/kmr/examples.py new file mode 100644 index 000000000..5eb362001 --- /dev/null +++ b/spacy/lang/kmr/examples.py @@ -0,0 +1,17 @@ +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.kmr.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + +sentences = [ + "Berê mirovan her tim li geşedana pêşerojê ye", # People's gaze is always on the development of the future + "Kawa Nemir di 14 salan de Ulysses wergerand Kurmancî.", # Kawa Nemir translated Ulysses into Kurmanji in 14 years. 
+ "Mem Ararat hunermendekî Kurd yê bi nav û deng e.", # Mem Ararat is a famous Kurdish artist + "Firat Cewerî 40 sal e pirtûkên Kurdî dinivîsîne.", # Firat Ceweri has been writing Kurdish books for 40 years + "Rojnamegerê ciwan nûçeyeke balkêş li ser rewşa aborî nivîsand", # The young journalist wrote an interesting news article about the economic situation + "Sektora çandiniyê beşeke giring a belavkirina gaza serayê li seranserê cîhanê pêk tîne", # The agricultural sector constitutes an important part of greenhouse gas emissions worldwide + "Xwendekarên jêhatî di pêşbaziya matematîkê de serkeftî bûn", # Talented students succeeded in the mathematics competition + "Ji ber ji tunebûnê bavê min xwişkeke min nedan xwendin ew ji min re bû derd û kulek.", # Because of poverty, my father didn't send my sister to school, which became a pain and sorrow for me +] diff --git a/spacy/lang/kmr/lex_attrs.py b/spacy/lang/kmr/lex_attrs.py new file mode 100644 index 000000000..6b8020410 --- /dev/null +++ b/spacy/lang/kmr/lex_attrs.py @@ -0,0 +1,138 @@ +from ...attrs import LIKE_NUM + +_num_words = [ + "sifir", + "yek", + "du", + "sê", + "çar", + "pênc", + "şeş", + "heft", + "heşt", + "neh", + "deh", + "yazde", + "dazde", + "sêzde", + "çarde", + "pazde", + "şazde", + "hevde", + "hejde", + "nozde", + "bîst", + "sî", + "çil", + "pêncî", + "şêst", + "heftê", + "heştê", + "nod", + "sed", + "hezar", + "milyon", + "milyar", +] + +_ordinal_words = [ + "yekem", + "yekemîn", + "duyem", + "duyemîn", + "sêyem", + "sêyemîn", + "çarem", + "çaremîn", + "pêncem", + "pêncemîn", + "şeşem", + "şeşemîn", + "heftem", + "heftemîn", + "heştem", + "heştemîn", + "nehem", + "nehemîn", + "dehem", + "dehemîn", + "yazdehem", + "yazdehemîn", + "dazdehem", + "dazdehemîn", + "sêzdehem", + "sêzdehemîn", + "çardehem", + "çardehemîn", + "pazdehem", + "pazdehemîn", + "şanzdehem", + "şanzdehemîn", + "hevdehem", + "hevdehemîn", + "hejdehem", + "hejdehemîn", + "nozdehem", + "nozdehemîn", + "bîstem", + "bîstemîn", + "sîyem", + "sîyemîn", + "çilem", + "çilemîn", + "pêncîyem", + "pênciyemîn", + "şêstem", + "şêstemîn", + "heftêyem", + "heftêyemîn", + "heştêyem", + "heştêyemîn", + "notem", + "notemîn", + "sedem", + "sedemîn", + "hezarem", + "hezaremîn", + "milyonem", + "milyonemîn", + "milyarem", + "milyaremîn", +] + + +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + text_lower = text.lower() + if text_lower in _num_words: + return True + + # Check ordinal number + if text_lower in _ordinal_words: + return True + + if is_digit(text_lower): + return True + + return False + + +def is_digit(text): + endings = ("em", "yem", "emîn", "yemîn") + for ending in endings: + to = len(ending) + if text.endswith(ending) and text[:-to].isdigit(): + return True + + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/kmr/stop_words.py b/spacy/lang/kmr/stop_words.py new file mode 100644 index 000000000..aee33c2b7 --- /dev/null +++ b/spacy/lang/kmr/stop_words.py @@ -0,0 +1,44 @@ +STOP_WORDS = set( + """ +û +li +bi +di +da +de +ji +ku +ew +ez +tu +em +hûn +ew +ev +min +te +wî +wê +me +we +wan +vê +vî +va +çi +kî +kê +çawa +çima +kengî +li ku +çend +çiqas +her +hin +gelek +hemû +kes +tişt +""".split() +) diff --git a/spacy/lang/mk/__init__.py b/spacy/lang/mk/__init__.py index 413f0038d..9470088a1 100644 --- 
a/spacy/lang/mk/__init__.py +++ b/spacy/lang/mk/__init__.py @@ -24,13 +24,6 @@ class MacedonianDefaults(BaseDefaults): tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = STOP_WORDS - @classmethod - def create_lemmatizer(cls, nlp=None, lookups=None): - if lookups is None: - lookups = Lookups() - return MacedonianLemmatizer(lookups) - - class Macedonian(Language): lang = "mk" Defaults = MacedonianDefaults diff --git a/spacy/language.py b/spacy/language.py index 18d20c939..0d9aab9e3 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -5,7 +5,7 @@ import multiprocessing as mp import random import traceback import warnings -from contextlib import contextmanager +from contextlib import ExitStack, contextmanager from copy import deepcopy from dataclasses import dataclass from itertools import chain, cycle @@ -31,6 +31,7 @@ from typing import ( ) import srsly +from cymem.cymem import Pool from thinc.api import Config, CupyOps, Optimizer, get_current_ops from . import about, ty, util @@ -2091,6 +2092,38 @@ class Language: util.replace_model_node(pipe.model, listener, new_model) # type: ignore[attr-defined] tok2vec.remove_listener(listener, pipe_name) + @contextmanager + def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]: + """Begin a block where all resources allocated during the block will + be freed at the end of it. If a resource was created within the + memory zone block, accessing it outside the block is invalid. + Behaviour of this invalid access is undefined. Memory zones should + not be nested. + + The memory zone is helpful for services that need to process large + volumes of text with a defined memory budget. + + Example + ------- + >>> with nlp.memory_zone(): + ... for doc in nlp.pipe(texts): + ... process_my_doc(doc) + >>> # use_doc(doc) <-- Invalid: doc was allocated in the memory zone + """ + if mem is None: + mem = Pool() + # The ExitStack allows programmatic nested context managers. + # We don't know how many we need, so it would be awkward to have + # them as nested blocks.
+ with ExitStack() as stack: + contexts = [stack.enter_context(self.vocab.memory_zone(mem))] + if hasattr(self.tokenizer, "memory_zone"): + contexts.append(stack.enter_context(self.tokenizer.memory_zone(mem))) + for _, pipe in self.pipeline: + if hasattr(pipe, "memory_zone"): + contexts.append(stack.enter_context(pipe.memory_zone(mem))) + yield mem + def to_disk( self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() ) -> None: diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx index e13754944..bedaaf9fe 100644 --- a/spacy/pipeline/_parser_internals/arc_eager.pyx +++ b/spacy/pipeline/_parser_internals/arc_eager.pyx @@ -203,7 +203,7 @@ cdef class ArcEagerGold: def __init__(self, ArcEager moves, StateClass stcls, Example example): self.mem = Pool() heads, labels = example.get_aligned_parse(projectivize=True) - labels = [example.x.vocab.strings.add(label) if label is not None else MISSING_DEP for label in labels] + labels = [example.x.vocab.strings.add(label, allow_transient=False) if label is not None else MISSING_DEP for label in labels] sent_starts = _get_aligned_sent_starts(example) assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts)) self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts) diff --git a/spacy/pipeline/_parser_internals/nonproj.pyx b/spacy/pipeline/_parser_internals/nonproj.pyx index 7de19851e..9e3a21b81 100644 --- a/spacy/pipeline/_parser_internals/nonproj.pyx +++ b/spacy/pipeline/_parser_internals/nonproj.pyx @@ -183,7 +183,7 @@ cpdef deprojectivize(Doc doc): new_label, head_label = label.split(DELIMITER) new_head = _find_new_head(doc[i], head_label) doc.c[i].head = new_head.i - i - doc.c[i].dep = doc.vocab.strings.add(new_label) + doc.c[i].dep = doc.vocab.strings.add(new_label, allow_transient=False) set_children_from_heads(doc.c, 0, doc.length) return doc diff --git a/spacy/strings.pxd b/spacy/strings.pxd index d22f48ba1..b01585858 100644 --- a/spacy/strings.pxd +++ b/spacy/strings.pxd @@ -25,5 +25,7 @@ cdef class StringStore: cdef vector[hash_t] keys cdef public PreshMap _map - cdef const Utf8Str* intern_unicode(self, str py_string) - cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash) + cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient) + cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient) + cdef vector[hash_t] _transient_keys + cdef Pool _non_temp_mem diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 376a13175..b0f6cf5aa 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -1,9 +1,14 @@ # cython: infer_types=True # cython: profile=False cimport cython + +from contextlib import contextmanager +from typing import Iterator, List, Optional + from libc.stdint cimport uint32_t from libc.string cimport memcpy from murmurhash.mrmr cimport hash32, hash64 +from preshed.maps cimport map_clear import srsly @@ -31,7 +36,7 @@ def get_string_id(key): This function optimises for convenience over performance, so shouldn't be used in tight loops. """ - cdef hash_t str_hash + cdef hash_t str_hash if isinstance(key, str): if len(key) == 0: return 0 @@ -45,8 +50,8 @@ def get_string_id(key): elif _try_coerce_to_hash(key, &str_hash): # Coerce the integral key to the expected primitive hash type. 
# This ensures that custom/overloaded "primitive" data types - # such as those implemented by numpy are not inadvertently used - # downsteam (as these are internally implemented as custom PyObjects + # such as those implemented by numpy are not inadvertently used + # downsteam (as these are internally implemented as custom PyObjects # whose comparison operators can incur a significant overhead). return str_hash else: @@ -119,10 +124,11 @@ cdef class StringStore: strings (iterable): A sequence of unicode strings to add to the store. """ self.mem = Pool() + self._non_temp_mem = self.mem self._map = PreshMap() if strings is not None: for string in strings: - self.add(string) + self.add(string, allow_transient=False) def __getitem__(self, object string_or_id): """Retrieve a string from a given hash, or vice versa. @@ -152,14 +158,17 @@ cdef class StringStore: return SYMBOLS_BY_INT[str_hash] else: utf8str = self._map.get(str_hash) + if utf8str is NULL: + raise KeyError(Errors.E018.format(hash_value=string_or_id)) + else: + return decode_Utf8Str(utf8str) else: # TODO: Raise an error instead utf8str = self._map.get(string_or_id) - - if utf8str is NULL: - raise KeyError(Errors.E018.format(hash_value=string_or_id)) - else: - return decode_Utf8Str(utf8str) + if utf8str is NULL: + raise KeyError(Errors.E018.format(hash_value=string_or_id)) + else: + return decode_Utf8Str(utf8str) def as_int(self, key): """If key is an int, return it; otherwise, get the int value.""" @@ -175,12 +184,46 @@ cdef class StringStore: else: return self[key] - def add(self, string): + def __len__(self) -> int: + """The number of strings in the store. + + RETURNS (int): The number of strings in the store. + """ + return self.keys.size() + self._transient_keys.size() + + @contextmanager + def memory_zone(self, mem: Optional[Pool] = None) -> Pool: + """Begin a block where all resources allocated during the block will + be freed at the end of it. If a resources was created within the + memory zone block, accessing it outside the block is invalid. + Behaviour of this invalid access is undefined. Memory zones should + not be nested. + + The memory zone is helpful for services that need to process large + volumes of text with a defined memory budget. + """ + if mem is None: + mem = Pool() + self.mem = mem + yield mem + for key in self._transient_keys: + map_clear(self._map.c_map, key) + self._transient_keys.clear() + self.mem = self._non_temp_mem + + def add(self, string: str, allow_transient: Optional[bool] = None) -> int: """Add a string to the StringStore. string (str): The string to add. + allow_transient (bool): Allow the string to be stored in the 'transient' + map, which will be flushed at the end of the memory zone. Strings + encountered during arbitrary text processing should be added + with allow_transient=True, while labels and other strings used + internally should not. RETURNS (uint64): The string's hash value. 
""" + if allow_transient is None: + allow_transient = self.mem is not self._non_temp_mem cdef hash_t str_hash if isinstance(string, str): if string in SYMBOLS_BY_STR: @@ -188,22 +231,26 @@ cdef class StringStore: string = string.encode("utf8") str_hash = hash_utf8(string, len(string)) - self._intern_utf8(string, len(string), &str_hash) + self._intern_utf8(string, len(string), &str_hash, allow_transient) elif isinstance(string, bytes): if string in SYMBOLS_BY_STR: return SYMBOLS_BY_STR[string] str_hash = hash_utf8(string, len(string)) - self._intern_utf8(string, len(string), &str_hash) + self._intern_utf8(string, len(string), &str_hash, allow_transient) else: raise TypeError(Errors.E017.format(value_type=type(string))) return str_hash def __len__(self): """The number of strings in the store. + if string in SYMBOLS_BY_STR: + return SYMBOLS_BY_STR[string] + else: + return self._intern_str(string, allow_transient) RETURNS (int): The number of strings in the store. """ - return self.keys.size() + return self.keys.size() + self._transient_keys.size() def __contains__(self, string_or_id not None): """Check whether a string or ID is in the store. @@ -222,12 +269,17 @@ cdef class StringStore: pass else: # TODO: Raise an error instead - return self._map.get(string_or_id) is not NULL - + if self._map.get(string_or_id) is not NULL: + return True + else: + return False if str_hash < len(SYMBOLS_BY_INT): return True else: - return self._map.get(str_hash) is not NULL + if self._map.get(str_hash) is not NULL: + return True + else: + return False def __iter__(self): """Iterate over the strings in the store, in order. @@ -240,12 +292,29 @@ cdef class StringStore: key = self.keys[i] utf8str = self._map.get(key) yield decode_Utf8Str(utf8str) - # TODO: Iterate OOV here? + for i in range(self._transient_keys.size()): + key = self._transient_keys[i] + utf8str = self._map.get(key) + yield decode_Utf8Str(utf8str) def __reduce__(self): strings = list(self) return (StringStore, (strings,), None, None, None) + def values(self) -> List[int]: + """Iterate over the stored strings hashes in insertion order. + + RETURNS: A list of string hashs. + """ + cdef int i + hashes = [None] * self._keys.size() + for i in range(self._keys.size()): + hashes[i] = self._keys[i] + transient_hashes = [None] * self._transient_keys.size() + for i in range(self._transient_keys.size()): + transient_hashes[i] = self._transient_keys[i] + return hashes + transient_hashes + def to_disk(self, path): """Save the current state to a directory. @@ -269,7 +338,7 @@ cdef class StringStore: prev = list(self) self._reset_and_load(strings) for word in prev: - self.add(word) + self.add(word, allow_transient=False) return self def to_bytes(self, **kwargs): @@ -289,23 +358,25 @@ cdef class StringStore: prev = list(self) self._reset_and_load(strings) for word in prev: - self.add(word) + self.add(word, allow_transient=False) return self def _reset_and_load(self, strings): self.mem = Pool() + self._non_temp_mem = self.mem self._map = PreshMap() self.keys.clear() + self._transient_keys.clear() for string in strings: - self.add(string) + self.add(string, allow_transient=False) - cdef const Utf8Str* intern_unicode(self, str py_string): + cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient): # 0 means missing, but we don't bother offsetting the index. 
cdef bytes byte_string = py_string.encode("utf8") - return self._intern_utf8(byte_string, len(byte_string), NULL) + return self._intern_utf8(byte_string, len(byte_string), NULL, allow_transient) @cython.final - cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash): + cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient): # TODO: This function's API/behaviour is an unholy mess... # 0 means missing, but we don't bother offsetting the index. cdef hash_t key = precalculated_hash[0] if precalculated_hash is not NULL else hash_utf8(utf8_string, length) @@ -314,5 +385,8 @@ cdef class StringStore: return value value = _allocate(self.mem, utf8_string, length) self._map.set(key, value) - self.keys.push_back(key) + if allow_transient and self.mem is not self._non_temp_mem: + self._transient_keys.push_back(key) + else: + self.keys.push_back(key) return value diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 7db986ab9..e30300a33 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -81,6 +81,11 @@ def bn_tokenizer(): return get_lang_class("bn")().tokenizer +@pytest.fixture(scope="session") +def bo_tokenizer(): + return get_lang_class("bo")().tokenizer + + @pytest.fixture(scope="session") def ca_tokenizer(): return get_lang_class("ca")().tokenizer diff --git a/spacy/tests/lang/bo/__init__.py b/spacy/tests/lang/bo/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/bo/test_text.py b/spacy/tests/lang/bo/test_text.py new file mode 100644 index 000000000..fb3900d51 --- /dev/null +++ b/spacy/tests/lang/bo/test_text.py @@ -0,0 +1,21 @@ +import pytest + + +@pytest.mark.parametrize( + "text,match", + [ + ("10", True), + ("1", True), + ("999.0", True), + ("གཅིག་", True), + ("གཉིས་", True), + ("ཀླད་ཀོར་", True), + ("བཅུ་གཅིག་", True), + ("ཁྱི་", False), + (",", False), + ], +) +def test_lex_attrs_like_number(bo_tokenizer, text, match): + tokens = bo_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].like_num == match diff --git a/spacy/tests/lang/kmr/__init__.py b/spacy/tests/lang/kmr/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/kmr/test_text.py b/spacy/tests/lang/kmr/test_text.py new file mode 100644 index 000000000..405dc28f6 --- /dev/null +++ b/spacy/tests/lang/kmr/test_text.py @@ -0,0 +1,27 @@ +import pytest + +from spacy.lang.kmr.lex_attrs import like_num + + +@pytest.mark.parametrize( + "word", + [ + "yekem", + "duyemîn", + "100em", + "dehem", + "sedemîn", + "34em", + "30yem", + "20emîn", + "50yemîn", + ], +) +def test_kmr_lex_attrs_like_number_for_ordinal(word): + assert like_num(word) + + +@pytest.mark.parametrize("word", ["deh"]) +def test_kmr_lex_attrs_capitals(word): + assert like_num(word) + assert like_num(word.upper()) diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py index 8a158647a..9b9ca4834 100644 --- a/spacy/tests/lang/test_initialize.py +++ b/spacy/tests/lang/test_initialize.py @@ -10,7 +10,7 @@ LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el", "hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv", "mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa", "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn", - "tr", "tt", "uk", "ur", "xx", "yo"] + "tr", "tt", "uk", "ur", "xx", "yo", "kmr"] # fmt: on diff --git a/spacy/tests/tokenizer/test_explain.py 
index 78932f653..f4752849f 100644
--- a/spacy/tests/tokenizer/test_explain.py
+++ b/spacy/tests/tokenizer/test_explain.py
@@ -18,6 +18,7 @@ LANGUAGES = [
    pytest.param("ar", marks=pytest.mark.slow()),
    pytest.param("bg", marks=pytest.mark.slow()),
    "bn",
+    pytest.param("bo", marks=pytest.mark.slow()),
    pytest.param("ca", marks=pytest.mark.slow()),
    pytest.param("cs", marks=pytest.mark.slow()),
    pytest.param("da", marks=pytest.mark.slow()),
@@ -57,6 +58,7 @@ LANGUAGES = [
    pytest.param("tr", marks=pytest.mark.slow()),
    pytest.param("tt", marks=pytest.mark.slow()),
    pytest.param("ur", marks=pytest.mark.slow()),
+    pytest.param("kmr", marks=pytest.mark.slow()),
 ]
diff --git a/spacy/tests/vocab_vectors/test_memory_zone.py b/spacy/tests/vocab_vectors/test_memory_zone.py
new file mode 100644
index 000000000..910d2664e
--- /dev/null
+++ b/spacy/tests/vocab_vectors/test_memory_zone.py
@@ -0,0 +1,36 @@
+from spacy.vocab import Vocab
+
+
+def test_memory_zone_no_insertion():
+    vocab = Vocab()
+    with vocab.memory_zone():
+        pass
+    lex = vocab["horse"]
+    assert lex.text == "horse"
+
+
+def test_memory_zone_insertion():
+    vocab = Vocab()
+    _ = vocab["dog"]
+    assert "dog" in vocab
+    assert "horse" not in vocab
+    with vocab.memory_zone():
+        lex = vocab["horse"]
+        assert lex.text == "horse"
+    assert "dog" in vocab
+    assert "horse" not in vocab
+
+
+def test_memory_zone_redundant_insertion():
+    """Test that if we insert an already-existing word while
+    in the memory zone, it stays persistent."""
+    vocab = Vocab()
+    _ = vocab["dog"]
+    assert "dog" in vocab
+    assert "horse" not in vocab
+    with vocab.memory_zone():
+        lex = vocab["horse"]
+        assert lex.text == "horse"
+        _ = vocab["dog"]
+    assert "dog" in vocab
+    assert "horse" not in vocab
diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index a902ebad9..88e4b06b0 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -25,9 +25,7 @@ cdef class Tokenizer:
    cdef PhraseMatcher _special_matcher
    # TODO convert to bool in v4
    cdef int _faster_heuristics
-    # TODO next one is unused and should be removed in v4
-    # https://github.com/explosion/spaCy/pull/9150
-    cdef int _unused_int2
+    cdef public int max_cache_size

    cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
    cdef int _apply_special_cases(self, Doc doc) except -1
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 96545828f..6ca170dd4 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -30,7 +30,7 @@ cdef class Tokenizer:
    """
    def __init__(self, Vocab vocab, rules=None, prefix_search=None,
                 suffix_search=None, infix_finditer=None, token_match=None,
-                 url_match=None, faster_heuristics=True):
+                 url_match=None, faster_heuristics=True, max_cache_size=10000):
        """Create a `Tokenizer`, to create `Doc` objects given unicode text.

        vocab (Vocab): A storage container for lexical types.
@@ -50,6 +50,7 @@ cdef class Tokenizer:
        faster_heuristics (bool): Whether to restrict the final
            Matcher-based pass for rules to those containing affixes or space.
            Defaults to True.
+        max_cache_size (int): Maximum number of tokenization chunks to cache.

        EXAMPLE:
            >>> tokenizer = Tokenizer(nlp.vocab)
@@ -69,6 +70,7 @@ cdef class Tokenizer:
        self._rules = {}
        self._special_matcher = PhraseMatcher(self.vocab)
        self._load_special_cases(rules)
+        self.max_cache_size = max_cache_size

    @property
    def token_match(self):
@@ -397,8 +399,9 @@ cdef class Tokenizer:
                                      has_special, with_special_cases)
            self._attach_tokens(tokens, span, &prefixes, &suffixes,
                                has_special, with_special_cases)
-            self._save_cached(&tokens.c[orig_size], orig_key, has_special,
-                              tokens.length - orig_size)
+            if len(self._cache) < self.max_cache_size:
+                self._save_cached(&tokens.c[orig_size], orig_key, has_special,
+                                  tokens.length - orig_size)

    cdef str _split_affixes(
        self,
@@ -514,9 +517,8 @@ cdef class Tokenizer:
        if n <= 0:
            # avoid mem alloc of zero length
            return 0
-        for i in range(n):
-            if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
-                return 0
+        if self.vocab.in_memory_zone:
+            return 0
        # See #1250
        if has_special[0]:
            return 0
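The tokenizer changes above do two things: _save_cached is skipped entirely inside a memory zone, so cached LexemeC pointers cannot outlive their pool, and the cache stops growing once it holds max_cache_size entries. A sketch of the new knob; the cache sizes here are arbitrary examples, not recommendations:

    from spacy.lang.en import English
    from spacy.tokenizer import Tokenizer

    nlp = English()
    # construct a tokenizer with a smaller chunk cache than the default 10000
    tokenizer = Tokenizer(nlp.vocab, max_cache_size=1000)
    # declared `cdef public`, so it can also be tuned on an existing instance
    nlp.tokenizer.max_cache_size = 5000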
diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd
index 43e47af1d..c2bfe12e3 100644
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -41,7 +41,9 @@ cdef class Vocab:
    cdef const TokenC* make_fused_token(self, substrings) except NULL

    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
-    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
+    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1
    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL

    cdef PreshMap _by_orth
+    cdef Pool _non_temp_mem
+    cdef vector[attr_t] _transient_orths
diff --git a/spacy/vocab.pyi b/spacy/vocab.pyi
index b7ff20348..ee7636f02 100644
--- a/spacy/vocab.pyi
+++ b/spacy/vocab.pyi
@@ -1,6 +1,8 @@
+from contextlib import contextmanager
 from pathlib import Path
 from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Union

+from cymem.cymem import Pool
 from thinc.types import Floats1d, FloatsXd

 from . import Language
@@ -67,6 +69,8 @@ class Vocab:
    def from_bytes(
        self, bytes_data: bytes, *, exclude: Iterable[str] = ...
    ) -> Vocab: ...
+    @contextmanager
+    def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]: ...

def pickle_vocab(vocab: Vocab) -> Any: ...
def unpickle_vocab(
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 19e6eb005..11043c17a 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -1,8 +1,11 @@
 import functools
+from contextlib import ExitStack, contextmanager
+from typing import Iterator, Optional

 import numpy
 import srsly
 from thinc.api import get_array_module, get_current_ops
+from preshed.maps cimport map_clear

 from .attrs cimport LANG, ORTH
 from .lexeme cimport EMPTY_LEXEME, OOV_RANK, Lexeme
@@ -87,6 +90,12 @@ cdef class Vocab:
        self.lookups = lookups
        self.writing_system = writing_system
        self.get_noun_chunks = get_noun_chunks
+        # During a memory_zone we replace our mem object with one
+        # that's passed to us. We keep a reference to our non-temporary
+        # memory here, in case we need to make an allocation we want to
+        # guarantee is not temporary. This is also how we check whether
+        # we're in a memory zone: we check whether self.mem is self._non_temp_mem.
+        self._non_temp_mem = self.mem

    @property
    def vectors(self):
@@ -96,7 +105,7 @@ cdef class Vocab:
    def vectors(self, vectors):
        if hasattr(vectors, "strings"):
            for s in vectors.strings:
-                self.strings.add(s)
+                self.strings.add(s, allow_transient=False)
            self._vectors = vectors
            self._vectors.strings = self.strings

@@ -107,6 +116,10 @@ cdef class Vocab:
        langfunc = self.lex_attr_getters.get(LANG, None)
        return langfunc("_") if langfunc else ""

+    @property
+    def in_memory_zone(self) -> bool:
+        return self.mem is not self._non_temp_mem
+
    def __len__(self):
        """The current number of lexemes stored.

@@ -114,6 +127,33 @@ cdef class Vocab:
        """
        return self.length

+    @contextmanager
+    def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]:
+        """Begin a block where resources allocated during the block will
+        be freed at the end of it. If a resource was created within the
+        memory zone block, accessing it outside the block is invalid.
+        The behaviour of such invalid access is undefined. Memory zones
+        should not be nested.
+
+        The memory zone is helpful for services that need to process large
+        volumes of text with a defined memory budget.
+        """
+        if mem is None:
+            mem = Pool()
+        # The ExitStack allows programmatic nested context managers.
+        # We don't know how many we need, so it would be awkward to have
+        # them as nested blocks.
+        with ExitStack() as stack:
+            contexts = [stack.enter_context(self.strings.memory_zone(mem))]
+            if hasattr(self.morphology, "memory_zone"):
+                contexts.append(stack.enter_context(self.morphology.memory_zone(mem)))
+            if hasattr(self._vectors, "memory_zone"):
+                contexts.append(stack.enter_context(self._vectors.memory_zone(mem)))
+            self.mem = mem
+            yield mem
+            self._clear_transient_orths()
+            self.mem = self._non_temp_mem
+
    def add_flag(self, flag_getter, int flag_id=-1):
        """Set a new boolean flag to words in the vocabulary.

@@ -148,8 +188,7 @@ cdef class Vocab:
    cdef const LexemeC* get(self, Pool mem, str string) except NULL:
        """Get a pointer to a `LexemeC` from the lexicon, creating a new
-        `Lexeme` if necessary using memory acquired from the given pool. If the
-        pool is the lexicon's own memory, the lexeme is saved in the lexicon.
+        `Lexeme` if necessary.
        """
        if string == "":
            return &EMPTY_LEXEME
@@ -180,19 +219,11 @@ cdef class Vocab:
        return self._new_lexeme(mem, self.strings[orth])

    cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL:
-        # I think this heuristic is bad, and the Vocab should always
-        # own the lexemes. It avoids weird bugs this way, as it's how the thing
-        # was originally supposed to work. The best solution to the growing
-        # memory use is to periodically reset the vocab, which is an action
-        # that should be up to the user to do (so we don't need to keep track
-        # of the doc ownership).
-        # TODO: Change the C API so that the mem isn't passed in here.
+        # The mem argument is deprecated, replaced by memory zones. Same with
+        # this size heuristic.
        mem = self.mem
-        # if len(string) < 3 or self.length < 10000:
-        #     mem = self.mem
-        cdef bint is_oov = mem is not self.mem
        lex = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
-        lex.orth = self.strings.add(string)
+        lex.orth = self.strings.add(string, allow_transient=True)
        lex.length = len(string)
        if self.vectors is not None and hasattr(self.vectors, "key2row"):
            lex.id = self.vectors.key2row.get(lex.orth, OOV_RANK)
@@ -202,18 +233,25 @@ cdef class Vocab:
        for attr, func in self.lex_attr_getters.items():
            value = func(string)
            if isinstance(value, str):
-                value = self.strings.add(value)
+                value = self.strings.add(value, allow_transient=True)
            if value is not None:
                Lexeme.set_struct_attr(lex, attr, value)
-        if not is_oov:
-            self._add_lex_to_vocab(lex.orth, lex)
+        self._add_lex_to_vocab(lex.orth, lex, self.mem is not self._non_temp_mem)
        if lex == NULL:
            raise ValueError(Errors.E085.format(string=string))
        return lex

-    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
+    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1:
        self._by_orth.set(lex.orth, lex)
        self.length += 1
+        if is_transient and self.in_memory_zone:
+            self._transient_orths.push_back(lex.orth)
+
+    def _clear_transient_orths(self):
+        """Remove transient lexemes from the index (generally at the end of the memory zone)."""
+        for orth in self._transient_orths:
+            map_clear(self._by_orth.c_map, orth)
+        self._transient_orths.clear()

    def __contains__(self, key):
        """Check whether the string or int key has an entry in the vocabulary.
@@ -265,7 +303,7 @@ cdef class Vocab:
        """
        cdef attr_t orth
        if isinstance(id_or_string, str):
-            orth = self.strings.add(id_or_string)
+            orth = self.strings.add(id_or_string, allow_transient=True)
        else:
            orth = id_or_string
        return Lexeme(self, orth)
@@ -417,7 +455,7 @@ cdef class Vocab:
        DOCS: https://spacy.io/api/vocab#get_vector
        """
        if isinstance(orth, str):
-            orth = self.strings.add(orth)
+            orth = self.strings.add(orth, allow_transient=True)
        cdef Lexeme lex = self[orth]
        key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
        if self.has_vector(key):
@@ -436,7 +474,7 @@ cdef class Vocab:
        DOCS: https://spacy.io/api/vocab#set_vector
        """
        if isinstance(orth, str):
-            orth = self.strings.add(orth)
+            orth = self.strings.add(orth, allow_transient=False)
        cdef Lexeme lex = self[orth]
        key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
        if self.vectors.is_full and key not in self.vectors:
@@ -460,7 +498,7 @@ cdef class Vocab:
        DOCS: https://spacy.io/api/vocab#has_vector
        """
        if isinstance(orth, str):
-            orth = self.strings.add(orth)
+            orth = self.strings.add(orth, allow_transient=True)
        cdef Lexeme lex = self[orth]
        key = Lexeme.get_struct_attr(lex.c, self.vectors.attr)
        return key in self.vectors
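The memory_zone docstring above describes the intended pattern for long-running services; a usage sketch of the public API (texts is a placeholder iterable, not part of the diff):

    import spacy

    nlp = spacy.blank("en")
    texts = ["First text", "Second text"]  # placeholder corpus
    with nlp.vocab.memory_zone():
        for doc in nlp.pipe(texts):
            n_tokens = len(doc)  # consume results inside the zone
    # Transient lexemes and strings have now been freed; per the docstring,
    # using Docs created inside the zone past this point is invalid.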
diff --git a/website/meta/languages.json b/website/meta/languages.json
index d6a078097..a824b7d7c 100644
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@@ -31,6 +31,12 @@
            "name": "Bengali",
            "has_examples": true
        },
+        {
+            "code": "bo",
+            "name": "Tibetan",
+            "example": "འདི་ཚིག་གྲུབ་རེད།",
+            "has_examples": true
+        },
        {
            "code": "ca",
            "name": "Catalan",
@@ -480,6 +486,12 @@
            ],
            "example": "这是一个用于示例的句子。",
            "has_examples": true
+        },
+        {
+            "code": "kmr",
+            "name": "Kurdish Kurmanji",
+            "example": "Ev hevokek e",
+            "has_examples": true
        }
    ],
    "licenses": [
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 8d46fac8e..ec8887276 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -16,6 +16,40 @@
        },
        "category": ["extension"],
        "tags": []
+    },
+    {
+        "id": "constituent_treelib",
+        "title": "Constituent Treelib",
+        "slogan": "Extract constituents with ease!",
+        "description": "Constituent Treelib (CTL) is a lightweight Python library built on top of benepar (Berkeley Neural Parser) as well as the two well-known NLP frameworks spaCy and NLTK. CTL offers you a convenient way to parse sentences into constituent trees, modify them according to their structure, as well as visualize and export them into various file formats. In addition, you can extract phrases according to their phrasal categories (which can be used e.g., as features for various NLP tasks), validate already parsed sentences in bracket notation or convert them back into sentences.",
+        "github": "Halvani/Constituent-Treelib",
+        "pip": "constituent-treelib",
+        "code_example": [
+            "from constituent_treelib import ConstituentTree, Language",
+            "# Define the language for the sentence as well as for the spaCy and benepar models",
+            "language = Language.English",
+            "# Define which specific spaCy model should be used (default is Medium)",
+            "spacy_model_size = ConstituentTree.SpacyModelSize.Medium",
+            "# Create the pipeline (note, the required models will be downloaded and installed automatically)",
+            "nlp = ConstituentTree.create_pipeline(language, spacy_model_size)",
+            "# Your sentence",
+            "sentence = 'We try to explicitly describe the geometry of the edges of the images.'",
+            "# Create the tree from where we are going to extract the desired noun phrases",
+            "tree = ConstituentTree(sentence, nlp)",
+            "all_phrases = tree.extract_all_phrases(min_words_in_phrases=1)",
+            "print(all_phrases)",
+            "# {'PP': ['of the edges of the images', 'of the images'], 'NP': ['We', 'the geometry of the edges of the images', 'the geometry', 'the edges of the images', 'the edges', 'the images'], 'S': ['We try to explicitly describe the geometry of the edges of the images .', 'to explicitly describe the geometry of the edges of the images'], 'VP': ['try to explicitly describe the geometry of the edges of the images', 'to explicitly describe the geometry of the edges of the images', 'describe the geometry of the edges of the images'], 'ADVP': ['explicitly']}"
+        ],
+        "code_language": "python",
+        "url": "https://github.com/Halvani/Constituent-Treelib",
+        "thumb": "https://github.com/Halvani/Constituent-Treelib/blob/main/assets/images/promo_tree.svg",
+        "author": "Oren Halvani",
+        "author_links": {
+            "github": "Halvani",
+            "website": "https://www.linkedin.com/in/orenhalvani"
+        },
+        "category": ["apis", "standalone", "visualizers"],
+        "tags": ["apis", "deployment", "constituency", "parsing"]
    },
    {
        "id": "sayswho",
@@ -4537,6 +4571,33 @@
        },
        "category": ["pipeline"],
        "tags": ["tokenizer", "french"]
+    },
+    {
+        "id": "gliner-spacy",
+        "title": "GLiNER spaCy Wrapper",
+        "slogan": "Integrating GLiNER's Advanced NER with spaCy",
+        "description": "GLiNER spaCy Wrapper is a project that brings together GLiNER, a zero-shot Named Entity Recognition (NER) model, with spaCy's NLP capabilities. It provides an easy way to integrate GLiNER within the spaCy environment, thus enhancing NER tasks with GLiNER's features.",
+        "github": "theirstory/gliner-spacy",
+        "pip": "gliner-spacy",
+        "code_example": [
+            "import spacy",
+            "",
+            "nlp = spacy.blank('en')",
+            "nlp.add_pipe('gliner_spacy')",
+            "text = 'This is a text about Bill Gates and Microsoft.'",
+            "doc = nlp(text)",
+            "",
+            "for ent in doc.ents:",
+            "    print(ent.text, ent.label_)"
+        ],
+        "code_language": "python",
+        "url": "https://github.com/theirstory/gliner-spacy",
+        "author": "TheirStory",
+        "author_links": {
+            "website": "https://theirstory.io"
+        },
+        "category": ["pipeline"],
+        "tags": ["NER"]
        }
    ],
diff --git a/website/src/templates/index.js b/website/src/templates/index.js
index fad12f4c8..754cf47bf 100644
--- a/website/src/templates/index.js
+++ b/website/src/templates/index.js
@@ -58,8 +58,8 @@ const AlertSpace = ({ nightly, legacy }) => {
 }

 const navAlert = (
-    <Link to="…">
-        💥 Interested in Premium spaCy Models?
+    <Link to="…">
+        💥 New: Case study with S&P Global
     </Link>
 )