From dbc71ecd44146c260ce6b0a9047e525e7ed8680a Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Thu, 4 May 2023 17:13:12 +0200
Subject: [PATCH 1/7] Remove #egg from download URLs (#12567)

The current URLs will become invalid in pip 25.0. According to the pip
docs, the egg= URLs are currently only needed for editable VCS installs.
---
 spacy/cli/download.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index 0c9a32b93..df4bca53d 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -81,11 +81,8 @@ def download(
 
 
 def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
     dl_tpl = "{m}-{v}/{m}-{v}{s}"
-    egg_tpl = "#egg={m}=={v}"
     suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
     filename = dl_tpl.format(m=model_name, v=version, s=suffix)
-    if sdist:
-        filename += egg_tpl.format(m=model_name, v=version)
     return filename
 
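For context, the helper now returns a plain path with no URL fragment. A minimal standalone sketch of the patched behavior (the SDIST_SUFFIX/WHEEL_SUFFIX values are inlined here as assumptions so the snippet runs on its own; the model name and version are arbitrary examples):

    SDIST_SUFFIX = ".tar.gz"            # assumed value, inlined for illustration
    WHEEL_SUFFIX = "-py3-none-any.whl"  # assumed value, inlined for illustration

    def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
        # Same template as the patched code: "<name>-<version>/<name>-<version><suffix>"
        dl_tpl = "{m}-{v}/{m}-{v}{s}"
        suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
        return dl_tpl.format(m=model_name, v=version, s=suffix)

    print(get_model_filename("ca_core_news_sm", "3.5.0", sdist=True))
    # ca_core_news_sm-3.5.0/ca_core_news_sm-3.5.0.tar.gz
    # Before this patch, the sdist variant carried a trailing "#egg=ca_core_news_sm==3.5.0".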
From fbd12eb4a4c75f193f2badbeebf8967ac52350ef Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 8 May 2023 09:10:35 +0200
Subject: [PATCH 2/7] Set version to v3.6.0.dev0

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index 640e9e93b..c6b09039e 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.5.0"
+__version__ = "3.6.0.dev0"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

From 46ce66021a1f6c6f18914546051199b478e63040 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 8 May 2023 09:17:33 +0200
Subject: [PATCH 3/7] Temporarily skip download CLI related tests in CI

---
 .github/workflows/tests.yml | 54 ++++++++++++++++++------------------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 21b660989..619570090 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -107,22 +107,22 @@
       - name: Test import
         run: python -W error -c "import spacy"
 
-      - name: "Test download CLI"
-        run: |
-          python -m spacy download ca_core_news_sm
-          python -m spacy download ca_core_news_md
-          python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
-        if: matrix.python_version == '3.9'
-
-      - name: "Test download_url in info CLI"
-        run: |
-          python -W error -m spacy info ca_core_news_sm | grep -q download_url
-        if: matrix.python_version == '3.9'
-
-      - name: "Test no warnings on load (#11713)"
-        run: |
-          python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
-        if: matrix.python_version == '3.9'
+#      - name: "Test download CLI"
+#        run: |
+#          python -m spacy download ca_core_news_sm
+#          python -m spacy download ca_core_news_md
+#          python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+#        if: matrix.python_version == '3.9'
+#
+#      - name: "Test download_url in info CLI"
+#        run: |
+#          python -W error -m spacy info ca_core_news_sm | grep -q download_url
+#        if: matrix.python_version == '3.9'
+#
+#      - name: "Test no warnings on load (#11713)"
+#        run: |
+#          python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+#        if: matrix.python_version == '3.9'
 
       - name: "Test convert CLI"
         run: |

           python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
         if: matrix.python_version == '3.9'
 
-      - name: "Test assemble CLI"
-        run: |
-          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-          PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
-        if: matrix.python_version == '3.9'
-
-      - name: "Test assemble CLI vectors warning"
-        run: |
-          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
-          python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
-        if: matrix.python_version == '3.9'
+#      - name: "Test assemble CLI"
+#        run: |
+#          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+#          PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+#        if: matrix.python_version == '3.9'
+#
+#      - name: "Test assemble CLI vectors warning"
+#        run: |
+#          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+#          python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+#        if: matrix.python_version == '3.9'
 
       - name: "Install test requirements"
         run: |
From 6f314f99c42c3503b89391798a58befbbc23bee4 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 8 May 2023 12:33:56 +0200
Subject: [PATCH 4/7] Use Latin normalization for Serbian attrs (#12608)

* Use Latin normalization for Serbian attrs

Use Latin normalization for Serbian `NORM`, `PREFIX`, and `SUFFIX`.

* Update NORMs in tokenizer exceptions and related tests

* Add tests for all custom lex attrs

* Remove unused imports
---
 spacy/lang/sr/lex_attrs.py             | 55 +++++++++++++++++++++++++-
 spacy/lang/sr/tokenizer_exceptions.py  |  3 ++
 spacy/tests/lang/sr/test_exceptions.py | 12 +++---
 spacy/tests/lang/sr/test_lex_attrs.py  | 17 ++++++++
 4 files changed, 79 insertions(+), 8 deletions(-)
 create mode 100644 spacy/tests/lang/sr/test_lex_attrs.py

diff --git a/spacy/lang/sr/lex_attrs.py b/spacy/lang/sr/lex_attrs.py
index dc48909bc..a356a6a7a 100644
--- a/spacy/lang/sr/lex_attrs.py
+++ b/spacy/lang/sr/lex_attrs.py
@@ -1,4 +1,4 @@
-from ...attrs import LIKE_NUM
+from ...attrs import LIKE_NUM, NORM, PREFIX, SUFFIX
 
 
 _num_words = [
@@ -63,4 +63,55 @@ def like_num(text):
     return False
 
 
-LEX_ATTRS = {LIKE_NUM: like_num}
+def _cyr_to_latin_norm(text):
+    # fmt: off
+    # source: https://github.com/opendatakosovo/cyrillic-transliteration/blob/v1.1.1/cyrtranslit/mapping.py
+    SR_CYR_TO_LAT_DICT = {
+        u'А': u'A', u'а': u'a',
+        u'Б': u'B', u'б': u'b',
+        u'В': u'V', u'в': u'v',
+        u'Г': u'G', u'г': u'g',
+        u'Д': u'D', u'д': u'd',
+        u'Ђ': u'Đ', u'ђ': u'đ',
+        u'Е': u'E', u'е': u'e',
+        u'Ж': u'Ž', u'ж': u'ž',
+        u'З': u'Z', u'з': u'z',
+        u'И': u'I', u'и': u'i',
+        u'Ј': u'J', u'ј': u'j',
+        u'К': u'K', u'к': u'k',
+        u'Л': u'L', u'л': u'l',
+        u'Љ': u'Lj', u'љ': u'lj',
+        u'М': u'M', u'м': u'm',
+        u'Н': u'N', u'н': u'n',
+        u'Њ': u'Nj', u'њ': u'nj',
+        u'О': u'O', u'о': u'o',
+        u'П': u'P', u'п': u'p',
+        u'Р': u'R', u'р': u'r',
+        u'С': u'S', u'с': u's',
+        u'Т': u'T', u'т': u't',
+        u'Ћ': u'Ć', u'ћ': u'ć',
+        u'У': u'U', u'у': u'u',
+        u'Ф': u'F', u'ф': u'f',
+        u'Х': u'H', u'х': u'h',
+        u'Ц': u'C', u'ц': u'c',
+        u'Ч': u'Č', u'ч': u'č',
+        u'Џ': u'Dž', u'џ': u'dž',
+        u'Ш': u'Š', u'ш': u'š',
+    }
+    # fmt: on
+    return "".join(SR_CYR_TO_LAT_DICT.get(c, c) for c in text)
+
+
+def norm(text):
+    return _cyr_to_latin_norm(text).lower()
+
+
+def prefix(text):
+    return _cyr_to_latin_norm(text)[0]
+
+
+def suffix(text):
+    return _cyr_to_latin_norm(text)[-3:]
+
+
+LEX_ATTRS = {LIKE_NUM: like_num, NORM: norm, PREFIX: prefix, SUFFIX: suffix}
diff --git a/spacy/lang/sr/tokenizer_exceptions.py b/spacy/lang/sr/tokenizer_exceptions.py
index dcaa3e239..053306088 100755
--- a/spacy/lang/sr/tokenizer_exceptions.py
+++ b/spacy/lang/sr/tokenizer_exceptions.py
@@ -1,3 +1,4 @@
+from .lex_attrs import _cyr_to_latin_norm
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH, NORM
 from ...util import update_exc
@@ -89,5 +90,7 @@ _slang_exc = [
 for slang_desc in _slang_exc:
     _exc[slang_desc[ORTH]] = [slang_desc]
 
+for _exc_key in _exc:
+    _exc[_exc_key][0][NORM] = _cyr_to_latin_norm(_exc[_exc_key][0][NORM])
 
 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
diff --git a/spacy/tests/lang/sr/test_exceptions.py b/spacy/tests/lang/sr/test_exceptions.py
index fa92e5e2d..e8819e628 100644
--- a/spacy/tests/lang/sr/test_exceptions.py
+++ b/spacy/tests/lang/sr/test_exceptions.py
@@ -2,15 +2,15 @@ import pytest
 
 
 @pytest.mark.parametrize(
-    "text,norms,lemmas",
+    "text,norms",
     [
-        ("о.г.", ["ове године"], ["ова година"]),
-        ("чет.", ["четвртак"], ["четвртак"]),
-        ("гђа", ["госпођа"], ["госпођа"]),
-        ("ил'", ["или"], ["или"]),
+        ("о.г.", ["ove godine"]),
+        ("чет.", ["četvrtak"]),
+        ("гђа", ["gospođa"]),
+        ("ил'", ["ili"]),
     ],
 )
-def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms, lemmas):
+def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms):
     tokens = sr_tokenizer(text)
     assert len(tokens) == 1
     assert [token.norm_ for token in tokens] == norms
diff --git a/spacy/tests/lang/sr/test_lex_attrs.py b/spacy/tests/lang/sr/test_lex_attrs.py
new file mode 100644
index 000000000..4a8039df5
--- /dev/null
+++ b/spacy/tests/lang/sr/test_lex_attrs.py
@@ -0,0 +1,17 @@
+import pytest
+
+
+@pytest.mark.parametrize(
+    "text,like_num,norm,prefix,suffix",
+    [
+        ("нула", True, "nula", "n", "ula"),
+        ("Казна", False, "kazna", "K", "zna"),
+    ],
+)
+def test_lex_attrs(sr_tokenizer, text, like_num, norm, prefix, suffix):
+    tokens = sr_tokenizer(text)
+    assert len(tokens) == 1
+    assert tokens[0].like_num == like_num
+    assert tokens[0].norm_ == norm
+    assert tokens[0].prefix_ == prefix
+    assert tokens[0].suffix_ == suffix
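The new lex attrs can be sanity-checked outside the test suite by calling the transliteration directly. A minimal standalone sketch (only a handful of mappings from the table above are inlined here for illustration; the full module covers the whole Serbian Cyrillic alphabet, including digraph letters such as Љ, which maps to the two-character string "Lj"):

    # Subset of SR_CYR_TO_LAT_DICT, inlined for illustration only.
    SR_CYR_TO_LAT = {
        'К': 'K', 'к': 'k', 'а': 'a', 'з': 'z', 'н': 'n',
        'Љ': 'Lj', 'љ': 'lj', 'Џ': 'Dž', 'џ': 'dž',
    }

    def cyr_to_latin_norm(text):
        # Characters without a mapping (Latin letters, digits, punctuation)
        # pass through unchanged.
        return "".join(SR_CYR_TO_LAT.get(c, c) for c in text)

    word = "Казна"
    print(cyr_to_latin_norm(word).lower())  # NORM:   kazna
    print(cyr_to_latin_norm(word)[0])       # PREFIX: K
    print(cyr_to_latin_norm(word)[-3:])     # SUFFIX: zna

These outputs match the "Казна" row in the new test_lex_attrs.py parametrization.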
From 1279b464bb2868b027b3690a9329091e153e9f23 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 8 May 2023 16:51:58 +0200
Subject: [PATCH 5/7] In initialize only calculate current vectors hash if
 needed (#12607)

---
 spacy/training/initialize.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index e90617852..9cf759c55 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -133,10 +133,11 @@ def init_vocab(
         logger.info("Added vectors: %s", vectors)
     # warn if source model vectors are not identical
     sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {})
-    vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
-    for sourced_component, sourced_vectors_hash in sourced_vectors_hashes.items():
-        if vectors_hash != sourced_vectors_hash:
-            warnings.warn(Warnings.W113.format(name=sourced_component))
+    if len(sourced_vectors_hashes) > 0:
+        vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
+        for sourced_component, sourced_vectors_hash in sourced_vectors_hashes.items():
+            if vectors_hash != sourced_vectors_hash:
+                warnings.warn(Warnings.W113.format(name=sourced_component))
     logger.info("Finished initializing nlp object")
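The point of the reordering: computing `vectors_hash` serializes the whole vectors table via `to_bytes`, which is wasted work when no components were sourced. The change is an instance of a simple guard pattern, deferring an expensive computation until something actually consumes it. A generic sketch of the idea (the function names and the use of sha1 are illustrative, not spaCy's API):

    import hashlib

    def warn_on_vector_mismatch(serialize_vectors, sourced_hashes):
        # Only pay for serialization + hashing when there is at least
        # one sourced hash to compare against.
        if len(sourced_hashes) > 0:
            current = hashlib.sha1(serialize_vectors()).hexdigest()
            for name, expected in sourced_hashes.items():
                if expected != current:
                    print(f"W113: sourced component {name} has different vectors")

    # Usage sketch: pass the serialization lazily, e.g.
    # warn_on_vector_mismatch(lambda: vectors_table_bytes, sourced_hashes)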
From eb3960a15abcf1a77a034bc5f22bea153f428de6 Mon Sep 17 00:00:00 2001
From: "Patrick J. Burns"
Date: Tue, 9 May 2023 06:02:45 -0400
Subject: [PATCH 6/7] Add LatinCy models to universe.json (#12597)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Add LatinCy models to universe.json

* Update website/meta/universe.json

Add install code for LatinCy models to 'code_example'

Co-authored-by: Adriane Boyd

* Update LatinCy ‘code_example’ in website/meta/universe.json

Co-authored-by: Adriane Boyd

---------

Co-authored-by: Adriane Boyd
---
 website/meta/universe.json | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/website/meta/universe.json b/website/meta/universe.json
index 4067c4d1e..05877cfc6 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1,5 +1,32 @@
 {
   "resources": [
+    {
+      "id": "latincy",
+      "title": "LatinCy",
+      "thumb": "https://raw.githubusercontent.com/diyclassics/la_core_web_lg/main/latincy-logo.png",
+      "slogan": "Synthetic trained spaCy pipelines for Latin NLP",
+      "description": "Set of trained general purpose Latin-language 'core' pipelines for use with spaCy. The models are trained on a large amount of available Latin data, including all five of the Latin Universal Dependency treebanks, which have been preprocessed to be compatible with each other.",
+      "url": "https://huggingface.co/latincy",
+      "code_example": [
+        "# pip install https://huggingface.co/latincy/la_core_web_lg/resolve/main/la_core_web_lg-any-py3-none-any.whl",
+        "import spacy",
+        "nlp = spacy.load('la_core_web_lg')",
+        "doc = nlp('Haec narranatur a poetis de Perseo')",
+        "",
+        "print(f'{doc[0].text}, {doc[0].norm_}, {doc[0].lemma_}, {doc[0].pos_}')",
+        "",
+        "# > Haec, haec, hic, DET"
+      ],
+      "code_language": "python",
+      "author": "Patrick J. Burns",
+      "author_links": {
+        "twitter": "@diyclassics",
+        "github": "diyclassics",
+        "website": "https://diyclassics.github.io/"
+      },
+      "category": ["pipeline", "research"],
+      "tags": ["latin"]
+    },
     {
       "id": "spacy-wasm",
       "title": "spacy-wasm",

From 15f16db6ca3fd10a9667358d2f7b7e6eaf967e0a Mon Sep 17 00:00:00 2001
From: "Patrick J. Burns"
Date: Tue, 9 May 2023 09:52:34 -0400
Subject: [PATCH 7/7] Fix typo (#12615)

---
 website/meta/universe.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/meta/universe.json b/website/meta/universe.json
index 05877cfc6..b39ebb528 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -11,7 +11,7 @@
         "# pip install https://huggingface.co/latincy/la_core_web_lg/resolve/main/la_core_web_lg-any-py3-none-any.whl",
         "import spacy",
         "nlp = spacy.load('la_core_web_lg')",
-        "doc = nlp('Haec narranatur a poetis de Perseo')",
+        "doc = nlp('Haec narrantur a poetis de Perseo')",
         "",
         "print(f'{doc[0].text}, {doc[0].norm_}, {doc[0].lemma_}, {doc[0].pos_}')",
         "",