From dbc71ecd44146c260ce6b0a9047e525e7ed8680a Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Thu, 4 May 2023 17:13:12 +0200
Subject: [PATCH 1/7] Remove #egg from download URLs (#12567)

The current URLs will become invalid in pip 25.0. According to the pip
docs, the egg= URLs are currently only needed for editable VCS installs.
---
 spacy/cli/download.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index 0c9a32b93..df4bca53d 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -81,11 +81,8 @@ def download(
 
 
 def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
     dl_tpl = "{m}-{v}/{m}-{v}{s}"
-    egg_tpl = "#egg={m}=={v}"
     suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
     filename = dl_tpl.format(m=model_name, v=version, s=suffix)
-    if sdist:
-        filename += egg_tpl.format(m=model_name, v=version)
     return filename
 
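For context, the helper now returns a plain path with no URL fragment. A minimal standalone sketch of the patched behavior (the SDIST_SUFFIX/WHEEL_SUFFIX values are inlined here as assumptions so the snippet runs on its own; the model name and version are arbitrary examples):

    SDIST_SUFFIX = ".tar.gz"            # assumed value, inlined for illustration
    WHEEL_SUFFIX = "-py3-none-any.whl"  # assumed value, inlined for illustration

    def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
        # Same template as the patched code: "<name>-<version>/<name>-<version><suffix>"
        dl_tpl = "{m}-{v}/{m}-{v}{s}"
        suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
        return dl_tpl.format(m=model_name, v=version, s=suffix)

    print(get_model_filename("ca_core_news_sm", "3.5.0", sdist=True))
    # ca_core_news_sm-3.5.0/ca_core_news_sm-3.5.0.tar.gz
    # Before this patch, the sdist variant carried a trailing "#egg=ca_core_news_sm==3.5.0".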
From fbd12eb4a4c75f193f2badbeebf8967ac52350ef Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 8 May 2023 09:10:35 +0200
Subject: [PATCH 2/7] Set version to v3.6.0.dev0

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index 640e9e93b..c6b09039e 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.5.0"
+__version__ = "3.6.0.dev0"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

From 46ce66021a1f6c6f18914546051199b478e63040 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 8 May 2023 09:17:33 +0200
Subject: [PATCH 3/7] Temporarily skip download CLI related tests in CI

---
 .github/workflows/tests.yml | 54 ++++++++++++++++++------------------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 21b660989..619570090 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -107,22 +107,22 @@
       - name: Test import
         run: python -W error -c "import spacy"
 
-      - name: "Test download CLI"
-        run: |
-          python -m spacy download ca_core_news_sm
-          python -m spacy download ca_core_news_md
-          python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
-        if: matrix.python_version == '3.9'
-
-      - name: "Test download_url in info CLI"
-        run: |
-          python -W error -m spacy info ca_core_news_sm | grep -q download_url
-        if: matrix.python_version == '3.9'
-
-      - name: "Test no warnings on load (#11713)"
-        run: |
-          python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
-        if: matrix.python_version == '3.9'
+#      - name: "Test download CLI"
+#        run: |
+#          python -m spacy download ca_core_news_sm
+#          python -m spacy download ca_core_news_md
+#          python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+#        if: matrix.python_version == '3.9'
+#
+#      - name: "Test download_url in info CLI"
+#        run: |
+#          python -W error -m spacy info ca_core_news_sm | grep -q download_url
+#        if: matrix.python_version == '3.9'
+#
+#      - name: "Test no warnings on load (#11713)"
+#        run: |
+#          python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+#        if: matrix.python_version == '3.9'
 
       - name: "Test convert CLI"
         run: |

           python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
         if: matrix.python_version == '3.9'
 
-      - name: "Test assemble CLI"
-        run: |
-          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-          PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
-        if: matrix.python_version == '3.9'
-
-      - name: "Test assemble CLI vectors warning"
-        run: |
-          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
-          python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
-        if: matrix.python_version == '3.9'
+#      - name: "Test assemble CLI"
+#        run: |
+#          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+#          PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+#        if: matrix.python_version == '3.9'
+#
+#      - name: "Test assemble CLI vectors warning"
+#        run: |
+#          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+#          python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+#        if: matrix.python_version == '3.9'
 
       - name: "Install test requirements"
         run: |
From 6f314f99c42c3503b89391798a58befbbc23bee4 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 8 May 2023 12:33:56 +0200
Subject: [PATCH 4/7] Use Latin normalization for Serbian attrs (#12608)

* Use Latin normalization for Serbian attrs

Use Latin normalization for Serbian `NORM`, `PREFIX`, and `SUFFIX`.

* Update NORMs in tokenizer exceptions and related tests

* Add tests for all custom lex attrs

* Remove unused imports
---
 spacy/lang/sr/lex_attrs.py             | 55 +++++++++++++++++++++++++-
 spacy/lang/sr/tokenizer_exceptions.py  |  3 ++
 spacy/tests/lang/sr/test_exceptions.py | 12 +++---
 spacy/tests/lang/sr/test_lex_attrs.py  | 17 ++++++++
 4 files changed, 79 insertions(+), 8 deletions(-)
 create mode 100644 spacy/tests/lang/sr/test_lex_attrs.py

diff --git a/spacy/lang/sr/lex_attrs.py b/spacy/lang/sr/lex_attrs.py
index dc48909bc..a356a6a7a 100644
--- a/spacy/lang/sr/lex_attrs.py
+++ b/spacy/lang/sr/lex_attrs.py
@@ -1,4 +1,4 @@
-from ...attrs import LIKE_NUM
+from ...attrs import LIKE_NUM, NORM, PREFIX, SUFFIX
 
 
 _num_words = [
@@ -63,4 +63,55 @@ def like_num(text):
     return False
 
 
-LEX_ATTRS = {LIKE_NUM: like_num}
+def _cyr_to_latin_norm(text):
+    # fmt: off
+    # source: https://github.com/opendatakosovo/cyrillic-transliteration/blob/v1.1.1/cyrtranslit/mapping.py
+    SR_CYR_TO_LAT_DICT = {
+        u'А': u'A', u'а': u'a',
+        u'Б': u'B', u'б': u'b',
+        u'В': u'V', u'в': u'v',
+        u'Г': u'G', u'г': u'g',
+        u'Д': u'D', u'д': u'd',
+        u'Ђ': u'Đ', u'ђ': u'đ',
+        u'Е': u'E', u'е': u'e',
+        u'Ж': u'Ž', u'ж': u'ž',
+        u'З': u'Z', u'з': u'z',
+        u'И': u'I', u'и': u'i',
+        u'Ј': u'J', u'ј': u'j',
+        u'К': u'K', u'к': u'k',
+        u'Л': u'L', u'л': u'l',
+        u'Љ': u'Lj', u'љ': u'lj',
+        u'М': u'M', u'м': u'm',
+        u'Н': u'N', u'н': u'n',
+        u'Њ': u'Nj', u'њ': u'nj',
+        u'О': u'O', u'о': u'o',
+        u'П': u'P', u'п': u'p',
+        u'Р': u'R', u'р': u'r',
+        u'С': u'S', u'с': u's',
+        u'Т': u'T', u'т': u't',
+        u'Ћ': u'Ć', u'ћ': u'ć',
+        u'У': u'U', u'у': u'u',
+        u'Ф': u'F', u'ф': u'f',
+        u'Х': u'H', u'х': u'h',
+        u'Ц': u'C', u'ц': u'c',
+        u'Ч': u'Č', u'ч': u'č',
+        u'Џ': u'Dž', u'џ': u'dž',
+        u'Ш': u'Š', u'ш': u'š',
+    }
+    # fmt: on
+    return "".join(SR_CYR_TO_LAT_DICT.get(c, c) for c in text)
+
+
+def norm(text):
+    return _cyr_to_latin_norm(text).lower()
+
+
+def prefix(text):
+    return _cyr_to_latin_norm(text)[0]
+
+
+def suffix(text):
+    return _cyr_to_latin_norm(text)[-3:]
+
+
+LEX_ATTRS = {LIKE_NUM: like_num, NORM: norm, PREFIX: prefix, SUFFIX: suffix}
diff --git a/spacy/lang/sr/tokenizer_exceptions.py b/spacy/lang/sr/tokenizer_exceptions.py
index dcaa3e239..053306088 100755
--- a/spacy/lang/sr/tokenizer_exceptions.py
+++ b/spacy/lang/sr/tokenizer_exceptions.py
@@ -1,3 +1,4 @@
+from .lex_attrs import _cyr_to_latin_norm
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH, NORM
 from ...util import update_exc
@@ -89,5 +90,7 @@ _slang_exc = [
 for slang_desc in _slang_exc:
     _exc[slang_desc[ORTH]] = [slang_desc]
 
+for _exc_key in _exc:
+    _exc[_exc_key][0][NORM] = _cyr_to_latin_norm(_exc[_exc_key][0][NORM])
 
 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
diff --git a/spacy/tests/lang/sr/test_exceptions.py b/spacy/tests/lang/sr/test_exceptions.py
index fa92e5e2d..e8819e628 100644
--- a/spacy/tests/lang/sr/test_exceptions.py
+++ b/spacy/tests/lang/sr/test_exceptions.py
@@ -2,15 +2,15 @@ import pytest
 
 
 @pytest.mark.parametrize(
-    "text,norms,lemmas",
+    "text,norms",
     [
-        ("о.г.", ["ове године"], ["ова година"]),
-        ("чет.", ["четвртак"], ["четвртак"]),
-        ("гђа", ["госпођа"], ["госпођа"]),
-        ("ил'", ["или"], ["или"]),
+        ("о.г.", ["ove godine"]),
+        ("чет.", ["četvrtak"]),
+        ("гђа", ["gospođa"]),
+        ("ил'", ["ili"]),
     ],
 )
-def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms, lemmas):
+def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms):
     tokens = sr_tokenizer(text)
     assert len(tokens) == 1
     assert [token.norm_ for token in tokens] == norms
diff --git a/spacy/tests/lang/sr/test_lex_attrs.py b/spacy/tests/lang/sr/test_lex_attrs.py
new file mode 100644
index 000000000..4a8039df5
--- /dev/null
+++ b/spacy/tests/lang/sr/test_lex_attrs.py
@@ -0,0 +1,17 @@
+import pytest
+
+
+@pytest.mark.parametrize(
+    "text,like_num,norm,prefix,suffix",
+    [
+        ("нула", True, "nula", "n", "ula"),
+        ("Казна", False, "kazna", "K", "zna"),
+    ],
+)
+def test_lex_attrs(sr_tokenizer, text, like_num, norm, prefix, suffix):
+    tokens = sr_tokenizer(text)
+    assert len(tokens) == 1
+    assert tokens[0].like_num == like_num
+    assert tokens[0].norm_ == norm
+    assert tokens[0].prefix_ == prefix
+    assert tokens[0].suffix_ == suffix
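The new lex attrs can be sanity-checked outside the test suite by calling the transliteration directly. A minimal standalone sketch (only a handful of mappings from the table above are inlined here for illustration; the full module covers the whole Serbian Cyrillic alphabet, including digraph letters such as Љ, which maps to the two-character string "Lj"):

    # Subset of SR_CYR_TO_LAT_DICT, inlined for illustration only.
    SR_CYR_TO_LAT = {
        'К': 'K', 'к': 'k', 'а': 'a', 'з': 'z', 'н': 'n',
        'Љ': 'Lj', 'љ': 'lj', 'Џ': 'Dž', 'џ': 'dž',
    }

    def cyr_to_latin_norm(text):
        # Characters without a mapping (Latin letters, digits, punctuation)
        # pass through unchanged.
        return "".join(SR_CYR_TO_LAT.get(c, c) for c in text)

    word = "Казна"
    print(cyr_to_latin_norm(word).lower())  # NORM:   kazna
    print(cyr_to_latin_norm(word)[0])       # PREFIX: K
    print(cyr_to_latin_norm(word)[-3:])     # SUFFIX: zna

These outputs match the "Казна" row in the new test_lex_attrs.py parametrization.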
From 1279b464bb2868b027b3690a9329091e153e9f23 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 8 May 2023 16:51:58 +0200
Subject: [PATCH 5/7] In initialize only calculate current vectors hash if
 needed (#12607)

---
 spacy/training/initialize.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index e90617852..9cf759c55 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -133,10 +133,11 @@ def init_vocab(
         logger.info("Added vectors: %s", vectors)
     # warn if source model vectors are not identical
     sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {})
-    vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
-    for sourced_component, sourced_vectors_hash in sourced_vectors_hashes.items():
-        if vectors_hash != sourced_vectors_hash:
-            warnings.warn(Warnings.W113.format(name=sourced_component))
+    if len(sourced_vectors_hashes) > 0:
+        vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
+        for sourced_component, sourced_vectors_hash in sourced_vectors_hashes.items():
+            if vectors_hash != sourced_vectors_hash:
+                warnings.warn(Warnings.W113.format(name=sourced_component))
     logger.info("Finished initializing nlp object")
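The point of the reordering: computing `vectors_hash` serializes the whole vectors table via `to_bytes`, which is wasted work when no components were sourced. The change is an instance of a simple guard pattern, deferring an expensive computation until something actually consumes it. A generic sketch of the idea (the function names and the use of sha1 are illustrative, not spaCy's API):

    import hashlib

    def warn_on_vector_mismatch(serialize_vectors, sourced_hashes):
        # Only pay for serialization + hashing when there is at least
        # one sourced hash to compare against.
        if len(sourced_hashes) > 0:
            current = hashlib.sha1(serialize_vectors()).hexdigest()
            for name, expected in sourced_hashes.items():
                if expected != current:
                    print(f"W113: sourced component {name} has different vectors")

    # Usage sketch: pass the serialization lazily, e.g.
    # warn_on_vector_mismatch(lambda: vectors_table_bytes, sourced_hashes)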
From eb3960a15abcf1a77a034bc5f22bea153f428de6 Mon Sep 17 00:00:00 2001
From: "Patrick J. Burns"
Date: Tue, 9 May 2023 06:02:45 -0400
Subject: [PATCH 6/7] Add LatinCy models to universe.json (#12597)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Add LatinCy models to universe.json

* Update website/meta/universe.json

Add install code for LatinCy models to 'code_example'

Co-authored-by: Adriane Boyd

* Update LatinCy ‘code_example’ in website/meta/universe.json

Co-authored-by: Adriane Boyd

---------

Co-authored-by: Adriane Boyd
---
 website/meta/universe.json | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/website/meta/universe.json b/website/meta/universe.json
index 4067c4d1e..05877cfc6 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1,5 +1,32 @@
 {
   "resources": [
+    {
+      "id": "latincy",
+      "title": "LatinCy",
+      "thumb": "https://raw.githubusercontent.com/diyclassics/la_core_web_lg/main/latincy-logo.png",
+      "slogan": "Synthetic trained spaCy pipelines for Latin NLP",
+      "description": "Set of trained general purpose Latin-language 'core' pipelines for use with spaCy. The models are trained on a large amount of available Latin data, including all five of the Latin Universal Dependency treebanks, which have been preprocessed to be compatible with each other.",
+      "url": "https://huggingface.co/latincy",
+      "code_example": [
+        "# pip install https://huggingface.co/latincy/la_core_web_lg/resolve/main/la_core_web_lg-any-py3-none-any.whl",
+        "import spacy",
+        "nlp = spacy.load('la_core_web_lg')",
+        "doc = nlp('Haec narranatur a poetis de Perseo')",
+        "",
+        "print(f'{doc[0].text}, {doc[0].norm_}, {doc[0].lemma_}, {doc[0].pos_}')",
+        "",
+        "# > Haec, haec, hic, DET"
+      ],
+      "code_language": "python",
+      "author": "Patrick J. Burns",
+      "author_links": {
+        "twitter": "@diyclassics",
+        "github": "diyclassics",
+        "website": "https://diyclassics.github.io/"
+      },
+      "category": ["pipeline", "research"],
+      "tags": ["latin"]
+    },
     {
       "id": "spacy-wasm",
       "title": "spacy-wasm",

From 15f16db6ca3fd10a9667358d2f7b7e6eaf967e0a Mon Sep 17 00:00:00 2001
From: "Patrick J. Burns"
Date: Tue, 9 May 2023 09:52:34 -0400
Subject: [PATCH 7/7] Fix typo (#12615)

---
 website/meta/universe.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/meta/universe.json b/website/meta/universe.json
index 05877cfc6..b39ebb528 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -11,7 +11,7 @@
         "# pip install https://huggingface.co/latincy/la_core_web_lg/resolve/main/la_core_web_lg-any-py3-none-any.whl",
         "import spacy",
         "nlp = spacy.load('la_core_web_lg')",
-        "doc = nlp('Haec narranatur a poetis de Perseo')",
+        "doc = nlp('Haec narrantur a poetis de Perseo')",
         "",
         "print(f'{doc[0].text}, {doc[0].norm_}, {doc[0].lemma_}, {doc[0].pos_}')",
         "",