diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 21b660989..619570090 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -107,22 +107,22 @@ jobs:
       - name: Test import
         run: python -W error -c "import spacy"
 
-      - name: "Test download CLI"
-        run: |
-          python -m spacy download ca_core_news_sm
-          python -m spacy download ca_core_news_md
-          python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
-        if: matrix.python_version == '3.9'
-
-      - name: "Test download_url in info CLI"
-        run: |
-          python -W error -m spacy info ca_core_news_sm | grep -q download_url
-        if: matrix.python_version == '3.9'
-
-      - name: "Test no warnings on load (#11713)"
-        run: |
-          python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
-        if: matrix.python_version == '3.9'
+#      - name: "Test download CLI"
+#        run: |
+#          python -m spacy download ca_core_news_sm
+#          python -m spacy download ca_core_news_md
+#          python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+#        if: matrix.python_version == '3.9'
+#
+#      - name: "Test download_url in info CLI"
+#        run: |
+#          python -W error -m spacy info ca_core_news_sm | grep -q download_url
+#        if: matrix.python_version == '3.9'
+#
+#      - name: "Test no warnings on load (#11713)"
+#        run: |
+#          python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+#        if: matrix.python_version == '3.9'
 
       - name: "Test convert CLI"
         run: |
@@ -146,17 +146,17 @@ jobs:
           python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
         if: matrix.python_version == '3.9'
 
-      - name: "Test assemble CLI"
-        run: |
-          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-          PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
-        if: matrix.python_version == '3.9'
-
-      - name: "Test assemble CLI vectors warning"
-        run: |
-          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
-          python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
-        if: matrix.python_version == '3.9'
+#      - name: "Test assemble CLI"
+#        run: |
+#          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+#          PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+#        if: matrix.python_version == '3.9'
+#
+#      - name: "Test assemble CLI vectors warning"
+#        run: |
+#          python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+#          python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+#        if: matrix.python_version == '3.9'
 
       - name: "Install test requirements"
         run: |
diff --git a/spacy/about.py b/spacy/about.py
index 640e9e93b..c6b09039e 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.5.0"
+__version__ = "3.6.0.dev0"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index 0c9a32b93..df4bca53d 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -81,11 +81,8 @@ def download(
 
 def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
     dl_tpl = "{m}-{v}/{m}-{v}{s}"
-    egg_tpl = "#egg={m}=={v}"
     suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
     filename = dl_tpl.format(m=model_name, v=version, s=suffix)
-    if sdist:
-        filename += egg_tpl.format(m=model_name, v=version)
     return filename
 
 
diff --git a/spacy/lang/sr/lex_attrs.py b/spacy/lang/sr/lex_attrs.py
index dc48909bc..a356a6a7a 100644
--- a/spacy/lang/sr/lex_attrs.py
+++ b/spacy/lang/sr/lex_attrs.py
@@ -1,4 +1,4 @@
-from ...attrs import LIKE_NUM
+from ...attrs import LIKE_NUM, NORM, PREFIX, SUFFIX
 
 
 _num_words = [
@@ -63,4 +63,64 @@ def like_num(text):
     return False
 
 
-LEX_ATTRS = {LIKE_NUM: like_num}
+# fmt: off
+# source: https://github.com/opendatakosovo/cyrillic-transliteration/blob/v1.1.1/cyrtranslit/mapping.py
+# Translation table built once at import time: _cyr_to_latin_norm() is used by
+# norm(), prefix() and suffix(), so the mapping must not be rebuilt per call.
+_SR_CYR_TO_LAT_TABLE = str.maketrans({
+    u'А': u'A', u'а': u'a',
+    u'Б': u'B', u'б': u'b',
+    u'В': u'V', u'в': u'v',
+    u'Г': u'G', u'г': u'g',
+    u'Д': u'D', u'д': u'd',
+    u'Ђ': u'Đ', u'ђ': u'đ',
+    u'Е': u'E', u'е': u'e',
+    u'Ж': u'Ž', u'ж': u'ž',
+    u'З': u'Z', u'з': u'z',
+    u'И': u'I', u'и': u'i',
+    u'Ј': u'J', u'ј': u'j',
+    u'К': u'K', u'к': u'k',
+    u'Л': u'L', u'л': u'l',
+    u'Љ': u'Lj', u'љ': u'lj',
+    u'М': u'M', u'м': u'm',
+    u'Н': u'N', u'н': u'n',
+    u'Њ': u'Nj', u'њ': u'nj',
+    u'О': u'O', u'о': u'o',
+    u'П': u'P', u'п': u'p',
+    u'Р': u'R', u'р': u'r',
+    u'С': u'S', u'с': u's',
+    u'Т': u'T', u'т': u't',
+    u'Ћ': u'Ć', u'ћ': u'ć',
+    u'У': u'U', u'у': u'u',
+    u'Ф': u'F', u'ф': u'f',
+    u'Х': u'H', u'х': u'h',
+    u'Ц': u'C', u'ц': u'c',
+    u'Ч': u'Č', u'ч': u'č',
+    u'Џ': u'Dž', u'џ': u'dž',
+    u'Ш': u'Š', u'ш': u'š',
+})
+# fmt: on
+
+
+def _cyr_to_latin_norm(text):
+    """Return text with Serbian Cyrillic letters replaced by their Latin
+    equivalents; characters without a mapping are kept unchanged."""
+    return text.translate(_SR_CYR_TO_LAT_TABLE)
+
+
+def norm(text):
+    """Return the lowercased Latin transliteration of text."""
+    return _cyr_to_latin_norm(text).lower()
+
+
+def prefix(text):
+    """Return the first character of the Latin transliteration of text."""
+    return _cyr_to_latin_norm(text)[0]
+
+
+def suffix(text):
+    """Return the last three characters of the Latin transliteration of text."""
+    return _cyr_to_latin_norm(text)[-3:]
+
+
+LEX_ATTRS = {LIKE_NUM: like_num, NORM: norm, PREFIX: prefix, SUFFIX: suffix}
diff --git a/spacy/lang/sr/tokenizer_exceptions.py b/spacy/lang/sr/tokenizer_exceptions.py
index dcaa3e239..053306088 100755
--- a/spacy/lang/sr/tokenizer_exceptions.py
+++ b/spacy/lang/sr/tokenizer_exceptions.py
@@ -1,3 +1,4 @@
+from .lex_attrs import _cyr_to_latin_norm
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH, NORM
 from ...util import update_exc
@@ -89,5 +90,8 @@ _slang_exc = [
 for slang_desc in _slang_exc:
     _exc[slang_desc[ORTH]] = [slang_desc]
 
+# Transliterate NORM of each exception's first token to match lex_attrs.norm.
+for _exc_tokens in _exc.values():
+    _exc_tokens[0][NORM] = _cyr_to_latin_norm(_exc_tokens[0][NORM])
 
 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
diff --git a/spacy/tests/lang/sr/test_exceptions.py b/spacy/tests/lang/sr/test_exceptions.py
index fa92e5e2d..e8819e628 100644
--- a/spacy/tests/lang/sr/test_exceptions.py
+++ b/spacy/tests/lang/sr/test_exceptions.py
@@ -2,15 +2,15 @@ import pytest
 
 
 @pytest.mark.parametrize(
-    "text,norms,lemmas",
+    "text,norms",
     [
-        ("о.г.", ["ове године"], ["ова година"]),
-        ("чет.", ["четвртак"], ["четвртак"]),
-        ("гђа", ["госпођа"], ["госпођа"]),
-        ("ил'", ["или"], ["или"]),
+        ("о.г.", ["ove godine"]),
+        ("чет.", ["četvrtak"]),
+        ("гђа", ["gospođa"]),
+        ("ил'", ["ili"]),
     ],
 )
-def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms, lemmas):
+def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms):
     tokens = sr_tokenizer(text)
     assert len(tokens) == 1
     assert [token.norm_ for token in tokens] == norms
diff --git a/spacy/tests/lang/sr/test_lex_attrs.py b/spacy/tests/lang/sr/test_lex_attrs.py
new file mode 100644
index 000000000..4a8039df5
--- /dev/null
+++ b/spacy/tests/lang/sr/test_lex_attrs.py
@@ -0,0 +1,17 @@
+import pytest
+
+
+@pytest.mark.parametrize(
+    "text,like_num,norm,prefix,suffix",
+    [
+        ("нула", True, "nula", "n", "ula"),
+        ("Казна", False, "kazna", "K", "zna"),
+    ],
+)
+def test_lex_attrs(sr_tokenizer, text, like_num, norm, prefix, suffix):
+    tokens = sr_tokenizer(text)
+    assert len(tokens) == 1
+    assert tokens[0].like_num == like_num
+    assert tokens[0].norm_ == norm
+    assert tokens[0].prefix_ == prefix
+    assert tokens[0].suffix_ == suffix
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index e90617852..9cf759c55 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -133,10 +133,11 @@ def init_vocab(
         logger.info("Added vectors: %s", vectors)
     # warn if source model vectors are not identical
     sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {})
-    vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
-    for sourced_component, sourced_vectors_hash in sourced_vectors_hashes.items():
-        if vectors_hash != sourced_vectors_hash:
-            warnings.warn(Warnings.W113.format(name=sourced_component))
+    if sourced_vectors_hashes:
+        vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
+        for sourced_component, sourced_vectors_hash in sourced_vectors_hashes.items():
+            if vectors_hash != sourced_vectors_hash:
+                warnings.warn(Warnings.W113.format(name=sourced_component))
     logger.info("Finished initializing nlp object")
 
 
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 0bfcb6166..a5d0f779f 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -23,6 +23,33 @@
             "category": ["model", "research"],
             "tags": ["sigs", "prescription","pharma"]
         },
+        {
+            "id": "latincy",
+            "title": "LatinCy",
+            "thumb": "https://raw.githubusercontent.com/diyclassics/la_core_web_lg/main/latincy-logo.png",
+            "slogan": "Synthetic trained spaCy pipelines for Latin NLP",
+            "description": "Set of trained general purpose Latin-language 'core' pipelines for use with spaCy. The models are trained on a large amount of available Latin data, including all five of the Latin Universal Dependency treebanks, which have been preprocessed to be compatible with each other.",
+            "url": "https://huggingface.co/latincy",
+            "code_example": [
+                "# pip install https://huggingface.co/latincy/la_core_web_lg/resolve/main/la_core_web_lg-any-py3-none-any.whl",
+                "import spacy",
+                "nlp = spacy.load('la_core_web_lg')",
+                "doc = nlp('Haec narrantur a poetis de Perseo')",
+                "",
+                "print(f'{doc[0].text}, {doc[0].norm_}, {doc[0].lemma_}, {doc[0].pos_}')",
+                "",
+                "# > Haec, haec, hic, DET"
+            ],
+            "code_language": "python",
+            "author": "Patrick J. Burns",
+            "author_links": {
+                "twitter": "@diyclassics",
+                "github": "diyclassics",
+                "website": "https://diyclassics.github.io/"
+            },
+            "category": ["pipeline", "research"],
+            "tags": ["latin"]
+        },
         {
             "id": "spacy-wasm",
             "title": "spacy-wasm",