mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-05 04:40:20 +03:00
merge master
This commit is contained in:
commit
b8775ca3cb
54
.github/workflows/tests.yml
vendored
54
.github/workflows/tests.yml
vendored
|
@ -107,22 +107,22 @@ jobs:
|
||||||
- name: Test import
|
- name: Test import
|
||||||
run: python -W error -c "import spacy"
|
run: python -W error -c "import spacy"
|
||||||
|
|
||||||
- name: "Test download CLI"
|
# - name: "Test download CLI"
|
||||||
run: |
|
# run: |
|
||||||
python -m spacy download ca_core_news_sm
|
# python -m spacy download ca_core_news_sm
|
||||||
python -m spacy download ca_core_news_md
|
# python -m spacy download ca_core_news_md
|
||||||
python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
|
# python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
|
||||||
if: matrix.python_version == '3.9'
|
# if: matrix.python_version == '3.9'
|
||||||
|
#
|
||||||
- name: "Test download_url in info CLI"
|
# - name: "Test download_url in info CLI"
|
||||||
run: |
|
# run: |
|
||||||
python -W error -m spacy info ca_core_news_sm | grep -q download_url
|
# python -W error -m spacy info ca_core_news_sm | grep -q download_url
|
||||||
if: matrix.python_version == '3.9'
|
# if: matrix.python_version == '3.9'
|
||||||
|
#
|
||||||
- name: "Test no warnings on load (#11713)"
|
# - name: "Test no warnings on load (#11713)"
|
||||||
run: |
|
# run: |
|
||||||
python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
|
# python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
|
||||||
if: matrix.python_version == '3.9'
|
# if: matrix.python_version == '3.9'
|
||||||
|
|
||||||
- name: "Test convert CLI"
|
- name: "Test convert CLI"
|
||||||
run: |
|
run: |
|
||||||
|
@ -146,17 +146,17 @@ jobs:
|
||||||
python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
|
python -m spacy train ner.cfg --paths.train ner-token-per-line-conll2003.spacy --paths.dev ner-token-per-line-conll2003.spacy --training.max_steps 10 --gpu-id -1
|
||||||
if: matrix.python_version == '3.9'
|
if: matrix.python_version == '3.9'
|
||||||
|
|
||||||
- name: "Test assemble CLI"
|
# - name: "Test assemble CLI"
|
||||||
run: |
|
# run: |
|
||||||
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
|
# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
|
||||||
PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
|
# PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
|
||||||
if: matrix.python_version == '3.9'
|
# if: matrix.python_version == '3.9'
|
||||||
|
#
|
||||||
- name: "Test assemble CLI vectors warning"
|
# - name: "Test assemble CLI vectors warning"
|
||||||
run: |
|
# run: |
|
||||||
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
|
# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
|
||||||
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
|
# python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
|
||||||
if: matrix.python_version == '3.9'
|
# if: matrix.python_version == '3.9'
|
||||||
|
|
||||||
- name: "Install test requirements"
|
- name: "Install test requirements"
|
||||||
run: |
|
run: |
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
# fmt: off
|
# fmt: off
|
||||||
__title__ = "spacy"
|
__title__ = "spacy"
|
||||||
__version__ = "3.5.0"
|
__version__ = "3.6.0.dev0"
|
||||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||||
__projects__ = "https://github.com/explosion/projects"
|
__projects__ = "https://github.com/explosion/projects"
|
||||||
|
|
|
@ -81,11 +81,8 @@ def download(
|
||||||
|
|
||||||
def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
|
def get_model_filename(model_name: str, version: str, sdist: bool = False) -> str:
|
||||||
dl_tpl = "{m}-{v}/{m}-{v}{s}"
|
dl_tpl = "{m}-{v}/{m}-{v}{s}"
|
||||||
egg_tpl = "#egg={m}=={v}"
|
|
||||||
suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
|
suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
|
||||||
filename = dl_tpl.format(m=model_name, v=version, s=suffix)
|
filename = dl_tpl.format(m=model_name, v=version, s=suffix)
|
||||||
if sdist:
|
|
||||||
filename += egg_tpl.format(m=model_name, v=version)
|
|
||||||
return filename
|
return filename
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from ...attrs import LIKE_NUM
|
from ...attrs import LIKE_NUM, NORM, PREFIX, SUFFIX
|
||||||
|
|
||||||
|
|
||||||
_num_words = [
|
_num_words = [
|
||||||
|
@ -63,4 +63,55 @@ def like_num(text):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
LEX_ATTRS = {LIKE_NUM: like_num}
|
def _cyr_to_latin_norm(text):
|
||||||
|
# fmt: off
|
||||||
|
# source: https://github.com/opendatakosovo/cyrillic-transliteration/blob/v1.1.1/cyrtranslit/mapping.py
|
||||||
|
SR_CYR_TO_LAT_DICT = {
|
||||||
|
u'А': u'A', u'а': u'a',
|
||||||
|
u'Б': u'B', u'б': u'b',
|
||||||
|
u'В': u'V', u'в': u'v',
|
||||||
|
u'Г': u'G', u'г': u'g',
|
||||||
|
u'Д': u'D', u'д': u'd',
|
||||||
|
u'Ђ': u'Đ', u'ђ': u'đ',
|
||||||
|
u'Е': u'E', u'е': u'e',
|
||||||
|
u'Ж': u'Ž', u'ж': u'ž',
|
||||||
|
u'З': u'Z', u'з': u'z',
|
||||||
|
u'И': u'I', u'и': u'i',
|
||||||
|
u'Ј': u'J', u'ј': u'j',
|
||||||
|
u'К': u'K', u'к': u'k',
|
||||||
|
u'Л': u'L', u'л': u'l',
|
||||||
|
u'Љ': u'Lj', u'љ': u'lj',
|
||||||
|
u'М': u'M', u'м': u'm',
|
||||||
|
u'Н': u'N', u'н': u'n',
|
||||||
|
u'Њ': u'Nj', u'њ': u'nj',
|
||||||
|
u'О': u'O', u'о': u'o',
|
||||||
|
u'П': u'P', u'п': u'p',
|
||||||
|
u'Р': u'R', u'р': u'r',
|
||||||
|
u'С': u'S', u'с': u's',
|
||||||
|
u'Т': u'T', u'т': u't',
|
||||||
|
u'Ћ': u'Ć', u'ћ': u'ć',
|
||||||
|
u'У': u'U', u'у': u'u',
|
||||||
|
u'Ф': u'F', u'ф': u'f',
|
||||||
|
u'Х': u'H', u'х': u'h',
|
||||||
|
u'Ц': u'C', u'ц': u'c',
|
||||||
|
u'Ч': u'Č', u'ч': u'č',
|
||||||
|
u'Џ': u'Dž', u'џ': u'dž',
|
||||||
|
u'Ш': u'Š', u'ш': u'š',
|
||||||
|
}
|
||||||
|
# fmt: on
|
||||||
|
return "".join(SR_CYR_TO_LAT_DICT.get(c, c) for c in text)
|
||||||
|
|
||||||
|
|
||||||
|
def norm(text):
|
||||||
|
return _cyr_to_latin_norm(text).lower()
|
||||||
|
|
||||||
|
|
||||||
|
def prefix(text):
|
||||||
|
return _cyr_to_latin_norm(text)[0]
|
||||||
|
|
||||||
|
|
||||||
|
def suffix(text):
|
||||||
|
return _cyr_to_latin_norm(text)[-3:]
|
||||||
|
|
||||||
|
|
||||||
|
LEX_ATTRS = {LIKE_NUM: like_num, NORM: norm, PREFIX: prefix, SUFFIX: suffix}
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
from .lex_attrs import _cyr_to_latin_norm
|
||||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||||
from ...symbols import ORTH, NORM
|
from ...symbols import ORTH, NORM
|
||||||
from ...util import update_exc
|
from ...util import update_exc
|
||||||
|
@ -89,5 +90,7 @@ _slang_exc = [
|
||||||
for slang_desc in _slang_exc:
|
for slang_desc in _slang_exc:
|
||||||
_exc[slang_desc[ORTH]] = [slang_desc]
|
_exc[slang_desc[ORTH]] = [slang_desc]
|
||||||
|
|
||||||
|
for _exc_key in _exc:
|
||||||
|
_exc[_exc_key][0][NORM] = _cyr_to_latin_norm(_exc[_exc_key][0][NORM])
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|
||||||
|
|
|
@ -2,15 +2,15 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"text,norms,lemmas",
|
"text,norms",
|
||||||
[
|
[
|
||||||
("о.г.", ["ове године"], ["ова година"]),
|
("о.г.", ["ove godine"]),
|
||||||
("чет.", ["четвртак"], ["четвртак"]),
|
("чет.", ["četvrtak"]),
|
||||||
("гђа", ["госпођа"], ["госпођа"]),
|
("гђа", ["gospođa"]),
|
||||||
("ил'", ["или"], ["или"]),
|
("ил'", ["ili"]),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms, lemmas):
|
def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms):
|
||||||
tokens = sr_tokenizer(text)
|
tokens = sr_tokenizer(text)
|
||||||
assert len(tokens) == 1
|
assert len(tokens) == 1
|
||||||
assert [token.norm_ for token in tokens] == norms
|
assert [token.norm_ for token in tokens] == norms
|
||||||
|
|
17
spacy/tests/lang/sr/test_lex_attrs.py
Normal file
17
spacy/tests/lang/sr/test_lex_attrs.py
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"text,like_num,norm,prefix,suffix",
|
||||||
|
[
|
||||||
|
("нула", True, "nula", "n", "ula"),
|
||||||
|
("Казна", False, "kazna", "K", "zna"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_lex_attrs(sr_tokenizer, text, like_num, norm, prefix, suffix):
|
||||||
|
tokens = sr_tokenizer(text)
|
||||||
|
assert len(tokens) == 1
|
||||||
|
assert tokens[0].like_num == like_num
|
||||||
|
assert tokens[0].norm_ == norm
|
||||||
|
assert tokens[0].prefix_ == prefix
|
||||||
|
assert tokens[0].suffix_ == suffix
|
|
@ -133,6 +133,7 @@ def init_vocab(
|
||||||
logger.info("Added vectors: %s", vectors)
|
logger.info("Added vectors: %s", vectors)
|
||||||
# warn if source model vectors are not identical
|
# warn if source model vectors are not identical
|
||||||
sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {})
|
sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {})
|
||||||
|
if len(sourced_vectors_hashes) > 0:
|
||||||
vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
|
vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
|
||||||
for sourced_component, sourced_vectors_hash in sourced_vectors_hashes.items():
|
for sourced_component, sourced_vectors_hash in sourced_vectors_hashes.items():
|
||||||
if vectors_hash != sourced_vectors_hash:
|
if vectors_hash != sourced_vectors_hash:
|
||||||
|
|
|
@ -23,6 +23,33 @@
|
||||||
"category": ["model", "research"],
|
"category": ["model", "research"],
|
||||||
"tags": ["sigs", "prescription","pharma"]
|
"tags": ["sigs", "prescription","pharma"]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"id": "latincy",
|
||||||
|
"title": "LatinCy",
|
||||||
|
"thumb": "https://raw.githubusercontent.com/diyclassics/la_core_web_lg/main/latincy-logo.png",
|
||||||
|
"slogan": "Synthetic trained spaCy pipelines for Latin NLP",
|
||||||
|
"description": "Set of trained general purpose Latin-language 'core' pipelines for use with spaCy. The models are trained on a large amount of available Latin data, including all five of the Latin Universal Dependency treebanks, which have been preprocessed to be compatible with each other.",
|
||||||
|
"url": "https://huggingface.co/latincy",
|
||||||
|
"code_example": [
|
||||||
|
"# pip install https://huggingface.co/latincy/la_core_web_lg/resolve/main/la_core_web_lg-any-py3-none-any.whl",
|
||||||
|
"import spacy",
|
||||||
|
"nlp = spacy.load('la_core_web_lg')",
|
||||||
|
"doc = nlp('Haec narrantur a poetis de Perseo')",
|
||||||
|
"",
|
||||||
|
"print(f'{doc[0].text}, {doc[0].norm_}, {doc[0].lemma_}, {doc[0].pos_}')",
|
||||||
|
"",
|
||||||
|
"# > Haec, haec, hic, DET"
|
||||||
|
],
|
||||||
|
"code_language": "python",
|
||||||
|
"author": "Patrick J. Burns",
|
||||||
|
"author_links": {
|
||||||
|
"twitter": "@diyclassics",
|
||||||
|
"github": "diyclassics",
|
||||||
|
"website": "https://diyclassics.github.io/"
|
||||||
|
},
|
||||||
|
"category": ["pipeline", "research"],
|
||||||
|
"tags": ["latin"]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"id": "spacy-wasm",
|
"id": "spacy-wasm",
|
||||||
"title": "spacy-wasm",
|
"title": "spacy-wasm",
|
||||||
|
|
Loading…
Reference in New Issue
Block a user