Merge branch 'master' into master
Commit 158887a0f6
@@ -1,4 +1,4 @@
-from ...attrs import LIKE_NUM
+from ...attrs import LIKE_NUM, NORM, PREFIX, SUFFIX
 
 
 _num_words = [
@@ -63,4 +63,55 @@ def like_num(text):
     return False
 
 
-LEX_ATTRS = {LIKE_NUM: like_num}
+def _cyr_to_latin_norm(text):
+    # fmt: off
+    # source: https://github.com/opendatakosovo/cyrillic-transliteration/blob/v1.1.1/cyrtranslit/mapping.py
+    SR_CYR_TO_LAT_DICT = {
+        u'А': u'A', u'а': u'a',
+        u'Б': u'B', u'б': u'b',
+        u'В': u'V', u'в': u'v',
+        u'Г': u'G', u'г': u'g',
+        u'Д': u'D', u'д': u'd',
+        u'Ђ': u'Đ', u'ђ': u'đ',
+        u'Е': u'E', u'е': u'e',
+        u'Ж': u'Ž', u'ж': u'ž',
+        u'З': u'Z', u'з': u'z',
+        u'И': u'I', u'и': u'i',
+        u'Ј': u'J', u'ј': u'j',
+        u'К': u'K', u'к': u'k',
+        u'Л': u'L', u'л': u'l',
+        u'Љ': u'Lj', u'љ': u'lj',
+        u'М': u'M', u'м': u'm',
+        u'Н': u'N', u'н': u'n',
+        u'Њ': u'Nj', u'њ': u'nj',
+        u'О': u'O', u'о': u'o',
+        u'П': u'P', u'п': u'p',
+        u'Р': u'R', u'р': u'r',
+        u'С': u'S', u'с': u's',
+        u'Т': u'T', u'т': u't',
+        u'Ћ': u'Ć', u'ћ': u'ć',
+        u'У': u'U', u'у': u'u',
+        u'Ф': u'F', u'ф': u'f',
+        u'Х': u'H', u'х': u'h',
+        u'Ц': u'C', u'ц': u'c',
+        u'Ч': u'Č', u'ч': u'č',
+        u'Џ': u'Dž', u'џ': u'dž',
+        u'Ш': u'Š', u'ш': u'š',
+    }
+    # fmt: on
+    return "".join(SR_CYR_TO_LAT_DICT.get(c, c) for c in text)
+
+
+def norm(text):
+    return _cyr_to_latin_norm(text).lower()
+
+
+def prefix(text):
+    return _cyr_to_latin_norm(text)[0]
+
+
+def suffix(text):
+    return _cyr_to_latin_norm(text)[-3:]
+
+
+LEX_ATTRS = {LIKE_NUM: like_num, NORM: norm, PREFIX: prefix, SUFFIX: suffix}
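For orientation, a minimal sketch of what the new lexical attribute functions return; the import path spacy.lang.sr.lex_attrs is inferred from the relative imports above, and the expected values match the test data added further down:

from spacy.lang.sr.lex_attrs import _cyr_to_latin_norm, norm, prefix, suffix

word = "Казна"                     # Cyrillic spelling of "kazna"
print(_cyr_to_latin_norm(word))    # "Kazna": character-by-character transliteration, case preserved
print(norm(word))                  # "kazna": transliterated and lowercased
print(prefix(word))                # "K": first character of the transliteration
print(suffix(word))                # "zna": last three characters of the transliteration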
@@ -1,3 +1,4 @@
+from .lex_attrs import _cyr_to_latin_norm
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...symbols import ORTH, NORM
 from ...util import update_exc
@@ -89,5 +90,7 @@ _slang_exc = [
 for slang_desc in _slang_exc:
     _exc[slang_desc[ORTH]] = [slang_desc]
 
+for _exc_key in _exc:
+    _exc[_exc_key][0][NORM] = _cyr_to_latin_norm(_exc[_exc_key][0][NORM])
+
 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
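The practical effect is that the NORM values on the Serbian tokenizer exceptions are now stored in Latin script. A minimal sketch, assuming a blank Serbian pipeline; the expected output mirrors the updated test below:

import spacy

nlp = spacy.blank("sr")
doc = nlp("о.г.")                  # abbreviation normalized to "ove godine" (this year)
print([t.norm_ for t in doc])      # ['ove godine']: exception NORM transliterated to Latin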
@@ -2,15 +2,15 @@ import pytest
 
 
 @pytest.mark.parametrize(
-    "text,norms,lemmas",
+    "text,norms",
     [
-        ("о.г.", ["ове године"], ["ова година"]),
-        ("чет.", ["четвртак"], ["четвртак"]),
-        ("гђа", ["госпођа"], ["госпођа"]),
-        ("ил'", ["или"], ["или"]),
+        ("о.г.", ["ove godine"]),
+        ("чет.", ["četvrtak"]),
+        ("гђа", ["gospođa"]),
+        ("ил'", ["ili"]),
     ],
 )
-def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms, lemmas):
+def test_sr_tokenizer_abbrev_exceptions(sr_tokenizer, text, norms):
     tokens = sr_tokenizer(text)
     assert len(tokens) == 1
     assert [token.norm_ for token in tokens] == norms
spacy/tests/lang/sr/test_lex_attrs.py (new file, 17 lines)
@@ -0,0 +1,17 @@
+import pytest
+
+
+@pytest.mark.parametrize(
+    "text,like_num,norm,prefix,suffix",
+    [
+        ("нула", True, "nula", "n", "ula"),
+        ("Казна", False, "kazna", "K", "zna"),
+    ],
+)
+def test_lex_attrs(sr_tokenizer, text, like_num, norm, prefix, suffix):
+    tokens = sr_tokenizer(text)
+    assert len(tokens) == 1
+    assert tokens[0].like_num == like_num
+    assert tokens[0].norm_ == norm
+    assert tokens[0].prefix_ == prefix
+    assert tokens[0].suffix_ == suffix
@@ -133,10 +133,11 @@ def init_vocab(
         logger.info("Added vectors: %s", vectors)
     # warn if source model vectors are not identical
     sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {})
-    vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
-    for sourced_component, sourced_vectors_hash in sourced_vectors_hashes.items():
-        if vectors_hash != sourced_vectors_hash:
-            warnings.warn(Warnings.W113.format(name=sourced_component))
+    if len(sourced_vectors_hashes) > 0:
+        vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
+        for sourced_component, sourced_vectors_hash in sourced_vectors_hashes.items():
+            if vectors_hash != sourced_vectors_hash:
+                warnings.warn(Warnings.W113.format(name=sourced_component))
     logger.info("Finished initializing nlp object")
 
 
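The point of this hunk is a guard: nlp.vocab.vectors.to_bytes(...) serializes the whole vectors table, so the hash is now only computed when there are sourced vector hashes to compare against. A minimal, self-contained sketch of the common case (a pipeline with no sourced components), reusing the variable names from the hunk:

import spacy

nlp = spacy.blank("en")
# Empty unless components were sourced from another pipeline that has vectors.
sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {})
if len(sourced_vectors_hashes) > 0:
    # Only now pay the cost of serializing and hashing the vectors table.
    vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
print(sourced_vectors_hashes)      # {} here, so the hashing step is skipped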
@@ -1,5 +1,55 @@
 {
     "resources": [
+        {
+            "id": "parsigs",
+            "title": "parsigs",
+            "slogan": "Structuring prescriptions text made simple using spaCy",
+            "description": "Parsigs is an open-source project that aims to extract the relevant dosage information from prescriptions text without compromising the patient's privacy.\n\nNotice you also need to install the model in order to use the package: `pip install https://huggingface.co/royashcenazi/en_parsigs/resolve/main/en_parsigs-any-py3-none-any.whl`",
+            "github": "royashcenazi/parsigs",
+            "pip": "parsigs",
+            "code_language": "python",
+            "author": "Roy Ashcenazi",
+            "code_example": [
+                "# You'll need to install the trained model, see instructions in the description section",
+                "from parsigs.parse_sig_api import StructuredSig, SigParser",
+                "sig_parser = SigParser()",
+                "",
+                "sig = 'Take 1 tablet of ibuprofen 200mg 3 times every day for 3 weeks'",
+                "parsed_sig = sig_parser.parse(sig)"
+            ],
+            "author_links": {
+                "github": "royashcenazi"
+            },
+            "category": ["model", "research", "biomedical"],
+            "tags": ["sigs", "prescription","pharma"]
+        },
+        {
+            "id": "latincy",
+            "title": "LatinCy",
+            "thumb": "https://raw.githubusercontent.com/diyclassics/la_core_web_lg/main/latincy-logo.png",
+            "slogan": "Synthetic trained spaCy pipelines for Latin NLP",
+            "description": "Set of trained general purpose Latin-language 'core' pipelines for use with spaCy. The models are trained on a large amount of available Latin data, including all five of the Latin Universal Dependency treebanks, which have been preprocessed to be compatible with each other.",
+            "url": "https://huggingface.co/latincy",
+            "code_example": [
+                "# pip install https://huggingface.co/latincy/la_core_web_lg/resolve/main/la_core_web_lg-any-py3-none-any.whl",
+                "import spacy",
+                "nlp = spacy.load('la_core_web_lg')",
+                "doc = nlp('Haec narrantur a poetis de Perseo')",
+                "",
+                "print(f'{doc[0].text}, {doc[0].norm_}, {doc[0].lemma_}, {doc[0].pos_}')",
+                "",
+                "# > Haec, haec, hic, DET"
+            ],
+            "code_language": "python",
+            "author": "Patrick J. Burns",
+            "author_links": {
+                "twitter": "@diyclassics",
+                "github": "diyclassics",
+                "website": "https://diyclassics.github.io/"
+            },
+            "category": ["pipeline", "research"],
+            "tags": ["latin"]
+        },
         {
             "id": "spacy-wasm",
             "title": "spacy-wasm",