mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Switch ru and uk lemmatizers to pymorphy3 (#11345)
* Switch ru and uk lemmatizers to pymorphy3
* Switch to pymorphy3 in tests
This commit is contained in:
parent
b64243ed55
commit
5fa8f4faca
|
@ -28,7 +28,7 @@ class Russian(Language):
|
|||
assigns=["token.lemma"],
|
||||
default_config={
|
||||
"model": None,
|
||||
"mode": "pymorphy2",
|
||||
"mode": "pymorphy3",
|
||||
"overwrite": False,
|
||||
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||
},
|
||||
|
|
|
@ -19,7 +19,7 @@ class RussianLemmatizer(Lemmatizer):
|
|||
model: Optional[Model],
|
||||
name: str = "lemmatizer",
|
||||
*,
|
||||
mode: str = "pymorphy2",
|
||||
mode: str = "pymorphy3",
|
||||
overwrite: bool = False,
|
||||
scorer: Optional[Callable] = lemmatizer_score,
|
||||
) -> None:
|
||||
|
@ -33,6 +33,16 @@ class RussianLemmatizer(Lemmatizer):
|
|||
) from None
|
||||
if getattr(self, "_morph", None) is None:
|
||||
self._morph = MorphAnalyzer()
|
||||
elif mode == "pymorphy3":
|
||||
try:
|
||||
from pymorphy3 import MorphAnalyzer
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"The Russian lemmatizer mode 'pymorphy3' requires the "
|
||||
"pymorphy3 library. Install it with: pip install pymorphy3"
|
||||
) from None
|
||||
if getattr(self, "_morph", None) is None:
|
||||
self._morph = MorphAnalyzer()
|
||||
super().__init__(
|
||||
vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||
)
|
||||
|
@ -104,6 +114,9 @@ class RussianLemmatizer(Lemmatizer):
|
|||
return [analyses[0].normal_form]
|
||||
return [string]
|
||||
|
||||
def pymorphy3_lemmatize(self, token: Token) -> List[str]:
|
||||
return self.pymorphy2_lemmatize(token)
|
||||
|
||||
|
||||
def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:
|
||||
gram_map = {
|
||||
|
|
|
@ -29,7 +29,7 @@ class Ukrainian(Language):
|
|||
assigns=["token.lemma"],
|
||||
default_config={
|
||||
"model": None,
|
||||
"mode": "pymorphy2",
|
||||
"mode": "pymorphy3",
|
||||
"overwrite": False,
|
||||
"scorer": {"@scorers": "spacy.lemmatizer_scorer.v1"},
|
||||
},
|
||||
|
|
|
@ -14,7 +14,7 @@ class UkrainianLemmatizer(RussianLemmatizer):
|
|||
model: Optional[Model],
|
||||
name: str = "lemmatizer",
|
||||
*,
|
||||
mode: str = "pymorphy2",
|
||||
mode: str = "pymorphy3",
|
||||
overwrite: bool = False,
|
||||
scorer: Optional[Callable] = lemmatizer_score,
|
||||
) -> None:
|
||||
|
@ -29,6 +29,17 @@ class UkrainianLemmatizer(RussianLemmatizer):
|
|||
) from None
|
||||
if getattr(self, "_morph", None) is None:
|
||||
self._morph = MorphAnalyzer(lang="uk")
|
||||
elif mode == "pymorphy3":
|
||||
try:
|
||||
from pymorphy3 import MorphAnalyzer
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"The Ukrainian lemmatizer mode 'pymorphy3' requires the "
|
||||
"pymorphy3 library and dictionaries. Install them with: "
|
||||
"pip install pymorphy3 pymorphy3-dicts-uk"
|
||||
) from None
|
||||
if getattr(self, "_morph", None) is None:
|
||||
self._morph = MorphAnalyzer(lang="uk")
|
||||
super().__init__(
|
||||
vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer
|
||||
)
|
||||
|
|
|
@ -323,13 +323,13 @@ def ro_tokenizer():
|
|||
|
||||
@pytest.fixture(scope="session")
|
||||
def ru_tokenizer():
|
||||
pytest.importorskip("pymorphy2")
|
||||
pytest.importorskip("pymorphy3")
|
||||
return get_lang_class("ru")().tokenizer
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def ru_lemmatizer():
|
||||
pytest.importorskip("pymorphy2")
|
||||
pytest.importorskip("pymorphy3")
|
||||
return get_lang_class("ru")().add_pipe("lemmatizer")
|
||||
|
||||
|
||||
|
@ -401,14 +401,14 @@ def ky_tokenizer():
|
|||
|
||||
@pytest.fixture(scope="session")
|
||||
def uk_tokenizer():
|
||||
pytest.importorskip("pymorphy2")
|
||||
pytest.importorskip("pymorphy3")
|
||||
return get_lang_class("uk")().tokenizer
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def uk_lemmatizer():
|
||||
pytest.importorskip("pymorphy2")
|
||||
pytest.importorskip("pymorphy2_dicts_uk")
|
||||
pytest.importorskip("pymorphy3")
|
||||
pytest.importorskip("pymorphy3_dicts_uk")
|
||||
return get_lang_class("uk")().add_pipe("lemmatizer")
|
||||
|
||||
|
||||
|
|
|
@ -70,7 +70,7 @@ lemmatizer is available. The lemmatizer modes `rule` and `pos_lookup` require
|
|||
[`token.pos`](/api/token) from a previous pipeline component (see example
|
||||
pipeline configurations in the
|
||||
[pretrained pipeline design details](/models#design-cnn)) or rely on third-party
|
||||
libraries (`pymorphy2`).
|
||||
libraries (`pymorphy3`).
|
||||
|
||||
| Language | Default Mode |
|
||||
| -------- | ------------ |
|
||||
|
@ -86,9 +86,9 @@ libraries (`pymorphy2`).
|
|||
| `nb` | `rule` |
|
||||
| `nl` | `rule` |
|
||||
| `pl` | `pos_lookup` |
|
||||
| `ru` | `pymorphy2` |
|
||||
| `ru` | `pymorphy3` |
|
||||
| `sv` | `rule` |
|
||||
| `uk` | `pymorphy2` |
|
||||
| `uk` | `pymorphy3` |
|
||||
|
||||
```python
|
||||
%%GITHUB_SPACY/spacy/pipeline/lemmatizer.py
|
||||
|
|
|
@ -369,8 +369,8 @@
|
|||
"has_examples": true,
|
||||
"dependencies": [
|
||||
{
|
||||
"name": "pymorphy2",
|
||||
"url": "https://github.com/kmike/pymorphy2"
|
||||
"name": "pymorphy3",
|
||||
"url": "https://github.com/no-plagiarism/pymorphy3"
|
||||
}
|
||||
],
|
||||
"models": [
|
||||
|
@ -469,8 +469,8 @@
|
|||
"has_examples": true,
|
||||
"dependencies": [
|
||||
{
|
||||
"name": "pymorphy2",
|
||||
"url": "https://github.com/kmike/pymorphy2"
|
||||
"name": "pymorphy3",
|
||||
"url": "https://github.com/no-plagiarism/pymorphy3"
|
||||
}
|
||||
]
|
||||
},
|
||||
|
|
Loading…
Reference in New Issue
Block a user