Fix Lemmatizer.get_lookups_config

This commit is contained in:
Ines Montani 2020-10-03 17:16:10 +02:00
parent dd542ec6a4
commit 7c4ab7e82c
5 changed files with 31 additions and 55 deletions

View File

@ -1,4 +1,4 @@
from typing import List, Dict from typing import List, Tuple
from ...pipeline import Lemmatizer from ...pipeline import Lemmatizer
from ...tokens import Token from ...tokens import Token
@ -15,17 +15,10 @@ class FrenchLemmatizer(Lemmatizer):
""" """
@classmethod @classmethod
def get_lookups_config(cls, mode: str) -> Dict: def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
if mode == "rule": if mode == "rule":
return { required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
"required_tables": [ return (required, [])
"lemma_lookup",
"lemma_rules",
"lemma_exc",
"lemma_index",
],
"optional_tables": [],
}
else: else:
return super().get_lookups_config(mode) return super().get_lookups_config(mode)

View File

@ -1,4 +1,4 @@
from typing import List, Dict from typing import List, Tuple
from ...pipeline import Lemmatizer from ...pipeline import Lemmatizer
from ...tokens import Token from ...tokens import Token
@ -6,16 +6,10 @@ from ...tokens import Token
class DutchLemmatizer(Lemmatizer): class DutchLemmatizer(Lemmatizer):
@classmethod @classmethod
def get_lookups_config(cls, mode: str) -> Dict: def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
if mode == "rule": if mode == "rule":
return { required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
"required_tables": [ return (required, [])
"lemma_lookup",
"lemma_rules",
"lemma_exc",
"lemma_index",
],
}
else: else:
return super().get_lookups_config(mode) return super().get_lookups_config(mode)

View File

@ -1,4 +1,4 @@
from typing import List, Dict from typing import List, Dict, Tuple
from ...pipeline import Lemmatizer from ...pipeline import Lemmatizer
from ...tokens import Token from ...tokens import Token
@ -11,21 +11,16 @@ class PolishLemmatizer(Lemmatizer):
# lemmatization, as well as case-sensitive lemmatization for nouns. # lemmatization, as well as case-sensitive lemmatization for nouns.
@classmethod @classmethod
def get_lookups_config(cls, mode: str) -> Dict: def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
if mode == "pos_lookup": if mode == "pos_lookup":
return { # fmt: off
"required_tables": [ required = [
"lemma_lookup_adj", "lemma_lookup_adj", "lemma_lookup_adp", "lemma_lookup_adv",
"lemma_lookup_adp", "lemma_lookup_aux", "lemma_lookup_noun", "lemma_lookup_num",
"lemma_lookup_adv", "lemma_lookup_part", "lemma_lookup_pron", "lemma_lookup_verb"
"lemma_lookup_aux",
"lemma_lookup_noun",
"lemma_lookup_num",
"lemma_lookup_part",
"lemma_lookup_pron",
"lemma_lookup_verb",
] ]
} # fmt: on
return (required, [])
else: else:
return super().get_lookups_config(mode) return super().get_lookups_config(mode)

View File

@ -23,8 +23,9 @@ def test_lemmatizer_initialize(lang, capfd):
lookups.add_table("lemma_rules", {"verb": [["ing", ""]]}) lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
return lookups return lookups
lang_cls = get_lang_class(lang)
# Test that languages can be initialized # Test that languages can be initialized
nlp = get_lang_class(lang)() nlp = lang_cls()
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"}) lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
assert not lemmatizer.lookups.tables assert not lemmatizer.lookups.tables
nlp.config["initialize"]["components"]["lemmatizer"] = { nlp.config["initialize"]["components"]["lemmatizer"] = {
@ -41,7 +42,13 @@ def test_lemmatizer_initialize(lang, capfd):
assert doc[0].lemma_ == "y" assert doc[0].lemma_ == "y"
# Test initialization by calling .initialize() directly # Test initialization by calling .initialize() directly
nlp = get_lang_class(lang)() nlp = lang_cls()
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"}) lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
lemmatizer.initialize(lookups=lemmatizer_init_lookups()) lemmatizer.initialize(lookups=lemmatizer_init_lookups())
assert nlp("x")[0].lemma_ == "y" assert nlp("x")[0].lemma_ == "y"
# Test lookups config format
for mode in ("rule", "lookup", "pos_lookup"):
required, optional = lemmatizer.get_lookups_config(mode)
assert isinstance(required, list)
assert isinstance(optional, list)

View File

@ -191,22 +191,9 @@ Returns the lookups configuration settings for a given mode for use in
[`Lemmatizer.load_lookups`](/api/lemmatizer#load_lookups). [`Lemmatizer.load_lookups`](/api/lemmatizer#load_lookups).
| Name | Description | | Name | Description |
| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------- | -------------------------------------------------------------------------------------- |
| `mode` | The lemmatizer mode. ~~str~~ | | `mode` | The lemmatizer mode. ~~str~~ |
| **RETURNS** | The lookups configuration settings for this mode. Includes the keys `"required_tables"` and `"optional_tables"`, mapped to a list of table string names. ~~Dict[str, List[str]]~~ | | **RETURNS** | The required table names and the optional table names. ~~Tuple[List[str], List[str]]~~ |
## Lemmatizer.load_lookups {#load_lookups tag="classmethod"}
Load and validate lookups tables. If the provided lookups is `None`, load the
default lookups tables according to the language and mode settings. Confirm that
all required tables for the language and mode are present.
| Name | Description |
| ----------- | -------------------------------------------------------------------------------------------------- |
| `lang` | The language. ~~str~~ |
| `mode` | The lemmatizer mode. ~~str~~ |
| `lookups` | The provided lookups, may be `None` if the default lookups should be loaded. ~~Optional[Lookups]~~ |
| **RETURNS** | The lookups. ~~Lookups~~ |
## Lemmatizer.to_disk {#to_disk tag="method"} ## Lemmatizer.to_disk {#to_disk tag="method"}