Make lemmatizers use initialize logic (#6182)

* Make lemmatizer use initialize logic and tidy up

* Fix typo

* Raise for uninitialized tables
Ines Montani 2020-10-02 15:42:36 +02:00 committed by GitHub
parent df06f7a792
commit f0b30aedad
16 changed files with 236 additions and 281 deletions

View File

@@ -477,6 +477,8 @@ class Errors:
E201 = ("Span index out of range.")
# TODO: fix numbering after merging develop into master
E912 = ("Failed to initialize lemmatizer. Missing lemmatizer table(s) found "
"for mode '{mode}'. Required tables: {tables}. Found: {found}.")
E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
"config.cfg or override it on the CLI?")
E914 = ("Executing {name} callback failed. Expected the function to "
@@ -556,10 +558,10 @@ class Errors:
E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
"component.")
E955 = ("Can't find table(s) '{table}' for language '{lang}' in "
"spacy-lookups-data. If you want to initialize a blank nlp object, "
"make sure you have the spacy-lookups-data package installed or "
"remove the [initialize.lookups] block from your config.")
E955 = ("Can't find table(s) {table} for language '{lang}' in "
"spacy-lookups-data. Make sure you have the package installed or "
"provide your own lookup tables if no default lookups are available "
"for your language.")
E956 = ("Can't find component '{name}' in [components] block in the config. "
"Available components: {opts}")
E957 = ("Writing directly to Language.factories isn't needed anymore in "
@@ -685,9 +687,8 @@ class Errors:
E1002 = ("Span index out of range.")
E1003 = ("Unsupported lemmatizer mode '{mode}'.")
E1004 = ("Missing lemmatizer table(s) found for lemmatizer mode '{mode}'. "
"Required tables '{tables}', found '{found}'. If you are not "
"providing custom lookups, make sure you have the package "
"spacy-lookups-data installed.")
"Required tables: {tables}. Found: {found}. Maybe you forgot to "
"call nlp.initialize() to load in the data?")
E1005 = ("Unable to set attribute '{attr}' in tokenizer exception for "
"'{chunk}'. Tokenizer exceptions are only allowed to specify "
"`ORTH` and `NORM`.")

View File

@@ -4,7 +4,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .stop_words import STOP_WORDS
from ...language import Language
from ...lookups import Lookups
from ...pipeline import Lemmatizer
@@ -24,18 +23,11 @@ class Bengali(Language):
@Bengali.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "lookups": None},
default_config={"model": None, "mode": "rule"},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
lookups: Optional[Lookups],
):
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
return Lemmatizer(nlp.vocab, model, name, mode=mode)
__all__ = ["Bengali"]

View File

@@ -7,7 +7,6 @@ from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .lemmatizer import GreekLemmatizer
from ...lookups import Lookups
from ...language import Language
@@ -29,18 +28,11 @@ class Greek(Language):
@Greek.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "lookups": None},
default_config={"model": None, "mode": "rule"},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
lookups: Optional[Lookups],
):
lookups = GreekLemmatizer.load_lookups(nlp.lang, mode, lookups)
return GreekLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
return GreekLemmatizer(nlp.vocab, model, name, mode=mode)
__all__ = ["Greek"]

View File

@@ -8,7 +8,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_INFIXES
from .lemmatizer import EnglishLemmatizer
from ...language import Language
from ...lookups import Lookups
class EnglishDefaults(Language.Defaults):
@@ -27,18 +26,11 @@ class English(Language):
@English.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "lookups": None},
default_config={"model": None, "mode": "rule"},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
lookups: Optional[Lookups],
):
lookups = EnglishLemmatizer.load_lookups(nlp.lang, mode, lookups)
return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
return EnglishLemmatizer(nlp.vocab, model, name, mode=mode)
__all__ = ["English"]

View File

@@ -6,7 +6,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_SUFFIXES
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language
from ...lookups import Lookups
from ...pipeline import Lemmatizer
@@ -27,18 +26,11 @@ class Persian(Language):
@Persian.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "lookups": None},
default_config={"model": None, "mode": "rule"},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
lookups: Optional[Lookups],
):
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
return Lemmatizer(nlp.vocab, model, name, mode=mode)
__all__ = ["Persian"]

View File

@@ -9,7 +9,6 @@ from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from .lemmatizer import FrenchLemmatizer
from ...lookups import Lookups
from ...language import Language
@@ -32,18 +31,11 @@ class French(Language):
@French.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "lookups": None},
default_config={"model": None, "mode": "rule"},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
lookups: Optional[Lookups],
):
lookups = FrenchLemmatizer.load_lookups(nlp.lang, mode, lookups)
return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
return FrenchLemmatizer(nlp.vocab, model, name, mode=mode)
__all__ = ["French"]

View File

@@ -6,7 +6,6 @@ from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language
from ...lookups import Lookups
from ...pipeline import Lemmatizer
@@ -27,18 +26,11 @@ class Norwegian(Language):
@Norwegian.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "lookups": None},
default_config={"model": None, "mode": "rule"},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
lookups: Optional[Lookups],
):
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
return Lemmatizer(nlp.vocab, model, name, mode=mode)
__all__ = ["Norwegian"]

View File

@@ -1,5 +1,4 @@
from typing import Optional
from thinc.api import Model
from .stop_words import STOP_WORDS
@@ -8,7 +7,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from .lemmatizer import DutchLemmatizer
from ...lookups import Lookups
from ...language import Language
@@ -29,18 +27,11 @@ class Dutch(Language):
@Dutch.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "lookups": None},
default_config={"model": None, "mode": "rule"},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
lookups: Optional[Lookups],
):
lookups = DutchLemmatizer.load_lookups(nlp.lang, mode, lookups)
return DutchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
return DutchLemmatizer(nlp.vocab, model, name, mode=mode)
__all__ = ["Dutch"]

View File

@@ -34,18 +34,11 @@ class Polish(Language):
@Polish.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "pos_lookup", "lookups": None},
default_config={"model": None, "mode": "pos_lookup"},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
lookups: Optional[Lookups],
):
lookups = PolishLemmatizer.load_lookups(nlp.lang, mode, lookups)
return PolishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
return PolishLemmatizer(nlp.vocab, model, name, mode=mode)
__all__ = ["Polish"]

View File

@@ -6,7 +6,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import RussianLemmatizer
from ...language import Language
from ...lookups import Lookups
class RussianDefaults(Language.Defaults):
@@ -23,17 +22,11 @@ class Russian(Language):
@Russian.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "pymorphy2", "lookups": None},
default_config={"model": None, "mode": "pymorphy2"},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
lookups: Optional[Lookups],
):
return RussianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
return RussianLemmatizer(nlp.vocab, model, name, mode=mode)
__all__ = ["Russian"]

View File

@@ -5,7 +5,6 @@ from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .syntax_iterators import SYNTAX_ITERATORS
from ...language import Language
from ...lookups import Lookups
from ...pipeline import Lemmatizer
@@ -30,18 +29,11 @@ class Swedish(Language):
@Swedish.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "lookups": None},
default_config={"model": None, "mode": "rule"},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
lookups: Optional[Lookups],
):
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
return Lemmatizer(nlp.vocab, model, name, mode=mode)
__all__ = ["Swedish"]

View File

@@ -7,7 +7,6 @@ from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .lemmatizer import UkrainianLemmatizer
from ...language import Language
from ...lookups import Lookups
class UkrainianDefaults(Language.Defaults):
@@ -24,17 +23,11 @@ class Ukrainian(Language):
@Ukrainian.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "pymorphy2", "lookups": None},
default_config={"model": None, "mode": "pymorphy2"},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
nlp: Language,
model: Optional[Model],
name: str,
mode: str,
lookups: Optional[Lookups],
):
return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode)
__all__ = ["Ukrainian"]

View File

@@ -1,26 +1,25 @@
from typing import Optional, List, Dict, Any
from typing import Optional, List, Dict, Any, Callable, Iterable, Iterator, Union
from typing import Tuple
from thinc.api import Model
from pathlib import Path
from .pipe import Pipe
from ..errors import Errors
from ..language import Language
from ..training import Example
from ..lookups import Lookups, load_lookups
from ..scorer import Scorer
from ..tokens import Doc, Token
from ..vocab import Vocab
from ..training import validate_examples
from ..util import logger, SimpleFrozenList
from .. import util
@Language.factory(
"lemmatizer",
assigns=["token.lemma"],
default_config={
"model": None,
"mode": "lookup",
"lookups": None,
"overwrite": False,
},
default_config={"model": None, "mode": "lookup", "overwrite": False},
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
@@ -28,13 +27,9 @@ def make_lemmatizer(
model: Optional[Model],
name: str,
mode: str,
lookups: Optional[Lookups],
overwrite: bool = False,
):
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
return Lemmatizer(
nlp.vocab, model, name, mode=mode, lookups=lookups, overwrite=overwrite
)
return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
class Lemmatizer(Pipe):
@@ -46,59 +41,19 @@ class Lemmatizer(Pipe):
"""
@classmethod
def get_lookups_config(cls, mode: str) -> Dict:
def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
"""Returns the lookups configuration settings for a given mode for use
in Lemmatizer.load_lookups.
mode (str): The lemmatizer mode.
RETURNS (dict): The lookups configuration settings for this mode.
DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config
RETURNS (Tuple[List[str], List[str]]): The required and optional
lookup tables for this mode.
"""
if mode == "lookup":
return {
"required_tables": ["lemma_lookup"],
}
return (["lemma_lookup"], [])
elif mode == "rule":
return {
"required_tables": ["lemma_rules"],
"optional_tables": ["lemma_exc", "lemma_index"],
}
return {}
@classmethod
def load_lookups(cls, lang: str, mode: str, lookups: Optional[Lookups]) -> Lookups:
"""Load and validate lookups tables. If the provided lookups is None,
load the default lookups tables according to the language and mode
settings. Confirm that all required tables for the language and mode
are present.
lang (str): The language code.
mode (str): The lemmatizer mode.
lookups (Lookups): The provided lookups, may be None if the default
lookups should be loaded.
RETURNS (Lookups): The Lookups object.
DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config
"""
config = cls.get_lookups_config(mode)
required_tables = config.get("required_tables", [])
optional_tables = config.get("optional_tables", [])
if lookups is None:
lookups = load_lookups(lang=lang, tables=required_tables)
optional_lookups = load_lookups(
lang=lang, tables=optional_tables, strict=False
)
for table in optional_lookups.tables:
lookups.set_table(table, optional_lookups.get_table(table))
for table in required_tables:
if table not in lookups:
raise ValueError(
Errors.E1004.format(
mode=mode, tables=required_tables, found=lookups.tables
)
)
return lookups
return (["lemma_rules"], ["lemma_exc", "lemma_index"])
return ([], [])
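
A quick sketch of the new return type, based directly on the branches above:

```python
from spacy.pipeline import Lemmatizer

# Each mode now maps to a (required, optional) tuple of table names
required, optional = Lemmatizer.get_lookups_config("rule")
assert required == ["lemma_rules"]
assert optional == ["lemma_exc", "lemma_index"]
required, optional = Lemmatizer.get_lookups_config("lookup")
assert (required, optional) == (["lemma_lookup"], [])
```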
def __init__(
self,
@@ -107,7 +62,6 @@ class Lemmatizer(Pipe):
name: str = "lemmatizer",
*,
mode: str = "lookup",
lookups: Optional[Lookups] = None,
overwrite: bool = False,
) -> None:
"""Initialize a Lemmatizer.
@@ -116,9 +70,6 @@ class Lemmatizer(Pipe):
model (Model): A model (not yet implemented).
name (str): The component name. Defaults to "lemmatizer".
mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup".
lookups (Lookups): The lookups object containing the (optional) tables
such as "lemma_rules", "lemma_index", "lemma_exc" and
"lemma_lookup". Defaults to None
overwrite (bool): Whether to overwrite existing lemmas. Defaults to
`False`.
@@ -128,8 +79,9 @@ class Lemmatizer(Pipe):
self.model = model
self.name = name
self._mode = mode
self.lookups = lookups if lookups is not None else Lookups()
self.lookups = Lookups()
self.overwrite = overwrite
self._validated = False
if self.mode == "lookup":
self.lemmatize = self.lookup_lemmatize
elif self.mode == "rule":
@@ -153,12 +105,56 @@ class Lemmatizer(Pipe):
DOCS: https://nightly.spacy.io/api/lemmatizer#call
"""
if not self._validated:
self._validate_tables(Errors.E1004)
for token in doc:
if self.overwrite or token.lemma == 0:
token.lemma_ = self.lemmatize(token)[0]
return doc
def pipe(self, stream, *, batch_size=128):
def initialize(
self,
get_examples: Optional[Callable[[], Iterable[Example]]] = None,
*,
nlp: Optional[Language] = None,
lookups: Optional[Lookups] = None,
):
"""Initialize the lemmatizer and load in data.
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
nlp (Language): The current nlp object the component is part of.
lookups (Lookups): The lookups object containing the (optional) tables
such as "lemma_rules", "lemma_index", "lemma_exc" and
"lemma_lookup". Defaults to None.
"""
required_tables, optional_tables = self.get_lookups_config(self.mode)
if lookups is None:
logger.debug("Lemmatizer: loading tables from spacy-lookups-data")
lookups = load_lookups(lang=self.vocab.lang, tables=required_tables)
optional_lookups = load_lookups(
lang=self.vocab.lang, tables=optional_tables, strict=False
)
for table in optional_lookups.tables:
lookups.set_table(table, optional_lookups.get_table(table))
self.lookups = lookups
self._validate_tables(Errors.E1004)
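
Custom tables can also be provided through the config, which is how `Language.initialize` feeds this method. A sketch mirroring the updated tests (the `@misc` function name and table data are placeholders):

```python
import spacy
from spacy.lookups import Lookups
from spacy.util import registry

@registry.misc("my_lemma_lookups.v1")  # placeholder name
def my_lemma_lookups() -> Lookups:
    lookups = Lookups()
    lookups.add_table("lemma_lookup", {"coped": "cope"})
    return lookups

nlp = spacy.blank("en")
nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
nlp.config["initialize"]["components"]["lemmatizer"] = {
    "lookups": {"@misc": "my_lemma_lookups.v1"}
}
nlp.initialize()
assert nlp("coped")[0].lemma_ == "cope"
```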
def _validate_tables(self, error_message: str = Errors.E912) -> None:
"""Check that the lookups are correct for the current mode."""
required_tables, optional_tables = self.get_lookups_config(self.mode)
for table in required_tables:
if table not in self.lookups:
raise ValueError(
error_message.format(
mode=self.mode,
tables=required_tables,
found=self.lookups.tables,
)
)
self._validated = True
def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
"""Apply the pipe to a stream of documents. This usually happens under
the hood when the nlp object is called on a text and all components are
applied to the Doc.
@@ -263,7 +259,7 @@ class Lemmatizer(Pipe):
"""
return False
def score(self, examples, **kwargs) -> Dict[str, Any]:
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
"""Score a batch of examples.
examples (Iterable[Example]): The examples to score.
@@ -274,58 +270,66 @@ class Lemmatizer(Pipe):
validate_examples(examples, "Lemmatizer.score")
return Scorer.score_token_attr(examples, "lemma", **kwargs)
def to_disk(self, path, *, exclude=tuple()):
"""Save the current state to a directory.
def to_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
):
"""Serialize the pipe to disk.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist.
exclude (list): String names of serialization fields to exclude.
path (str / Path): Path to a directory.
exclude (Iterable[str]): String names of serialization fields to exclude.
DOCS: https://nightly.spacy.io/api/vocab#to_disk
DOCS: https://nightly.spacy.io/api/lemmatizer#to_disk
"""
serialize = {}
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
serialize["lookups"] = lambda p: self.lookups.to_disk(p)
util.to_disk(path, serialize, exclude)
def from_disk(self, path, *, exclude=tuple()):
"""Loads state from a directory. Modifies the object in place and
returns it.
def from_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
) -> "Lemmatizer":
"""Load the pipe from disk. Modifies the object in place and returns it.
path (unicode or Path): A path to a directory.
exclude (list): String names of serialization fields to exclude.
RETURNS (Vocab): The modified `Vocab` object.
path (str / Path): Path to a directory.
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (Lemmatizer): The modified Lemmatizer object.
DOCS: https://nightly.spacy.io/api/vocab#to_disk
DOCS: https://nightly.spacy.io/api/lemmatizer#from_disk
"""
deserialize = {}
deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
deserialize["lookups"] = lambda p: self.lookups.from_disk(p)
util.from_disk(path, deserialize, exclude)
self._validate_tables()
return self
def to_bytes(self, *, exclude=tuple()) -> bytes:
"""Serialize the current state to a binary string.
def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
"""Serialize the pipe to a bytestring.
exclude (list): String names of serialization fields to exclude.
RETURNS (bytes): The serialized form of the `Vocab` object.
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (bytes): The serialized object.
DOCS: https://nightly.spacy.io/api/vocab#to_bytes
DOCS: https://nightly.spacy.io/api/lemmatizer#to_bytes
"""
serialize = {}
serialize["vocab"] = self.vocab.to_bytes
serialize["lookups"] = self.lookups.to_bytes
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data: bytes, *, exclude=tuple()):
"""Load state from a binary string.
def from_bytes(
self, bytes_data: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
) -> "Lemmatizer":
"""Load the pipe from a bytestring.
bytes_data (bytes): The data to load from.
exclude (list): String names of serialization fields to exclude.
RETURNS (Vocab): The `Vocab` object.
bytes_data (bytes): The serialized pipe.
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (Lemmatizer): The loaded Lemmatizer.
DOCS: https://nightly.spacy.io/api/vocab#from_bytes
DOCS: https://nightly.spacy.io/api/lemmatizer#from_bytes
"""
deserialize = {}
deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
deserialize["lookups"] = lambda b: self.lookups.from_bytes(b)
util.from_bytes(bytes_data, deserialize, exclude)
self._validate_tables()
return self
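
Because `from_disk` and `from_bytes` now call `_validate_tables`, a deserialized lemmatizer is immediately usable (or fails fast) without an extra `initialize()` call. A sketch of the roundtrip, modeled on the serialization test below:

```python
import spacy
from spacy.lookups import Lookups

nlp = spacy.blank("en")
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
lookups = Lookups()
lookups.add_table("lemma_lookup", {"coped": "cope"})
lemmatizer.initialize(lookups=lookups)

nlp2 = spacy.blank("en")
lemmatizer2 = nlp2.add_pipe("lemmatizer", config={"mode": "lookup"})
lemmatizer2.from_bytes(lemmatizer.to_bytes())  # restores and re-validates tables
assert lemmatizer.lookups.tables == lemmatizer2.lookups.tables
assert nlp2("coped")[0].lemma_ == "cope"
```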

View File

@@ -17,16 +17,31 @@ def test_lemmatizer_initialize(lang, capfd):
@registry.misc("lemmatizer_init_lookups")
def lemmatizer_init_lookups():
lookups = Lookups()
lookups.add_table("lemma_lookup", {"cope": "cope"})
lookups.add_table("lemma_lookup", {"cope": "cope", "x": "y"})
lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
return lookups
"""Test that languages can be initialized."""
# Test that languages can be initialized
nlp = get_lang_class(lang)()
nlp.add_pipe("lemmatizer", config={"lookups": {"@misc": "lemmatizer_init_lookups"}})
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
assert not lemmatizer.lookups.tables
nlp.config["initialize"]["components"]["lemmatizer"] = {
"lookups": {"@misc": "lemmatizer_init_lookups"}
}
with pytest.raises(ValueError):
nlp("x")
nlp.initialize()
assert lemmatizer.lookups.tables
doc = nlp("x")
# Check for stray print statements (see #3342)
doc = nlp("test") # noqa: F841
captured = capfd.readouterr()
assert not captured.out
assert doc[0].lemma_ == "y"
# Test initialization by calling .initialize() directly
nlp = get_lang_class(lang)()
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
lemmatizer.initialize(lookups=lemmatizer_init_lookups())
assert nlp("x")[0].lemma_ == "y"

View File

@@ -8,61 +8,52 @@ from ..util import make_tempdir
@pytest.fixture
def nlp():
return English()
@pytest.fixture
def lemmatizer(nlp):
@registry.misc("cope_lookups")
def cope_lookups():
lookups = Lookups()
lookups.add_table("lemma_lookup", {"cope": "cope"})
lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"})
lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
return lookups
lemmatizer = nlp.add_pipe(
"lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
)
return lemmatizer
nlp = English()
nlp.config["initialize"]["components"]["lemmatizer"] = {
"lookups": {"@misc": "cope_lookups"}
}
return nlp
def test_lemmatizer_init(nlp):
@registry.misc("cope_lookups")
def cope_lookups():
lookups = Lookups()
lookups.add_table("lemma_lookup", {"cope": "cope"})
lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
return lookups
lemmatizer = nlp.add_pipe(
"lemmatizer", config={"mode": "lookup", "lookups": {"@misc": "cope_lookups"}}
)
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
assert isinstance(lemmatizer.lookups, Lookups)
assert not lemmatizer.lookups.tables
assert lemmatizer.mode == "lookup"
with pytest.raises(ValueError):
nlp("test")
nlp.initialize()
assert lemmatizer.lookups.tables
assert nlp("cope")[0].lemma_ == "cope"
assert nlp("coped")[0].lemma_ == "cope"
# replace any tables from spacy-lookups-data
lemmatizer.lookups = Lookups()
doc = nlp("coping")
# lookup with no tables sets text as lemma
assert doc[0].lemma_ == "coping"
assert nlp("cope")[0].lemma_ == "cope"
assert nlp("coped")[0].lemma_ == "coped"
nlp.remove_pipe("lemmatizer")
@registry.misc("empty_lookups")
def empty_lookups():
return Lookups()
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
with pytest.raises(ValueError):
nlp.add_pipe(
"lemmatizer",
config={"mode": "lookup", "lookups": {"@misc": "empty_lookups"}},
)
# Can't initialize without required tables
lemmatizer.initialize(lookups=Lookups())
lookups = Lookups()
lookups.add_table("lemma_lookup", {})
lemmatizer.initialize(lookups=lookups)
def test_lemmatizer_config(nlp, lemmatizer):
def test_lemmatizer_config(nlp):
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
nlp.initialize()
doc = nlp.make_doc("coping")
doc[0].pos_ = "VERB"
assert doc[0].lemma_ == ""
@@ -78,20 +69,21 @@ def test_lemmatizer_config(nlp, lemmatizer):
assert doc[0].lemma_ == "cope"
def test_lemmatizer_serialize(nlp, lemmatizer):
@registry.misc("cope_lookups")
def test_lemmatizer_serialize(nlp):
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
nlp.initialize()
def cope_lookups():
lookups = Lookups()
lookups.add_table("lemma_lookup", {"cope": "cope"})
lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"})
lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
return lookups
nlp2 = English()
lemmatizer2 = nlp2.add_pipe(
"lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
)
lemmatizer2 = nlp2.add_pipe("lemmatizer", config={"mode": "rule"})
lemmatizer2.initialize(lookups=cope_lookups())
lemmatizer2.from_bytes(lemmatizer.to_bytes())
assert lemmatizer.to_bytes() == lemmatizer2.to_bytes()
assert lemmatizer.lookups.tables == lemmatizer2.lookups.tables
@@ -100,9 +92,9 @@ def test_lemmatizer_serialize(nlp, lemmatizer):
with make_tempdir() as tmp_dir:
nlp.to_disk(tmp_dir)
nlp2 = util.load_model_from_path(tmp_dir)
doc2 = nlp2.make_doc("coping")
doc2[0].pos_ = "VERB"
assert doc2[0].lemma_ == ""
doc2 = lemmatizer(doc2)
assert doc2[0].text == "coping"
assert doc2[0].lemma_ == "cope"
doc2 = nlp2.make_doc("coping")
doc2[0].pos_ = "VERB"
assert doc2[0].lemma_ == ""
doc2 = lemmatizer(doc2)
assert doc2[0].text == "coping"
assert doc2[0].lemma_ == "cope"

View File

@@ -48,12 +48,11 @@ data format used by the lookup and rule-based lemmatizers, see
> nlp.add_pipe("lemmatizer", config=config)
> ```
| Setting | Description |
| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `mode` | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ |
| `lookups` | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `None`. ~~Optional[Lookups]~~ |
| `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~ |
| `model` | **Not yet implemented:** the model to use. ~~Model~~ |
| Setting | Description |
| ----------- | --------------------------------------------------------------------------------- |
| `mode` | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ |
| `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~ |
| `model` | **Not yet implemented:** the model to use. ~~Model~~ |
```python
%%GITHUB_SPACY/spacy/pipeline/lemmatizer.py
@@ -76,15 +75,14 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).
| Name | Description |
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | **Not yet implemented:** The model to use. ~~Model~~ |
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| _keyword-only_ | |
| mode | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ |
| lookups | A lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. Defaults to `None`. ~~Optional[Lookups]~~ |
| overwrite | Whether to overwrite existing lemmas. ~~bool~ |
| Name           | Description                                                                                          |
| -------------- | ---------------------------------------------------------------------------------------------------- |
| `vocab`        | The shared vocabulary. ~~Vocab~~                                                                       |
| `model`        | **Not yet implemented:** The model to use. ~~Model~~                                                   |
| `name`         | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~    |
| _keyword-only_ |                                                                                                        |
| `mode`         | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~                      |
| `overwrite`    | Whether to overwrite existing lemmas. ~~bool~~                                                         |
## Lemmatizer.\_\_call\_\_ {#call tag="method"}
@@ -127,6 +125,37 @@ applied to the `Doc` in order.
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ |
## Lemmatizer.initialize {#initialize tag="method"}
Initialize the lemmatizer and load any data resources. This method is typically
called by [`Language.initialize`](/api/language#initialize) and lets you
customize arguments it receives via the
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
config. The loading only happens during initialization, typically before
training. At runtime, all data is loaded from disk.
> #### Example
>
> ```python
> lemmatizer = nlp.add_pipe("lemmatizer")
> lemmatizer.initialize(lookups=lookups)
> ```
>
> ```ini
> ### config.cfg
> [initialize.components.lemmatizer]
>
> [initialize.components.lemmatizer.lookups]
> @misc = "load_my_lookups.v1"
> ```
| Name | Description |
| -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Defaults to `None`. ~~Optional[Callable[[], Iterable[Example]]]~~ |
| _keyword-only_ | |
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
| `lookups` | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `None`. ~~Optional[Lookups]~~ |
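
For reference, the registered function from the config example above could be implemented roughly like this (`load_my_lookups.v1` is the placeholder name from the example, and the table contents are illustrative):

```python
### functions.py
from spacy.lookups import Lookups
from spacy.util import registry

@registry.misc("load_my_lookups.v1")
def load_my_lookups() -> Lookups:
    lookups = Lookups()
    lookups.add_table("lemma_lookup", {"coped": "cope"})  # placeholder data
    return lookups
```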
## Lemmatizer.lookup_lemmatize {#lookup_lemmatize tag="method"}
Lemmatize a token using a lookup-based approach. If no lemma is found, the