diff --git a/spacy/errors.py b/spacy/errors.py
index 881a697f6..4edd1cbae 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -477,6 +477,8 @@ class Errors:
     E201 = ("Span index out of range.")
 
     # TODO: fix numbering after merging develop into master
+    E912 = ("Failed to initialize lemmatizer. Missing lemmatizer table(s) found "
+            "for mode '{mode}'. Required tables: {tables}. Found: {found}.")
     E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
             "config.cfg or override it on the CLI?")
     E914 = ("Executing {name} callback failed. Expected the function to "
@@ -556,10 +558,10 @@ class Errors:
     E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
     E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
             "component.")
-    E955 = ("Can't find table(s) '{table}' for language '{lang}' in "
-            "spacy-lookups-data. If you want to initialize a blank nlp object, "
-            "make sure you have the spacy-lookups-data package installed or "
-            "remove the [initialize.lookups] block from your config.")
+    E955 = ("Can't find table(s) {table} for language '{lang}' in "
+            "spacy-lookups-data. Make sure you have the package installed or "
+            "provide your own lookup tables if no default lookups are available "
+            "for your language.")
     E956 = ("Can't find component '{name}' in [components] block in the config. "
             "Available components: {opts}")
     E957 = ("Writing directly to Language.factories isn't needed anymore in "
@@ -685,9 +687,8 @@ class Errors:
     E1002 = ("Span index out of range.")
     E1003 = ("Unsupported lemmatizer mode '{mode}'.")
     E1004 = ("Missing lemmatizer table(s) found for lemmatizer mode '{mode}'. "
-             "Required tables '{tables}', found '{found}'. If you are not "
-             "providing custom lookups, make sure you have the package "
-             "spacy-lookups-data installed.")
+             "Required tables: {tables}. Found: {found}. Maybe you forgot to "
+             "call nlp.initialize() to load in the data?")
     E1005 = ("Unable to set attribute '{attr}' in tokenizer exception for "
              "'{chunk}'. Tokenizer exceptions are only allowed to specify "
              "`ORTH` and `NORM`.")
diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py
index 923e29a17..879229888 100644
--- a/spacy/lang/bn/__init__.py
+++ b/spacy/lang/bn/__init__.py
@@ -4,7 +4,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer
 
 
@@ -24,18 +23,11 @@ class Bengali(Language):
 @Bengali.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Bengali"]
diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py
index 1a7b19914..53069334e 100644
--- a/spacy/lang/el/__init__.py
+++ b/spacy/lang/el/__init__.py
@@ -7,7 +7,6 @@ from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .lemmatizer import GreekLemmatizer
-from ...lookups import Lookups
 from ...language import Language
 
 
@@ -29,18 +28,11 @@ class Greek(Language):
 @Greek.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = GreekLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return GreekLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return GreekLemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Greek"]
diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py
index cc01f1aea..3a3ebeefd 100644
--- a/spacy/lang/en/__init__.py
+++ b/spacy/lang/en/__init__.py
@@ -8,7 +8,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_INFIXES
 from .lemmatizer import EnglishLemmatizer
 from ...language import Language
-from ...lookups import Lookups
 
 
 class EnglishDefaults(Language.Defaults):
@@ -27,18 +26,11 @@ class English(Language):
 @English.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = EnglishLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return EnglishLemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["English"]
diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py
index f3a6635dc..77ee3bca3 100644
--- a/spacy/lang/fa/__init__.py
+++ b/spacy/lang/fa/__init__.py
@@ -6,7 +6,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_SUFFIXES
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer
 
 
@@ -27,18 +26,11 @@ class Persian(Language):
 @Persian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Persian"]
diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py
index 72e641d1f..1e0011fba 100644
--- a/spacy/lang/fr/__init__.py
+++ b/spacy/lang/fr/__init__.py
@@ -9,7 +9,6 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .lemmatizer import FrenchLemmatizer
-from ...lookups import Lookups
 from ...language import Language
 
 
@@ -32,18 +31,11 @@ class French(Language):
 @French.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = FrenchLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return FrenchLemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["French"]
diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py
index 9672dfd6e..62d7707f3 100644
--- a/spacy/lang/nb/__init__.py
+++ b/spacy/lang/nb/__init__.py
@@ -6,7 +6,6 @@ from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer
 
 
@@ -27,18 +26,11 @@ class Norwegian(Language):
 @Norwegian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Norwegian"]
diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py
index 15b6b9de2..a3591f1bf 100644
--- a/spacy/lang/nl/__init__.py
+++ b/spacy/lang/nl/__init__.py
@@ -1,5 +1,4 @@
 from typing import Optional
-
 from thinc.api import Model
 
 from .stop_words import STOP_WORDS
@@ -8,7 +7,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .lemmatizer import DutchLemmatizer
-from ...lookups import Lookups
 from ...language import Language
 
 
@@ -29,18 +27,11 @@ class Dutch(Language):
 @Dutch.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = DutchLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return DutchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return DutchLemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Dutch"]
diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py
index 573dbc6f9..f7be8a6c2 100644
--- a/spacy/lang/pl/__init__.py
+++ b/spacy/lang/pl/__init__.py
@@ -34,18 +34,11 @@ class Polish(Language):
 @Polish.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pos_lookup", "lookups": None},
+    default_config={"model": None, "mode": "pos_lookup"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = PolishLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return PolishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return PolishLemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Polish"]
diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py
index 6436ae0c7..1d59ca043 100644
--- a/spacy/lang/ru/__init__.py
+++ b/spacy/lang/ru/__init__.py
@@ -6,7 +6,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import RussianLemmatizer
 from ...language import Language
-from ...lookups import Lookups
 
 
 class RussianDefaults(Language.Defaults):
@@ -23,17 +22,11 @@ class Russian(Language):
 @Russian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pymorphy2", "lookups": None},
+    default_config={"model": None, "mode": "pymorphy2"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    return RussianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return RussianLemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Russian"]
diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py
index ea314f487..2490eb9ec 100644
--- a/spacy/lang/sv/__init__.py
+++ b/spacy/lang/sv/__init__.py
@@ -5,7 +5,6 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer
 
 
@@ -30,18 +29,11 @@ class Swedish(Language):
 @Swedish.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Swedish"]
diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py
index 006a1cf7f..73c065379 100644
--- a/spacy/lang/uk/__init__.py
+++ b/spacy/lang/uk/__init__.py
@@ -7,7 +7,6 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import UkrainianLemmatizer
 from ...language import Language
-from ...lookups import Lookups
 
 
 class UkrainianDefaults(Language.Defaults):
@@ -24,17 +23,11 @@ class Ukrainian(Language):
 @Ukrainian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pymorphy2", "lookups": None},
+    default_config={"model": None, "mode": "pymorphy2"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Ukrainian"]
diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py
index 391769604..9be596868 100644
--- a/spacy/pipeline/lemmatizer.py
+++ b/spacy/pipeline/lemmatizer.py
@@ -1,26 +1,25 @@
-from typing import Optional, List, Dict, Any
+from typing import Optional, List, Dict, Any, Callable, Iterable, Iterator, Union
+from typing import Tuple
 from thinc.api import Model
+from pathlib import Path
 
 from .pipe import Pipe
 from ..errors import Errors
 from ..language import Language
+from ..training import Example
 from ..lookups import Lookups, load_lookups
 from ..scorer import Scorer
 from ..tokens import Doc, Token
 from ..vocab import Vocab
 from ..training import validate_examples
+from ..util import logger, SimpleFrozenList
 from .. import util
 
 
 @Language.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={
-        "model": None,
-        "mode": "lookup",
-        "lookups": None,
-        "overwrite": False,
-    },
+    default_config={"model": None, "mode": "lookup", "overwrite": False},
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
@@ -28,13 +27,9 @@ def make_lemmatizer(
     model: Optional[Model],
     name: str,
     mode: str,
-    lookups: Optional[Lookups],
     overwrite: bool = False,
 ):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(
-        nlp.vocab, model, name, mode=mode, lookups=lookups, overwrite=overwrite
-    )
+    return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
 
 
 class Lemmatizer(Pipe):
@@ -46,59 +41,19 @@ class Lemmatizer(Pipe):
     """
 
     @classmethod
-    def get_lookups_config(cls, mode: str) -> Dict:
+    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
         """Returns the lookups configuration settings for a given mode for use
         in Lemmatizer.load_lookups.
 
         mode (str): The lemmatizer mode.
-        RETURNS (dict): The lookups configuration settings for this mode.
-
-        DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config
+        RETURNS (Tuple[List[str], List[str]]): The required and optional
+            lookup tables for this mode.
         """
         if mode == "lookup":
-            return {
-                "required_tables": ["lemma_lookup"],
-            }
+            return (["lemma_lookup"], [])
         elif mode == "rule":
-            return {
-                "required_tables": ["lemma_rules"],
-                "optional_tables": ["lemma_exc", "lemma_index"],
-            }
-        return {}
-
-    @classmethod
-    def load_lookups(cls, lang: str, mode: str, lookups: Optional[Lookups]) -> Lookups:
-        """Load and validate lookups tables. If the provided lookups is None,
-        load the default lookups tables according to the language and mode
-        settings. Confirm that all required tables for the language and mode
-        are present.
-
-        lang (str): The language code.
-        mode (str): The lemmatizer mode.
-        lookups (Lookups): The provided lookups, may be None if the default
-            lookups should be loaded.
-        RETURNS (Lookups): The Lookups object.
-
-        DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config
-        """
-        config = cls.get_lookups_config(mode)
-        required_tables = config.get("required_tables", [])
-        optional_tables = config.get("optional_tables", [])
-        if lookups is None:
-            lookups = load_lookups(lang=lang, tables=required_tables)
-            optional_lookups = load_lookups(
-                lang=lang, tables=optional_tables, strict=False
-            )
-            for table in optional_lookups.tables:
-                lookups.set_table(table, optional_lookups.get_table(table))
-        for table in required_tables:
-            if table not in lookups:
-                raise ValueError(
-                    Errors.E1004.format(
-                        mode=mode, tables=required_tables, found=lookups.tables
-                    )
-                )
-        return lookups
+            return (["lemma_rules"], ["lemma_exc", "lemma_index"])
+        return ([], [])
 
     def __init__(
         self,
@@ -107,7 +62,6 @@ class Lemmatizer(Pipe):
         name: str = "lemmatizer",
         *,
         mode: str = "lookup",
-        lookups: Optional[Lookups] = None,
         overwrite: bool = False,
     ) -> None:
         """Initialize a Lemmatizer.
@@ -116,9 +70,6 @@ class Lemmatizer(Pipe):
         model (Model): A model (not yet implemented).
         name (str): The component name. Defaults to "lemmatizer".
         mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup".
-        lookups (Lookups): The lookups object containing the (optional) tables
-            such as "lemma_rules", "lemma_index", "lemma_exc" and
-            "lemma_lookup". Defaults to None
         overwrite (bool): Whether to overwrite existing lemmas. Defaults to
             `False`.
 
@@ -128,8 +79,9 @@ class Lemmatizer(Pipe):
         self.model = model
         self.name = name
         self._mode = mode
-        self.lookups = lookups if lookups is not None else Lookups()
+        self.lookups = Lookups()
         self.overwrite = overwrite
+        self._validated = False
         if self.mode == "lookup":
             self.lemmatize = self.lookup_lemmatize
         elif self.mode == "rule":
@@ -153,12 +105,56 @@ class Lemmatizer(Pipe):
 
         DOCS: https://nightly.spacy.io/api/lemmatizer#call
         """
+        if not self._validated:
+            self._validate_tables(Errors.E1004)
         for token in doc:
             if self.overwrite or token.lemma == 0:
                 token.lemma_ = self.lemmatize(token)[0]
         return doc
 
-    def pipe(self, stream, *, batch_size=128):
+    def initialize(
+        self,
+        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
+        *,
+        nlp: Optional[Language] = None,
+        lookups: Optional[Lookups] = None,
+    ):
+        """Initialize the lemmatizer and load in data.
+
+        get_examples (Callable[[], Iterable[Example]]): Function that
+            returns a representative sample of gold-standard Example objects.
+        nlp (Language): The current nlp object the component is part of.
+        lookups (Lookups): The lookups object containing the (optional) tables
+            such as "lemma_rules", "lemma_index", "lemma_exc" and
+            "lemma_lookup". Defaults to None.
+        """
+        required_tables, optional_tables = self.get_lookups_config(self.mode)
+        if lookups is None:
+            logger.debug("Lemmatizer: loading tables from spacy-lookups-data")
+            lookups = load_lookups(lang=self.vocab.lang, tables=required_tables)
+            optional_lookups = load_lookups(
+                lang=self.vocab.lang, tables=optional_tables, strict=False
+            )
+            for table in optional_lookups.tables:
+                lookups.set_table(table, optional_lookups.get_table(table))
+        self.lookups = lookups
+        self._validate_tables(Errors.E1004)
+
+    def _validate_tables(self, error_message: str = Errors.E912) -> None:
+        """Check that the lookups are correct for the current mode."""
+        required_tables, optional_tables = self.get_lookups_config(self.mode)
+        for table in required_tables:
+            if table not in self.lookups:
+                raise ValueError(
+                    error_message.format(
+                        mode=self.mode,
+                        tables=required_tables,
+                        found=self.lookups.tables,
+                    )
+                )
+        self._validated = True
+
+    def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
         """Apply the pipe to a stream of documents. This usually happens under
         the hood when the nlp object is called on a text and all components are
         applied to the Doc.
@@ -263,7 +259,7 @@ class Lemmatizer(Pipe):
         """
         return False
 
-    def score(self, examples, **kwargs) -> Dict[str, Any]:
+    def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
         """Score a batch of examples.
 
         examples (Iterable[Example]): The examples to score.
@@ -274,58 +270,66 @@ class Lemmatizer(Pipe):
         validate_examples(examples, "Lemmatizer.score")
         return Scorer.score_token_attr(examples, "lemma", **kwargs)
 
-    def to_disk(self, path, *, exclude=tuple()):
-        """Save the current state to a directory.
+    def to_disk(
+        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
+    ):
+        """Serialize the pipe to disk.
 
-        path (unicode or Path): A path to a directory, which will be created if
-            it doesn't exist.
-        exclude (list): String names of serialization fields to exclude.
+        path (str / Path): Path to a directory.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
 
-        DOCS: https://nightly.spacy.io/api/vocab#to_disk
+        DOCS: https://nightly.spacy.io/api/lemmatizer#to_disk
         """
         serialize = {}
         serialize["vocab"] = lambda p: self.vocab.to_disk(p)
         serialize["lookups"] = lambda p: self.lookups.to_disk(p)
         util.to_disk(path, serialize, exclude)
 
-    def from_disk(self, path, *, exclude=tuple()):
-        """Loads state from a directory. Modifies the object in place and
-        returns it.
+    def from_disk(
+        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
+    ) -> "Lemmatizer":
+        """Load the pipe from disk. Modifies the object in place and returns it.
 
-        path (unicode or Path): A path to a directory.
-        exclude (list): String names of serialization fields to exclude.
-        RETURNS (Vocab): The modified `Vocab` object.
+        path (str / Path): Path to a directory.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (Lemmatizer): The modified Lemmatizer object.
 
-        DOCS: https://nightly.spacy.io/api/vocab#to_disk
+        DOCS: https://nightly.spacy.io/api/lemmatizer#from_disk
         """
         deserialize = {}
         deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
         deserialize["lookups"] = lambda p: self.lookups.from_disk(p)
         util.from_disk(path, deserialize, exclude)
+        self._validate_tables()
+        return self
 
-    def to_bytes(self, *, exclude=tuple()) -> bytes:
-        """Serialize the current state to a binary string.
+    def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
+        """Serialize the pipe to a bytestring.
 
-        exclude (list): String names of serialization fields to exclude.
-        RETURNS (bytes): The serialized form of the `Vocab` object.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (bytes): The serialized object.
 
-        DOCS: https://nightly.spacy.io/api/vocab#to_bytes
+        DOCS: https://nightly.spacy.io/api/lemmatizer#to_bytes
         """
         serialize = {}
         serialize["vocab"] = self.vocab.to_bytes
         serialize["lookups"] = self.lookups.to_bytes
         return util.to_bytes(serialize, exclude)
 
-    def from_bytes(self, bytes_data: bytes, *, exclude=tuple()):
-        """Load state from a binary string.
+    def from_bytes(
+        self, bytes_data: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
+    ) -> "Lemmatizer":
+        """Load the pipe from a bytestring.
 
-        bytes_data (bytes): The data to load from.
-        exclude (list): String names of serialization fields to exclude.
-        RETURNS (Vocab): The `Vocab` object.
+        bytes_data (bytes): The serialized pipe.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (Lemmatizer): The loaded Lemmatizer.
 
-        DOCS: https://nightly.spacy.io/api/vocab#from_bytes
+        DOCS: https://nightly.spacy.io/api/lemmatizer#from_bytes
         """
         deserialize = {}
         deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
         deserialize["lookups"] = lambda b: self.lookups.from_bytes(b)
         util.from_bytes(bytes_data, deserialize, exclude)
+        self._validate_tables()
+        return self
diff --git a/spacy/tests/lang/test_lemmatizers.py b/spacy/tests/lang/test_lemmatizers.py
index 6e7f82341..5f45664eb 100644
--- a/spacy/tests/lang/test_lemmatizers.py
+++ b/spacy/tests/lang/test_lemmatizers.py
@@ -17,16 +17,31 @@ def test_lemmatizer_initialize(lang, capfd):
     @registry.misc("lemmatizer_init_lookups")
     def lemmatizer_init_lookups():
         lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
+        lookups.add_table("lemma_lookup", {"cope": "cope", "x": "y"})
         lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
         lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
         lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
         return lookups
 
-    """Test that languages can be initialized."""
+    # Test that languages can be initialized
     nlp = get_lang_class(lang)()
-    nlp.add_pipe("lemmatizer", config={"lookups": {"@misc": "lemmatizer_init_lookups"}})
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
+    assert not lemmatizer.lookups.tables
+    nlp.config["initialize"]["components"]["lemmatizer"] = {
+        "lookups": {"@misc": "lemmatizer_init_lookups"}
+    }
+    with pytest.raises(ValueError):
+        nlp("x")
+    nlp.initialize()
+    assert lemmatizer.lookups.tables
+    doc = nlp("x")
     # Check for stray print statements (see #3342)
-    doc = nlp("test")  # noqa: F841
     captured = capfd.readouterr()
     assert not captured.out
+    assert doc[0].lemma_ == "y"
+
+    # Test initialization by calling .initialize() directly
+    nlp = get_lang_class(lang)()
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
+    lemmatizer.initialize(lookups=lemmatizer_init_lookups())
+    assert nlp("x")[0].lemma_ == "y"
diff --git a/spacy/tests/pipeline/test_lemmatizer.py b/spacy/tests/pipeline/test_lemmatizer.py
index 05e15bc16..d37c87059 100644
--- a/spacy/tests/pipeline/test_lemmatizer.py
+++ b/spacy/tests/pipeline/test_lemmatizer.py
@@ -8,61 +8,52 @@ from ..util import make_tempdir
 
 @pytest.fixture
 def nlp():
-    return English()
-
-
-@pytest.fixture
-def lemmatizer(nlp):
     @registry.misc("cope_lookups")
     def cope_lookups():
         lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
+        lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"})
         lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
         lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
         lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
         return lookups
 
-    lemmatizer = nlp.add_pipe(
-        "lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
-    )
-    return lemmatizer
+    nlp = English()
+    nlp.config["initialize"]["components"]["lemmatizer"] = {
+        "lookups": {"@misc": "cope_lookups"}
+    }
+    return nlp
 
 
 def test_lemmatizer_init(nlp):
-    @registry.misc("cope_lookups")
-    def cope_lookups():
-        lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
-        lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
-        lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
-        lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
-        return lookups
-
-    lemmatizer = nlp.add_pipe(
-        "lemmatizer", config={"mode": "lookup", "lookups": {"@misc": "cope_lookups"}}
-    )
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
     assert isinstance(lemmatizer.lookups, Lookups)
+    assert not lemmatizer.lookups.tables
     assert lemmatizer.mode == "lookup"
+    with pytest.raises(ValueError):
+        nlp("test")
+    nlp.initialize()
+    assert lemmatizer.lookups.tables
+    assert nlp("cope")[0].lemma_ == "cope"
+    assert nlp("coped")[0].lemma_ == "cope"
     # replace any tables from spacy-lookups-data
     lemmatizer.lookups = Lookups()
-    doc = nlp("coping")
     # lookup with no tables sets text as lemma
-    assert doc[0].lemma_ == "coping"
-
+    assert nlp("cope")[0].lemma_ == "cope"
+    assert nlp("coped")[0].lemma_ == "coped"
     nlp.remove_pipe("lemmatizer")
-
-    @registry.misc("empty_lookups")
-    def empty_lookups():
-        return Lookups()
-
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
     with pytest.raises(ValueError):
-        nlp.add_pipe(
-            "lemmatizer",
-            config={"mode": "lookup", "lookups": {"@misc": "empty_lookups"}},
-        )
+        # Can't initialize without required tables
+        lemmatizer.initialize(lookups=Lookups())
+    lookups = Lookups()
+    lookups.add_table("lemma_lookup", {})
+    lemmatizer.initialize(lookups=lookups)
 
 
-def test_lemmatizer_config(nlp, lemmatizer):
+def test_lemmatizer_config(nlp):
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
+    nlp.initialize()
+
     doc = nlp.make_doc("coping")
     doc[0].pos_ = "VERB"
     assert doc[0].lemma_ == ""
@@ -78,20 +69,21 @@ def test_lemmatizer_config(nlp):
     assert doc[0].lemma_ == "cope"
 
 
-def test_lemmatizer_serialize(nlp, lemmatizer):
-    @registry.misc("cope_lookups")
+def test_lemmatizer_serialize(nlp):
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
+    nlp.initialize()
+
     def cope_lookups():
         lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
+        lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"})
         lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
         lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
         lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
         return lookups
 
     nlp2 = English()
-    lemmatizer2 = nlp2.add_pipe(
-        "lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
-    )
+    lemmatizer2 = nlp2.add_pipe("lemmatizer", config={"mode": "rule"})
+    lemmatizer2.initialize(lookups=cope_lookups())
     lemmatizer2.from_bytes(lemmatizer.to_bytes())
     assert lemmatizer.to_bytes() == lemmatizer2.to_bytes()
    assert lemmatizer.lookups.tables == lemmatizer2.lookups.tables
@@ -100,9 +92,9 @@ def test_lemmatizer_serialize(nlp, lemmatizer):
     with make_tempdir() as tmp_dir:
         nlp.to_disk(tmp_dir)
         nlp2 = util.load_model_from_path(tmp_dir)
-    doc2 = nlp2.make_doc("coping")
-    doc2[0].pos_ = "VERB"
-    assert doc2[0].lemma_ == ""
-    doc2 = lemmatizer(doc2)
-    assert doc2[0].text == "coping"
-    assert doc2[0].lemma_ == "cope"
+        doc2 = nlp2.make_doc("coping")
+        doc2[0].pos_ = "VERB"
+        assert doc2[0].lemma_ == ""
+        doc2 = lemmatizer(doc2)
+        assert doc2[0].text == "coping"
+        assert doc2[0].lemma_ == "cope"
diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md
index f980756e5..27ea04432 100644
--- a/website/docs/api/lemmatizer.md
+++ b/website/docs/api/lemmatizer.md
@@ -48,12 +48,11 @@ data format used by the lookup and rule-based lemmatizers, see
 > nlp.add_pipe("lemmatizer", config=config)
 > ```
 
-| Setting     | Description |
-| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `mode`      | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ |
-| `lookups`   | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `None`. ~~Optional[Lookups]~~ |
-| `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~ |
-| `model`     | **Not yet implemented:** the model to use. ~~Model~~ |
+| Setting     | Description |
+| ----------- | --------------------------------------------------------------------------------- |
+| `mode`      | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ |
+| `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~ |
+| `model`     | **Not yet implemented:** the model to use. ~~Model~~ |
 
 ```python
 %%GITHUB_SPACY/spacy/pipeline/lemmatizer.py
@@ -76,15 +75,14 @@ Create a new pipeline instance. In your application, you would normally use a
 shortcut for this and instantiate the component using its string name and
 [`nlp.add_pipe`](/api/language#add_pipe).
 
-| Name           | Description |
-| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `vocab`        | The shared vocabulary. ~~Vocab~~ |
-| `model`        | **Not yet implemented:** The model to use. ~~Model~~ |
-| `name`         | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
-| _keyword-only_ | |
-| mode           | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ |
-| lookups        | A lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. Defaults to `None`. ~~Optional[Lookups]~~ |
-| overwrite      | Whether to overwrite existing lemmas. ~~bool~ |
+| Name           | Description |
+| -------------- | --------------------------------------------------------------------------------------------------- |
+| `vocab`        | The shared vocabulary. ~~Vocab~~ |
+| `model`        | **Not yet implemented:** The model to use. ~~Model~~ |
+| `name`         | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
+| _keyword-only_ | |
+| mode           | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ |
+| overwrite      | Whether to overwrite existing lemmas. ~~bool~ |
 
 ## Lemmatizer.\_\_call\_\_ {#call tag="method"}
 
@@ -127,6 +125,37 @@ applied to the `Doc` in order.
 | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**   | The processed documents in order. ~~Doc~~ |
 
+## Lemmatizer.initialize {#initialize tag="method"}
+
+Initialize the lemmatizer and load any data resources. This method is typically
+called by [`Language.initialize`](/api/language#initialize) and lets you
+customize arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config. The loading only happens during initialization, typically before
+training. At runtime, all data is loaded from disk.
+
+> #### Example
+>
+> ```python
+> lemmatizer = nlp.add_pipe("lemmatizer")
+> lemmatizer.initialize(lookups=lookups)
+> ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.lemmatizer]
+>
+> [initialize.components.lemmatizer.lookups]
+> @misc = "load_my_lookups.v1"
+> ```
+
+| Name           | Description |
+| -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Defaults to `None`. ~~Optional[Callable[[], Iterable[Example]]]~~ |
+| _keyword-only_ | |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
+| `lookups`      | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `None`. ~~Optional[Lookups]~~ |
+
 ## Lemmatizer.lookup_lemmatize {#lookup_lemmatize tag="method"}
 
 Lemmatize a token using a lookup-based approach. If no lemma is found, the
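
The net effect of the patch on usage: lookup tables are no longer passed to the lemmatizer factory via a `lookups` config setting but are loaded when the component is initialized, either from `spacy-lookups-data` or from a user-provided `Lookups` object. Below is a minimal sketch of the resulting workflow, pieced together from the tests and docs in the patch; the registry name and table contents are illustrative, not part of the patch:

```python
import spacy
from spacy import registry
from spacy.lookups import Lookups


@registry.misc("my_lemma_lookups.v1")  # illustrative name, not from the patch
def load_my_lookups() -> Lookups:
    lookups = Lookups()
    # "lookup" mode only requires the "lemma_lookup" table
    lookups.add_table("lemma_lookup", {"coping": "cope"})
    return lookups


nlp = spacy.blank("en")
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
assert not lemmatizer.lookups.tables  # the component starts out empty

# Option 1: wire the lookups through the [initialize.components] config block
nlp.config["initialize"]["components"]["lemmatizer"] = {
    "lookups": {"@misc": "my_lemma_lookups.v1"}
}
nlp.initialize()

# Option 2: call initialize() on the component directly
# lemmatizer.initialize(lookups=load_my_lookups())

assert nlp("coping")[0].lemma_ == "cope"
```

Running the pipeline before `initialize()` now raises a `ValueError` with `E1004`; the new `_validated` flag ensures the table check runs once per component rather than on every call.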