Make lemmatizers use initialize logic (#6182)

* Make lemmatizer use initialize logic and tidy up

* Fix typo

* Raise for uninitialized tables
Ines Montani 2020-10-02 15:42:36 +02:00 committed by GitHub
parent df06f7a792
commit f0b30aedad
16 changed files with 236 additions and 281 deletions
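
The user-facing effect of the change: the `lookups` setting disappears from the lemmatizer's config, and lookup tables are loaded when the pipeline is initialized rather than when the component is constructed. A minimal sketch of the new flow (assuming the v3 nightly API shown in this diff and an installed `spacy-lookups-data` package):

```python
import spacy

# The component starts out empty; tables are only loaded by nlp.initialize().
nlp = spacy.blank("en")
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
assert not lemmatizer.lookups.tables   # no tables yet
nlp.initialize()                       # loads the required/optional tables for the mode
print([token.lemma_ for token in nlp("coping")])
```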

spacy/errors.py

@@ -477,6 +477,8 @@ class Errors:
     E201 = ("Span index out of range.")
     # TODO: fix numbering after merging develop into master
+    E912 = ("Failed to initialize lemmatizer. Missing lemmatizer table(s) found "
+            "for mode '{mode}'. Required tables: {tables}. Found: {found}.")
     E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
             "config.cfg or override it on the CLI?")
     E914 = ("Executing {name} callback failed. Expected the function to "

@@ -556,10 +558,10 @@ class Errors:
     E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
     E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
             "component.")
-    E955 = ("Can't find table(s) '{table}' for language '{lang}' in "
-            "spacy-lookups-data. If you want to initialize a blank nlp object, "
-            "make sure you have the spacy-lookups-data package installed or "
-            "remove the [initialize.lookups] block from your config.")
+    E955 = ("Can't find table(s) {table} for language '{lang}' in "
+            "spacy-lookups-data. Make sure you have the package installed or "
+            "provide your own lookup tables if no default lookups are available "
+            "for your language.")
     E956 = ("Can't find component '{name}' in [components] block in the config. "
             "Available components: {opts}")
     E957 = ("Writing directly to Language.factories isn't needed anymore in "

@@ -685,9 +687,8 @@ class Errors:
     E1002 = ("Span index out of range.")
     E1003 = ("Unsupported lemmatizer mode '{mode}'.")
     E1004 = ("Missing lemmatizer table(s) found for lemmatizer mode '{mode}'. "
-             "Required tables '{tables}', found '{found}'. If you are not "
-             "providing custom lookups, make sure you have the package "
-             "spacy-lookups-data installed.")
+             "Required tables: {tables}. Found: {found}. Maybe you forgot to "
+             "call nlp.initialize() to load in the data?")
     E1005 = ("Unable to set attribute '{attr}' in tokenizer exception for "
              "'{chunk}'. Tokenizer exceptions are only allowed to specify "
              "`ORTH` and `NORM`.")
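
The split between the two messages mirrors where validation can fail: E1004 is raised at runtime when the pipe runs uninitialized, while E912 is the default used when tables are re-validated during deserialization (see `_validate_tables` in the lemmatizer diff below). A hedged sketch of the runtime path:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
try:
    nlp("lemmatize me")  # __call__ validates the tables on first use
except ValueError as err:
    print(err)  # E1004: ... "Maybe you forgot to call nlp.initialize() ..."
```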

spacy/lang/bn/__init__.py

@@ -4,7 +4,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer

@@ -24,18 +23,11 @@ class Bengali(Language):
 @Bengali.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["Bengali"]

spacy/lang/el/__init__.py

@@ -7,7 +7,6 @@ from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .lemmatizer import GreekLemmatizer
-from ...lookups import Lookups
 from ...language import Language

@@ -29,18 +28,11 @@ class Greek(Language):
 @Greek.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = GreekLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return GreekLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return GreekLemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["Greek"]

spacy/lang/en/__init__.py

@@ -8,7 +8,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_INFIXES
 from .lemmatizer import EnglishLemmatizer
 from ...language import Language
-from ...lookups import Lookups


 class EnglishDefaults(Language.Defaults):

@@ -27,18 +26,11 @@ class English(Language):
 @English.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = EnglishLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return EnglishLemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["English"]

spacy/lang/fa/__init__.py

@@ -6,7 +6,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_SUFFIXES
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer

@@ -27,18 +26,11 @@ class Persian(Language):
 @Persian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["Persian"]

spacy/lang/fr/__init__.py

@@ -9,7 +9,6 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .lemmatizer import FrenchLemmatizer
-from ...lookups import Lookups
 from ...language import Language

@@ -32,18 +31,11 @@ class French(Language):
 @French.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = FrenchLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return FrenchLemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["French"]

spacy/lang/nb/__init__.py

@@ -6,7 +6,6 @@ from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer

@@ -27,18 +26,11 @@ class Norwegian(Language):
 @Norwegian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["Norwegian"]

spacy/lang/nl/__init__.py

@@ -1,5 +1,4 @@
 from typing import Optional
 from thinc.api import Model
 from .stop_words import STOP_WORDS

@@ -8,7 +7,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .lemmatizer import DutchLemmatizer
-from ...lookups import Lookups
 from ...language import Language

@@ -29,18 +27,11 @@ class Dutch(Language):
 @Dutch.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = DutchLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return DutchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return DutchLemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["Dutch"]

spacy/lang/pl/__init__.py

@@ -34,18 +34,11 @@ class Polish(Language):
 @Polish.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pos_lookup", "lookups": None},
+    default_config={"model": None, "mode": "pos_lookup"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = PolishLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return PolishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return PolishLemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["Polish"]

spacy/lang/ru/__init__.py

@@ -6,7 +6,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import RussianLemmatizer
 from ...language import Language
-from ...lookups import Lookups


 class RussianDefaults(Language.Defaults):

@@ -23,17 +22,11 @@ class Russian(Language):
 @Russian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pymorphy2", "lookups": None},
+    default_config={"model": None, "mode": "pymorphy2"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    return RussianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return RussianLemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["Russian"]

spacy/lang/sv/__init__.py

@@ -5,7 +5,6 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer

@@ -30,18 +29,11 @@ class Swedish(Language):
 @Swedish.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["Swedish"]

spacy/lang/uk/__init__.py

@@ -7,7 +7,6 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import UkrainianLemmatizer
 from ...language import Language
-from ...lookups import Lookups


 class UkrainianDefaults(Language.Defaults):

@@ -24,17 +23,11 @@ class Ukrainian(Language):
 @Ukrainian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pymorphy2", "lookups": None},
+    default_config={"model": None, "mode": "pymorphy2"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode)


 __all__ = ["Ukrainian"]

spacy/pipeline/lemmatizer.py

@@ -1,26 +1,25 @@
-from typing import Optional, List, Dict, Any
+from typing import Optional, List, Dict, Any, Callable, Iterable, Iterator, Union
+from typing import Tuple
 from thinc.api import Model
+from pathlib import Path

 from .pipe import Pipe
 from ..errors import Errors
 from ..language import Language
+from ..training import Example
 from ..lookups import Lookups, load_lookups
 from ..scorer import Scorer
 from ..tokens import Doc, Token
 from ..vocab import Vocab
 from ..training import validate_examples
+from ..util import logger, SimpleFrozenList
 from .. import util


 @Language.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={
-        "model": None,
-        "mode": "lookup",
-        "lookups": None,
-        "overwrite": False,
-    },
+    default_config={"model": None, "mode": "lookup", "overwrite": False},
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
@@ -28,13 +27,9 @@ def make_lemmatizer(
     model: Optional[Model],
     name: str,
     mode: str,
-    lookups: Optional[Lookups],
     overwrite: bool = False,
 ):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(
-        nlp.vocab, model, name, mode=mode, lookups=lookups, overwrite=overwrite
-    )
+    return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)


 class Lemmatizer(Pipe):
@@ -46,59 +41,19 @@ class Lemmatizer(Pipe):
     """

     @classmethod
-    def get_lookups_config(cls, mode: str) -> Dict:
+    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
         """Returns the lookups configuration settings for a given mode for use
         in Lemmatizer.load_lookups.

         mode (str): The lemmatizer mode.
-        RETURNS (dict): The lookups configuration settings for this mode.
+        RETURNS (Tuple[List[str], List[str]]): The required and optional
+            lookup tables for this mode.
+
+        DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config
         """
         if mode == "lookup":
-            return {
-                "required_tables": ["lemma_lookup"],
-            }
+            return (["lemma_lookup"], [])
         elif mode == "rule":
-            return {
-                "required_tables": ["lemma_rules"],
-                "optional_tables": ["lemma_exc", "lemma_index"],
-            }
-        return {}
+            return (["lemma_rules"], ["lemma_exc", "lemma_index"])
+        return ([], [])

-    @classmethod
-    def load_lookups(cls, lang: str, mode: str, lookups: Optional[Lookups]) -> Lookups:
-        """Load and validate lookups tables. If the provided lookups is None,
-        load the default lookups tables according to the language and mode
-        settings. Confirm that all required tables for the language and mode
-        are present.
-
-        lang (str): The language code.
-        mode (str): The lemmatizer mode.
-        lookups (Lookups): The provided lookups, may be None if the default
-            lookups should be loaded.
-        RETURNS (Lookups): The Lookups object.
-
-        DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config
-        """
-        config = cls.get_lookups_config(mode)
-        required_tables = config.get("required_tables", [])
-        optional_tables = config.get("optional_tables", [])
-        if lookups is None:
-            lookups = load_lookups(lang=lang, tables=required_tables)
-            optional_lookups = load_lookups(
-                lang=lang, tables=optional_tables, strict=False
-            )
-            for table in optional_lookups.tables:
-                lookups.set_table(table, optional_lookups.get_table(table))
-        for table in required_tables:
-            if table not in lookups:
-                raise ValueError(
-                    Errors.E1004.format(
-                        mode=mode, tables=required_tables, found=lookups.tables
-                    )
-                )
-        return lookups

     def __init__(
         self,
@@ -107,7 +62,6 @@ class Lemmatizer(Pipe):
         name: str = "lemmatizer",
         *,
         mode: str = "lookup",
-        lookups: Optional[Lookups] = None,
         overwrite: bool = False,
     ) -> None:
         """Initialize a Lemmatizer.

@@ -116,9 +70,6 @@ class Lemmatizer(Pipe):
         model (Model): A model (not yet implemented).
         name (str): The component name. Defaults to "lemmatizer".
         mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup".
-        lookups (Lookups): The lookups object containing the (optional) tables
-            such as "lemma_rules", "lemma_index", "lemma_exc" and
-            "lemma_lookup". Defaults to None
         overwrite (bool): Whether to overwrite existing lemmas. Defaults to
             `False`.
@@ -128,8 +79,9 @@ class Lemmatizer(Pipe):
         self.model = model
         self.name = name
         self._mode = mode
-        self.lookups = lookups if lookups is not None else Lookups()
+        self.lookups = Lookups()
         self.overwrite = overwrite
+        self._validated = False
         if self.mode == "lookup":
             self.lemmatize = self.lookup_lemmatize
         elif self.mode == "rule":
@@ -153,12 +105,56 @@ class Lemmatizer(Pipe):
         DOCS: https://nightly.spacy.io/api/lemmatizer#call
         """
+        if not self._validated:
+            self._validate_tables(Errors.E1004)
         for token in doc:
             if self.overwrite or token.lemma == 0:
                 token.lemma_ = self.lemmatize(token)[0]
         return doc

-    def pipe(self, stream, *, batch_size=128):
+    def initialize(
+        self,
+        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
+        *,
+        nlp: Optional[Language] = None,
+        lookups: Optional[Lookups] = None,
+    ):
+        """Initialize the lemmatizer and load in data.
+
+        get_examples (Callable[[], Iterable[Example]]): Function that
+            returns a representative sample of gold-standard Example objects.
+        nlp (Language): The current nlp object the component is part of.
+        lookups (Lookups): The lookups object containing the (optional) tables
+            such as "lemma_rules", "lemma_index", "lemma_exc" and
+            "lemma_lookup". Defaults to None.
+        """
+        required_tables, optional_tables = self.get_lookups_config(self.mode)
+        if lookups is None:
+            logger.debug("Lemmatizer: loading tables from spacy-lookups-data")
+            lookups = load_lookups(lang=self.vocab.lang, tables=required_tables)
+            optional_lookups = load_lookups(
+                lang=self.vocab.lang, tables=optional_tables, strict=False
+            )
+            for table in optional_lookups.tables:
+                lookups.set_table(table, optional_lookups.get_table(table))
+        self.lookups = lookups
+        self._validate_tables(Errors.E1004)
+
+    def _validate_tables(self, error_message: str = Errors.E912) -> None:
+        """Check that the lookups are correct for the current mode."""
+        required_tables, optional_tables = self.get_lookups_config(self.mode)
+        for table in required_tables:
+            if table not in self.lookups:
+                raise ValueError(
+                    error_message.format(
+                        mode=self.mode,
+                        tables=required_tables,
+                        found=self.lookups.tables,
+                    )
+                )
+        self._validated = True
+
+    def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
         """Apply the pipe to a stream of documents. This usually happens under
         the hood when the nlp object is called on a text and all components are
         applied to the Doc.
@@ -263,7 +259,7 @@ class Lemmatizer(Pipe):
         """
         return False

-    def score(self, examples, **kwargs) -> Dict[str, Any]:
+    def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
         """Score a batch of examples.

         examples (Iterable[Example]): The examples to score.
@@ -274,58 +270,66 @@ class Lemmatizer(Pipe):
         validate_examples(examples, "Lemmatizer.score")
         return Scorer.score_token_attr(examples, "lemma", **kwargs)

-    def to_disk(self, path, *, exclude=tuple()):
-        """Save the current state to a directory.
+    def to_disk(
+        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
+    ):
+        """Serialize the pipe to disk.

-        path (unicode or Path): A path to a directory, which will be created if
-            it doesn't exist.
-        exclude (list): String names of serialization fields to exclude.
+        path (str / Path): Path to a directory.
+        exclude (Iterable[str]): String names of serialization fields to exclude.

-        DOCS: https://nightly.spacy.io/api/vocab#to_disk
+        DOCS: https://nightly.spacy.io/api/lemmatizer#to_disk
         """
         serialize = {}
         serialize["vocab"] = lambda p: self.vocab.to_disk(p)
         serialize["lookups"] = lambda p: self.lookups.to_disk(p)
         util.to_disk(path, serialize, exclude)

-    def from_disk(self, path, *, exclude=tuple()):
-        """Loads state from a directory. Modifies the object in place and
-        returns it.
+    def from_disk(
+        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
+    ) -> "Lemmatizer":
+        """Load the pipe from disk. Modifies the object in place and returns it.

-        path (unicode or Path): A path to a directory.
-        exclude (list): String names of serialization fields to exclude.
-        RETURNS (Vocab): The modified `Vocab` object.
+        path (str / Path): Path to a directory.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (Lemmatizer): The modified Lemmatizer object.

-        DOCS: https://nightly.spacy.io/api/vocab#to_disk
+        DOCS: https://nightly.spacy.io/api/lemmatizer#from_disk
         """
         deserialize = {}
         deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
         deserialize["lookups"] = lambda p: self.lookups.from_disk(p)
         util.from_disk(path, deserialize, exclude)
+        self._validate_tables()
+        return self

-    def to_bytes(self, *, exclude=tuple()) -> bytes:
-        """Serialize the current state to a binary string.
+    def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
+        """Serialize the pipe to a bytestring.

-        exclude (list): String names of serialization fields to exclude.
-        RETURNS (bytes): The serialized form of the `Vocab` object.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (bytes): The serialized object.

-        DOCS: https://nightly.spacy.io/api/vocab#to_bytes
+        DOCS: https://nightly.spacy.io/api/lemmatizer#to_bytes
         """
         serialize = {}
         serialize["vocab"] = self.vocab.to_bytes
         serialize["lookups"] = self.lookups.to_bytes
         return util.to_bytes(serialize, exclude)

-    def from_bytes(self, bytes_data: bytes, *, exclude=tuple()):
-        """Load state from a binary string.
+    def from_bytes(
+        self, bytes_data: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
+    ) -> "Lemmatizer":
+        """Load the pipe from a bytestring.

-        bytes_data (bytes): The data to load from.
-        exclude (list): String names of serialization fields to exclude.
-        RETURNS (Vocab): The `Vocab` object.
+        bytes_data (bytes): The serialized pipe.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (Lemmatizer): The loaded Lemmatizer.

-        DOCS: https://nightly.spacy.io/api/vocab#from_bytes
+        DOCS: https://nightly.spacy.io/api/lemmatizer#from_bytes
         """
         deserialize = {}
         deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
         deserialize["lookups"] = lambda b: self.lookups.from_bytes(b)
         util.from_bytes(bytes_data, deserialize, exclude)
+        self._validate_tables()
+        return self
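
Note the new serialization behavior above: `from_disk` and `from_bytes` call `_validate_tables()` (with the default E912 message) after loading, so a deserialized pipe is usable without a separate initialization step. A sketch of the round trip:

```python
from spacy.lang.en import English
from spacy.lookups import Lookups

nlp = English()
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
lookups = Lookups()
lookups.add_table("lemma_lookup", {"coping": "cope"})
lemmatizer.initialize(lookups=lookups)

# The tables travel with the serialized pipe and are re-validated on load.
nlp2 = English()
lemmatizer2 = nlp2.add_pipe("lemmatizer", config={"mode": "lookup"})
lemmatizer2.from_bytes(lemmatizer.to_bytes())
assert "lemma_lookup" in lemmatizer2.lookups
```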

spacy/tests/lang/test_lemmatizers.py

@@ -17,16 +17,31 @@ def test_lemmatizer_initialize(lang, capfd):
     @registry.misc("lemmatizer_init_lookups")
     def lemmatizer_init_lookups():
         lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
+        lookups.add_table("lemma_lookup", {"cope": "cope", "x": "y"})
         lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
         lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
         lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
         return lookups

-    """Test that languages can be initialized."""
+    # Test that languages can be initialized
     nlp = get_lang_class(lang)()
-    nlp.add_pipe("lemmatizer", config={"lookups": {"@misc": "lemmatizer_init_lookups"}})
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
+    assert not lemmatizer.lookups.tables
+    nlp.config["initialize"]["components"]["lemmatizer"] = {
+        "lookups": {"@misc": "lemmatizer_init_lookups"}
+    }
+    with pytest.raises(ValueError):
+        nlp("x")
+    nlp.initialize()
+    assert lemmatizer.lookups.tables
+    doc = nlp("x")
     # Check for stray print statements (see #3342)
-    doc = nlp("test")  # noqa: F841
     captured = capfd.readouterr()
     assert not captured.out
+    assert doc[0].lemma_ == "y"
+
+    # Test initialization by calling .initialize() directly
+    nlp = get_lang_class(lang)()
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
+    lemmatizer.initialize(lookups=lemmatizer_init_lookups())
+    assert nlp("x")[0].lemma_ == "y"

spacy/tests/pipeline/test_lemmatizer.py

@@ -8,61 +8,52 @@ from ..util import make_tempdir

 @pytest.fixture
 def nlp():
-    return English()
-
-
-@pytest.fixture
-def lemmatizer(nlp):
     @registry.misc("cope_lookups")
     def cope_lookups():
         lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
+        lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"})
         lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
         lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
         lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
         return lookups

-    lemmatizer = nlp.add_pipe(
-        "lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
-    )
-    return lemmatizer
+    nlp = English()
+    nlp.config["initialize"]["components"]["lemmatizer"] = {
+        "lookups": {"@misc": "cope_lookups"}
+    }
+    return nlp


 def test_lemmatizer_init(nlp):
-    @registry.misc("cope_lookups")
-    def cope_lookups():
-        lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
-        lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
-        lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
-        lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
-        return lookups
-
-    lemmatizer = nlp.add_pipe(
-        "lemmatizer", config={"mode": "lookup", "lookups": {"@misc": "cope_lookups"}}
-    )
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
     assert isinstance(lemmatizer.lookups, Lookups)
+    assert not lemmatizer.lookups.tables
     assert lemmatizer.mode == "lookup"
+    with pytest.raises(ValueError):
+        nlp("test")
+    nlp.initialize()
+    assert lemmatizer.lookups.tables
+    assert nlp("cope")[0].lemma_ == "cope"
+    assert nlp("coped")[0].lemma_ == "cope"
     # replace any tables from spacy-lookups-data
     lemmatizer.lookups = Lookups()
-    doc = nlp("coping")
     # lookup with no tables sets text as lemma
-    assert doc[0].lemma_ == "coping"
+    assert nlp("cope")[0].lemma_ == "cope"
+    assert nlp("coped")[0].lemma_ == "coped"
     nlp.remove_pipe("lemmatizer")
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
-
-    @registry.misc("empty_lookups")
-    def empty_lookups():
-        return Lookups()
-
     with pytest.raises(ValueError):
-        nlp.add_pipe(
-            "lemmatizer",
-            config={"mode": "lookup", "lookups": {"@misc": "empty_lookups"}},
-        )
+        # Can't initialize without required tables
+        lemmatizer.initialize(lookups=Lookups())
+    lookups = Lookups()
+    lookups.add_table("lemma_lookup", {})
+    lemmatizer.initialize(lookups=lookups)


-def test_lemmatizer_config(nlp, lemmatizer):
+def test_lemmatizer_config(nlp):
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
+    nlp.initialize()
     doc = nlp.make_doc("coping")
     doc[0].pos_ = "VERB"
     assert doc[0].lemma_ == ""

@@ -78,20 +69,21 @@ def test_lemmatizer_config(nlp, lemmatizer):
     assert doc[0].lemma_ == "cope"


-def test_lemmatizer_serialize(nlp, lemmatizer):
-    @registry.misc("cope_lookups")
+def test_lemmatizer_serialize(nlp):
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
+    nlp.initialize()
+
     def cope_lookups():
         lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
+        lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"})
         lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
         lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
         lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
         return lookups

     nlp2 = English()
-    lemmatizer2 = nlp2.add_pipe(
-        "lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
-    )
+    lemmatizer2 = nlp2.add_pipe("lemmatizer", config={"mode": "rule"})
+    lemmatizer2.initialize(lookups=cope_lookups())
     lemmatizer2.from_bytes(lemmatizer.to_bytes())
     assert lemmatizer.to_bytes() == lemmatizer2.to_bytes()
     assert lemmatizer.lookups.tables == lemmatizer2.lookups.tables

@@ -100,9 +92,9 @@ def test_lemmatizer_serialize(nlp, lemmatizer):
     with make_tempdir() as tmp_dir:
         nlp.to_disk(tmp_dir)
         nlp2 = util.load_model_from_path(tmp_dir)
         doc2 = nlp2.make_doc("coping")
         doc2[0].pos_ = "VERB"
         assert doc2[0].lemma_ == ""
         doc2 = lemmatizer(doc2)
         assert doc2[0].text == "coping"
         assert doc2[0].lemma_ == "cope"

website/docs/api/lemmatizer.md

@@ -48,12 +48,11 @@ data format used by the lookup and rule-based lemmatizers, see
 > nlp.add_pipe("lemmatizer", config=config)
 > ```

 | Setting     | Description                                                                       |
 | ----------- | --------------------------------------------------------------------------------- |
 | `mode`      | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~  |
-| `lookups`   | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `None`. ~~Optional[Lookups]~~ |
 | `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~                |
 | `model`     | **Not yet implemented:** the model to use. ~~Model~~                               |

 ```python
 %%GITHUB_SPACY/spacy/pipeline/lemmatizer.py
 ```

@@ -76,15 +75,14 @@ Create a new pipeline instance. In your application, you would normally use a
 shortcut for this and instantiate the component using its string name and
 [`nlp.add_pipe`](/api/language#add_pipe).

 | Name           | Description                                                                                          |
 | -------------- | ---------------------------------------------------------------------------------------------------- |
 | `vocab`        | The shared vocabulary. ~~Vocab~~                                                                     |
 | `model`        | **Not yet implemented:** The model to use. ~~Model~~                                                 |
 | `name`         | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
 | _keyword-only_ |                                                                                                      |
 | `mode`         | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~                    |
-| `lookups`      | A lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. Defaults to `None`. ~~Optional[Lookups]~~ |
 | `overwrite`    | Whether to overwrite existing lemmas. ~~bool~~                                                       |

 ## Lemmatizer.\_\_call\_\_ {#call tag="method"}

@@ -127,6 +125,37 @@ applied to the `Doc` in order.
 | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**   | The processed documents in order. ~~Doc~~                     |

+## Lemmatizer.initialize {#initialize tag="method"}
+
+Initialize the lemmatizer and load any data resources. This method is typically
+called by [`Language.initialize`](/api/language#initialize) and lets you
+customize arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config. The loading only happens during initialization, typically before
+training. At runtime, all data is loaded from disk.
+
+> #### Example
+>
+> ```python
+> lemmatizer = nlp.add_pipe("lemmatizer")
+> lemmatizer.initialize(lookups=lookups)
+> ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.lemmatizer]
+>
+> [initialize.components.lemmatizer.lookups]
+> @misc = "load_my_lookups.v1"
+> ```
+
+| Name           | Description |
+| -------------- | ----------- |
+| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Defaults to `None`. ~~Optional[Callable[[], Iterable[Example]]]~~ |
+| _keyword-only_ |             |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
+| `lookups`      | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `None`. ~~Optional[Lookups]~~ |
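
The `@misc = "load_my_lookups.v1"` reference in the config example above points at a registered function; its Python side might look like the following sketch (the registered name and table contents are placeholders):

```python
from spacy.lookups import Lookups
from spacy.util import registry

@registry.misc("load_my_lookups.v1")
def load_my_lookups() -> Lookups:
    lookups = Lookups()
    # "lemma_lookup" is required for "lookup" mode; "lemma_rules" for "rule" mode.
    lookups.add_table("lemma_lookup", {"coping": "cope"})
    return lookups
```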
## Lemmatizer.lookup_lemmatize {#lookup_lemmatize tag="method"}

Lemmatize a token using a lookup-based approach. If no lemma is found, the