mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
Make lemmatizers use initialize logic (#6182)
* Make lemmatizer use initialize logic and tidy up * Fix typo * Raise for uninitialized tables
This commit is contained in:
parent
df06f7a792
commit
f0b30aedad
|
@ -477,6 +477,8 @@ class Errors:
|
|||
E201 = ("Span index out of range.")
|
||||
|
||||
# TODO: fix numbering after merging develop into master
|
||||
E912 = ("Failed to initialize lemmatizer. Missing lemmatizer table(s) found "
|
||||
"for mode '{mode}'. Required tables: {tables}. Found: {found}.")
|
||||
E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
|
||||
"config.cfg or override it on the CLI?")
|
||||
E914 = ("Executing {name} callback failed. Expected the function to "
|
||||
|
@ -556,10 +558,10 @@ class Errors:
|
|||
E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
|
||||
E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
|
||||
"component.")
|
||||
E955 = ("Can't find table(s) '{table}' for language '{lang}' in "
|
||||
"spacy-lookups-data. If you want to initialize a blank nlp object, "
|
||||
"make sure you have the spacy-lookups-data package installed or "
|
||||
"remove the [initialize.lookups] block from your config.")
|
||||
E955 = ("Can't find table(s) {table} for language '{lang}' in "
|
||||
"spacy-lookups-data. Make sure you have the package installed or "
|
||||
"provide your own lookup tables if no default lookups are available "
|
||||
"for your language.")
|
||||
E956 = ("Can't find component '{name}' in [components] block in the config. "
|
||||
"Available components: {opts}")
|
||||
E957 = ("Writing directly to Language.factories isn't needed anymore in "
|
||||
|
@ -685,9 +687,8 @@ class Errors:
|
|||
E1002 = ("Span index out of range.")
|
||||
E1003 = ("Unsupported lemmatizer mode '{mode}'.")
|
||||
E1004 = ("Missing lemmatizer table(s) found for lemmatizer mode '{mode}'. "
|
||||
"Required tables '{tables}', found '{found}'. If you are not "
|
||||
"providing custom lookups, make sure you have the package "
|
||||
"spacy-lookups-data installed.")
|
||||
"Required tables: {tables}. Found: {found}. Maybe you forgot to "
|
||||
"call nlp.initialize() to load in the data?")
|
||||
E1005 = ("Unable to set attribute '{attr}' in tokenizer exception for "
|
||||
"'{chunk}'. Tokenizer exceptions are only allowed to specify "
|
||||
"`ORTH` and `NORM`.")
|
||||
|
|
|
@ -4,7 +4,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
from .stop_words import STOP_WORDS
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
from ...pipeline import Lemmatizer
|
||||
|
||||
|
||||
|
@ -24,18 +23,11 @@ class Bengali(Language):
|
|||
@Bengali.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "lookups": None},
|
||||
default_config={"model": None, "mode": "rule"},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
lookups: Optional[Lookups],
|
||||
):
|
||||
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
|
||||
return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
|
||||
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
|
||||
return Lemmatizer(nlp.vocab, model, name, mode=mode)
|
||||
|
||||
|
||||
__all__ = ["Bengali"]
|
||||
|
|
|
@ -7,7 +7,6 @@ from .lex_attrs import LEX_ATTRS
|
|||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
||||
from .lemmatizer import GreekLemmatizer
|
||||
from ...lookups import Lookups
|
||||
from ...language import Language
|
||||
|
||||
|
||||
|
@ -29,18 +28,11 @@ class Greek(Language):
|
|||
@Greek.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "lookups": None},
|
||||
default_config={"model": None, "mode": "rule"},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
lookups: Optional[Lookups],
|
||||
):
|
||||
lookups = GreekLemmatizer.load_lookups(nlp.lang, mode, lookups)
|
||||
return GreekLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
|
||||
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
|
||||
return GreekLemmatizer(nlp.vocab, model, name, mode=mode)
|
||||
|
||||
|
||||
__all__ = ["Greek"]
|
||||
|
|
|
@ -8,7 +8,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
|
|||
from .punctuation import TOKENIZER_INFIXES
|
||||
from .lemmatizer import EnglishLemmatizer
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
|
||||
|
||||
class EnglishDefaults(Language.Defaults):
|
||||
|
@ -27,18 +26,11 @@ class English(Language):
|
|||
@English.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "lookups": None},
|
||||
default_config={"model": None, "mode": "rule"},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
lookups: Optional[Lookups],
|
||||
):
|
||||
lookups = EnglishLemmatizer.load_lookups(nlp.lang, mode, lookups)
|
||||
return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
|
||||
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
|
||||
return EnglishLemmatizer(nlp.vocab, model, name, mode=mode)
|
||||
|
||||
|
||||
__all__ = ["English"]
|
||||
|
|
|
@ -6,7 +6,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
from ...pipeline import Lemmatizer
|
||||
|
||||
|
||||
|
@ -27,18 +26,11 @@ class Persian(Language):
|
|||
@Persian.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "lookups": None},
|
||||
default_config={"model": None, "mode": "rule"},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
lookups: Optional[Lookups],
|
||||
):
|
||||
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
|
||||
return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
|
||||
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
|
||||
return Lemmatizer(nlp.vocab, model, name, mode=mode)
|
||||
|
||||
|
||||
__all__ = ["Persian"]
|
||||
|
|
|
@ -9,7 +9,6 @@ from .stop_words import STOP_WORDS
|
|||
from .lex_attrs import LEX_ATTRS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from .lemmatizer import FrenchLemmatizer
|
||||
from ...lookups import Lookups
|
||||
from ...language import Language
|
||||
|
||||
|
||||
|
@ -32,18 +31,11 @@ class French(Language):
|
|||
@French.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "lookups": None},
|
||||
default_config={"model": None, "mode": "rule"},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
lookups: Optional[Lookups],
|
||||
):
|
||||
lookups = FrenchLemmatizer.load_lookups(nlp.lang, mode, lookups)
|
||||
return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
|
||||
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
|
||||
return FrenchLemmatizer(nlp.vocab, model, name, mode=mode)
|
||||
|
||||
|
||||
__all__ = ["French"]
|
||||
|
|
|
@ -6,7 +6,6 @@ from .punctuation import TOKENIZER_SUFFIXES
|
|||
from .stop_words import STOP_WORDS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
from ...pipeline import Lemmatizer
|
||||
|
||||
|
||||
|
@ -27,18 +26,11 @@ class Norwegian(Language):
|
|||
@Norwegian.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "lookups": None},
|
||||
default_config={"model": None, "mode": "rule"},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
lookups: Optional[Lookups],
|
||||
):
|
||||
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
|
||||
return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
|
||||
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
|
||||
return Lemmatizer(nlp.vocab, model, name, mode=mode)
|
||||
|
||||
|
||||
__all__ = ["Norwegian"]
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
from typing import Optional
|
||||
|
||||
from thinc.api import Model
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
|
@ -8,7 +7,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|||
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
|
||||
from .punctuation import TOKENIZER_SUFFIXES
|
||||
from .lemmatizer import DutchLemmatizer
|
||||
from ...lookups import Lookups
|
||||
from ...language import Language
|
||||
|
||||
|
||||
|
@ -29,18 +27,11 @@ class Dutch(Language):
|
|||
@Dutch.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "lookups": None},
|
||||
default_config={"model": None, "mode": "rule"},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
lookups: Optional[Lookups],
|
||||
):
|
||||
lookups = DutchLemmatizer.load_lookups(nlp.lang, mode, lookups)
|
||||
return DutchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
|
||||
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
|
||||
return DutchLemmatizer(nlp.vocab, model, name, mode=mode)
|
||||
|
||||
|
||||
__all__ = ["Dutch"]
|
||||
|
|
|
@ -34,18 +34,11 @@ class Polish(Language):
|
|||
@Polish.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "pos_lookup", "lookups": None},
|
||||
default_config={"model": None, "mode": "pos_lookup"},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
lookups: Optional[Lookups],
|
||||
):
|
||||
lookups = PolishLemmatizer.load_lookups(nlp.lang, mode, lookups)
|
||||
return PolishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
|
||||
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
|
||||
return PolishLemmatizer(nlp.vocab, model, name, mode=mode)
|
||||
|
||||
|
||||
__all__ = ["Polish"]
|
||||
|
|
|
@ -6,7 +6,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
|||
from .lex_attrs import LEX_ATTRS
|
||||
from .lemmatizer import RussianLemmatizer
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
|
||||
|
||||
class RussianDefaults(Language.Defaults):
|
||||
|
@ -23,17 +22,11 @@ class Russian(Language):
|
|||
@Russian.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "pymorphy2", "lookups": None},
|
||||
default_config={"model": None, "mode": "pymorphy2"},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
lookups: Optional[Lookups],
|
||||
):
|
||||
return RussianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
|
||||
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
|
||||
return RussianLemmatizer(nlp.vocab, model, name, mode=mode)
|
||||
|
||||
|
||||
__all__ = ["Russian"]
|
||||
|
|
|
@ -5,7 +5,6 @@ from .stop_words import STOP_WORDS
|
|||
from .lex_attrs import LEX_ATTRS
|
||||
from .syntax_iterators import SYNTAX_ITERATORS
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
from ...pipeline import Lemmatizer
|
||||
|
||||
|
||||
|
@ -30,18 +29,11 @@ class Swedish(Language):
|
|||
@Swedish.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "rule", "lookups": None},
|
||||
default_config={"model": None, "mode": "rule"},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
lookups: Optional[Lookups],
|
||||
):
|
||||
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
|
||||
return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
|
||||
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
|
||||
return Lemmatizer(nlp.vocab, model, name, mode=mode)
|
||||
|
||||
|
||||
__all__ = ["Swedish"]
|
||||
|
|
|
@ -7,7 +7,6 @@ from .stop_words import STOP_WORDS
|
|||
from .lex_attrs import LEX_ATTRS
|
||||
from .lemmatizer import UkrainianLemmatizer
|
||||
from ...language import Language
|
||||
from ...lookups import Lookups
|
||||
|
||||
|
||||
class UkrainianDefaults(Language.Defaults):
|
||||
|
@ -24,17 +23,11 @@ class Ukrainian(Language):
|
|||
@Ukrainian.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={"model": None, "mode": "pymorphy2", "lookups": None},
|
||||
default_config={"model": None, "mode": "pymorphy2"},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
nlp: Language,
|
||||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
lookups: Optional[Lookups],
|
||||
):
|
||||
return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
|
||||
def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
|
||||
return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode)
|
||||
|
||||
|
||||
__all__ = ["Ukrainian"]
|
||||
|
|
|
@ -1,26 +1,25 @@
|
|||
from typing import Optional, List, Dict, Any
|
||||
from typing import Optional, List, Dict, Any, Callable, Iterable, Iterator, Union
|
||||
from typing import Tuple
|
||||
from thinc.api import Model
|
||||
from pathlib import Path
|
||||
|
||||
from .pipe import Pipe
|
||||
from ..errors import Errors
|
||||
from ..language import Language
|
||||
from ..training import Example
|
||||
from ..lookups import Lookups, load_lookups
|
||||
from ..scorer import Scorer
|
||||
from ..tokens import Doc, Token
|
||||
from ..vocab import Vocab
|
||||
from ..training import validate_examples
|
||||
from ..util import logger, SimpleFrozenList
|
||||
from .. import util
|
||||
|
||||
|
||||
@Language.factory(
|
||||
"lemmatizer",
|
||||
assigns=["token.lemma"],
|
||||
default_config={
|
||||
"model": None,
|
||||
"mode": "lookup",
|
||||
"lookups": None,
|
||||
"overwrite": False,
|
||||
},
|
||||
default_config={"model": None, "mode": "lookup", "overwrite": False},
|
||||
default_score_weights={"lemma_acc": 1.0},
|
||||
)
|
||||
def make_lemmatizer(
|
||||
|
@ -28,13 +27,9 @@ def make_lemmatizer(
|
|||
model: Optional[Model],
|
||||
name: str,
|
||||
mode: str,
|
||||
lookups: Optional[Lookups],
|
||||
overwrite: bool = False,
|
||||
):
|
||||
lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
|
||||
return Lemmatizer(
|
||||
nlp.vocab, model, name, mode=mode, lookups=lookups, overwrite=overwrite
|
||||
)
|
||||
return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
|
||||
|
||||
|
||||
class Lemmatizer(Pipe):
|
||||
|
@ -46,59 +41,19 @@ class Lemmatizer(Pipe):
|
|||
"""
|
||||
|
||||
@classmethod
|
||||
def get_lookups_config(cls, mode: str) -> Dict:
|
||||
def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
|
||||
"""Returns the lookups configuration settings for a given mode for use
|
||||
in Lemmatizer.initialize.
|
||||
|
||||
mode (str): The lemmatizer mode.
|
||||
RETURNS (dict): The lookups configuration settings for this mode.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config
|
||||
RETURNS (Tuple[List[str], List[str]]): The required and optional
|
||||
lookup tables for this mode.
|
||||
"""
|
||||
if mode == "lookup":
|
||||
return {
|
||||
"required_tables": ["lemma_lookup"],
|
||||
}
|
||||
return (["lemma_lookup"], [])
|
||||
elif mode == "rule":
|
||||
return {
|
||||
"required_tables": ["lemma_rules"],
|
||||
"optional_tables": ["lemma_exc", "lemma_index"],
|
||||
}
|
||||
return {}
|
||||
|
||||
@classmethod
|
||||
def load_lookups(cls, lang: str, mode: str, lookups: Optional[Lookups]) -> Lookups:
|
||||
"""Load and validate lookups tables. If the provided lookups is None,
|
||||
load the default lookups tables according to the language and mode
|
||||
settings. Confirm that all required tables for the language and mode
|
||||
are present.
|
||||
|
||||
lang (str): The language code.
|
||||
mode (str): The lemmatizer mode.
|
||||
lookups (Lookups): The provided lookups, may be None if the default
|
||||
lookups should be loaded.
|
||||
RETURNS (Lookups): The Lookups object.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config
|
||||
"""
|
||||
config = cls.get_lookups_config(mode)
|
||||
required_tables = config.get("required_tables", [])
|
||||
optional_tables = config.get("optional_tables", [])
|
||||
if lookups is None:
|
||||
lookups = load_lookups(lang=lang, tables=required_tables)
|
||||
optional_lookups = load_lookups(
|
||||
lang=lang, tables=optional_tables, strict=False
|
||||
)
|
||||
for table in optional_lookups.tables:
|
||||
lookups.set_table(table, optional_lookups.get_table(table))
|
||||
for table in required_tables:
|
||||
if table not in lookups:
|
||||
raise ValueError(
|
||||
Errors.E1004.format(
|
||||
mode=mode, tables=required_tables, found=lookups.tables
|
||||
)
|
||||
)
|
||||
return lookups
|
||||
return (["lemma_rules"], ["lemma_exc", "lemma_index"])
|
||||
return ([], [])
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -107,7 +62,6 @@ class Lemmatizer(Pipe):
|
|||
name: str = "lemmatizer",
|
||||
*,
|
||||
mode: str = "lookup",
|
||||
lookups: Optional[Lookups] = None,
|
||||
overwrite: bool = False,
|
||||
) -> None:
|
||||
"""Initialize a Lemmatizer.
|
||||
|
@ -116,9 +70,6 @@ class Lemmatizer(Pipe):
|
|||
model (Model): A model (not yet implemented).
|
||||
name (str): The component name. Defaults to "lemmatizer".
|
||||
mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup".
|
||||
lookups (Lookups): The lookups object containing the (optional) tables
|
||||
such as "lemma_rules", "lemma_index", "lemma_exc" and
|
||||
"lemma_lookup". Defaults to None
|
||||
overwrite (bool): Whether to overwrite existing lemmas. Defaults to
|
||||
`False`.
|
||||
|
||||
|
@ -128,8 +79,9 @@ class Lemmatizer(Pipe):
|
|||
self.model = model
|
||||
self.name = name
|
||||
self._mode = mode
|
||||
self.lookups = lookups if lookups is not None else Lookups()
|
||||
self.lookups = Lookups()
|
||||
self.overwrite = overwrite
|
||||
self._validated = False
|
||||
if self.mode == "lookup":
|
||||
self.lemmatize = self.lookup_lemmatize
|
||||
elif self.mode == "rule":
|
||||
|
@ -153,12 +105,56 @@ class Lemmatizer(Pipe):
|
|||
|
||||
DOCS: https://nightly.spacy.io/api/lemmatizer#call
|
||||
"""
|
||||
if not self._validated:
|
||||
self._validate_tables(Errors.E1004)
|
||||
for token in doc:
|
||||
if self.overwrite or token.lemma == 0:
|
||||
token.lemma_ = self.lemmatize(token)[0]
|
||||
return doc
|
||||
|
||||
def pipe(self, stream, *, batch_size=128):
|
||||
def initialize(
|
||||
self,
|
||||
get_examples: Optional[Callable[[], Iterable[Example]]] = None,
|
||||
*,
|
||||
nlp: Optional[Language] = None,
|
||||
lookups: Optional[Lookups] = None,
|
||||
):
|
||||
"""Initialize the lemmatizer and load in data.
|
||||
|
||||
get_examples (Callable[[], Iterable[Example]]): Function that
|
||||
returns a representative sample of gold-standard Example objects.
|
||||
nlp (Language): The current nlp object the component is part of.
|
||||
lookups (Lookups): The lookups object containing the (optional) tables
|
||||
such as "lemma_rules", "lemma_index", "lemma_exc" and
|
||||
"lemma_lookup". Defaults to None.
|
||||
"""
|
||||
required_tables, optional_tables = self.get_lookups_config(self.mode)
|
||||
if lookups is None:
|
||||
logger.debug("Lemmatizer: loading tables from spacy-lookups-data")
|
||||
lookups = load_lookups(lang=self.vocab.lang, tables=required_tables)
|
||||
optional_lookups = load_lookups(
|
||||
lang=self.vocab.lang, tables=optional_tables, strict=False
|
||||
)
|
||||
for table in optional_lookups.tables:
|
||||
lookups.set_table(table, optional_lookups.get_table(table))
|
||||
self.lookups = lookups
|
||||
self._validate_tables(Errors.E1004)
|
||||
|
||||
def _validate_tables(self, error_message: str = Errors.E912) -> None:
|
||||
"""Check that the lookups are correct for the current mode."""
|
||||
required_tables, optional_tables = self.get_lookups_config(self.mode)
|
||||
for table in required_tables:
|
||||
if table not in self.lookups:
|
||||
raise ValueError(
|
||||
error_message.format(
|
||||
mode=self.mode,
|
||||
tables=required_tables,
|
||||
found=self.lookups.tables,
|
||||
)
|
||||
)
|
||||
self._validated = True
|
||||
|
||||
def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
|
||||
"""Apply the pipe to a stream of documents. This usually happens under
|
||||
the hood when the nlp object is called on a text and all components are
|
||||
applied to the Doc.
|
||||
|
@ -263,7 +259,7 @@ class Lemmatizer(Pipe):
|
|||
"""
|
||||
return False
|
||||
|
||||
def score(self, examples, **kwargs) -> Dict[str, Any]:
|
||||
def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
||||
"""Score a batch of examples.
|
||||
|
||||
examples (Iterable[Example]): The examples to score.
|
||||
|
@ -274,58 +270,66 @@ class Lemmatizer(Pipe):
|
|||
validate_examples(examples, "Lemmatizer.score")
|
||||
return Scorer.score_token_attr(examples, "lemma", **kwargs)
|
||||
|
||||
def to_disk(self, path, *, exclude=tuple()):
|
||||
"""Save the current state to a directory.
|
||||
def to_disk(
|
||||
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
|
||||
):
|
||||
"""Serialize the pipe to disk.
|
||||
|
||||
path (unicode or Path): A path to a directory, which will be created if
|
||||
it doesn't exist.
|
||||
exclude (list): String names of serialization fields to exclude.
|
||||
path (str / Path): Path to a directory.
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/vocab#to_disk
|
||||
DOCS: https://nightly.spacy.io/api/lemmatizer#to_disk
|
||||
"""
|
||||
serialize = {}
|
||||
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
|
||||
serialize["lookups"] = lambda p: self.lookups.to_disk(p)
|
||||
util.to_disk(path, serialize, exclude)
|
||||
|
||||
def from_disk(self, path, *, exclude=tuple()):
|
||||
"""Loads state from a directory. Modifies the object in place and
|
||||
returns it.
|
||||
def from_disk(
|
||||
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
|
||||
) -> "Lemmatizer":
|
||||
"""Load the pipe from disk. Modifies the object in place and returns it.
|
||||
|
||||
path (unicode or Path): A path to a directory.
|
||||
exclude (list): String names of serialization fields to exclude.
|
||||
RETURNS (Vocab): The modified `Vocab` object.
|
||||
path (str / Path): Path to a directory.
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (Lemmatizer): The modified Lemmatizer object.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/vocab#to_disk
|
||||
DOCS: https://nightly.spacy.io/api/lemmatizer#from_disk
|
||||
"""
|
||||
deserialize = {}
|
||||
deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
|
||||
deserialize["lookups"] = lambda p: self.lookups.from_disk(p)
|
||||
util.from_disk(path, deserialize, exclude)
|
||||
self._validate_tables()
|
||||
return self
|
||||
|
||||
def to_bytes(self, *, exclude=tuple()) -> bytes:
|
||||
"""Serialize the current state to a binary string.
|
||||
def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
|
||||
"""Serialize the pipe to a bytestring.
|
||||
|
||||
exclude (list): String names of serialization fields to exclude.
|
||||
RETURNS (bytes): The serialized form of the `Vocab` object.
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (bytes): The serialized object.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/vocab#to_bytes
|
||||
DOCS: https://nightly.spacy.io/api/lemmatizer#to_bytes
|
||||
"""
|
||||
serialize = {}
|
||||
serialize["vocab"] = self.vocab.to_bytes
|
||||
serialize["lookups"] = self.lookups.to_bytes
|
||||
return util.to_bytes(serialize, exclude)
|
||||
|
||||
def from_bytes(self, bytes_data: bytes, *, exclude=tuple()):
|
||||
"""Load state from a binary string.
|
||||
def from_bytes(
|
||||
self, bytes_data: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
|
||||
) -> "Lemmatizer":
|
||||
"""Load the pipe from a bytestring.
|
||||
|
||||
bytes_data (bytes): The data to load from.
|
||||
exclude (list): String names of serialization fields to exclude.
|
||||
RETURNS (Vocab): The `Vocab` object.
|
||||
bytes_data (bytes): The serialized pipe.
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
RETURNS (Lemmatizer): The loaded Lemmatizer.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/vocab#from_bytes
|
||||
DOCS: https://nightly.spacy.io/api/lemmatizer#from_bytes
|
||||
"""
|
||||
deserialize = {}
|
||||
deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
|
||||
deserialize["lookups"] = lambda b: self.lookups.from_bytes(b)
|
||||
util.from_bytes(bytes_data, deserialize, exclude)
|
||||
self._validate_tables()
|
||||
return self
|
||||
|
|
|
@ -17,16 +17,31 @@ def test_lemmatizer_initialize(lang, capfd):
|
|||
@registry.misc("lemmatizer_init_lookups")
|
||||
def lemmatizer_init_lookups():
|
||||
lookups = Lookups()
|
||||
lookups.add_table("lemma_lookup", {"cope": "cope"})
|
||||
lookups.add_table("lemma_lookup", {"cope": "cope", "x": "y"})
|
||||
lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
|
||||
lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
|
||||
lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
|
||||
return lookups
|
||||
|
||||
"""Test that languages can be initialized."""
|
||||
# Test that languages can be initialized
|
||||
nlp = get_lang_class(lang)()
|
||||
nlp.add_pipe("lemmatizer", config={"lookups": {"@misc": "lemmatizer_init_lookups"}})
|
||||
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
|
||||
assert not lemmatizer.lookups.tables
|
||||
nlp.config["initialize"]["components"]["lemmatizer"] = {
|
||||
"lookups": {"@misc": "lemmatizer_init_lookups"}
|
||||
}
|
||||
with pytest.raises(ValueError):
|
||||
nlp("x")
|
||||
nlp.initialize()
|
||||
assert lemmatizer.lookups.tables
|
||||
doc = nlp("x")
|
||||
# Check for stray print statements (see #3342)
|
||||
doc = nlp("test") # noqa: F841
|
||||
captured = capfd.readouterr()
|
||||
assert not captured.out
|
||||
assert doc[0].lemma_ == "y"
|
||||
|
||||
# Test initialization by calling .initialize() directly
|
||||
nlp = get_lang_class(lang)()
|
||||
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
|
||||
lemmatizer.initialize(lookups=lemmatizer_init_lookups())
|
||||
assert nlp("x")[0].lemma_ == "y"
|
||||
|
|
|
@ -8,61 +8,52 @@ from ..util import make_tempdir
|
|||
|
||||
@pytest.fixture
|
||||
def nlp():
|
||||
return English()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def lemmatizer(nlp):
|
||||
@registry.misc("cope_lookups")
|
||||
def cope_lookups():
|
||||
lookups = Lookups()
|
||||
lookups.add_table("lemma_lookup", {"cope": "cope"})
|
||||
lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"})
|
||||
lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
|
||||
lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
|
||||
lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
|
||||
return lookups
|
||||
|
||||
lemmatizer = nlp.add_pipe(
|
||||
"lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
|
||||
)
|
||||
return lemmatizer
|
||||
nlp = English()
|
||||
nlp.config["initialize"]["components"]["lemmatizer"] = {
|
||||
"lookups": {"@misc": "cope_lookups"}
|
||||
}
|
||||
return nlp
|
||||
|
||||
|
||||
def test_lemmatizer_init(nlp):
|
||||
@registry.misc("cope_lookups")
|
||||
def cope_lookups():
|
||||
lookups = Lookups()
|
||||
lookups.add_table("lemma_lookup", {"cope": "cope"})
|
||||
lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
|
||||
lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
|
||||
lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
|
||||
return lookups
|
||||
|
||||
lemmatizer = nlp.add_pipe(
|
||||
"lemmatizer", config={"mode": "lookup", "lookups": {"@misc": "cope_lookups"}}
|
||||
)
|
||||
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
|
||||
assert isinstance(lemmatizer.lookups, Lookups)
|
||||
assert not lemmatizer.lookups.tables
|
||||
assert lemmatizer.mode == "lookup"
|
||||
with pytest.raises(ValueError):
|
||||
nlp("test")
|
||||
nlp.initialize()
|
||||
assert lemmatizer.lookups.tables
|
||||
assert nlp("cope")[0].lemma_ == "cope"
|
||||
assert nlp("coped")[0].lemma_ == "cope"
|
||||
# replace any tables from spacy-lookups-data
|
||||
lemmatizer.lookups = Lookups()
|
||||
doc = nlp("coping")
|
||||
# lookup with no tables sets text as lemma
|
||||
assert doc[0].lemma_ == "coping"
|
||||
|
||||
assert nlp("cope")[0].lemma_ == "cope"
|
||||
assert nlp("coped")[0].lemma_ == "coped"
|
||||
nlp.remove_pipe("lemmatizer")
|
||||
|
||||
@registry.misc("empty_lookups")
|
||||
def empty_lookups():
|
||||
return Lookups()
|
||||
|
||||
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
|
||||
with pytest.raises(ValueError):
|
||||
nlp.add_pipe(
|
||||
"lemmatizer",
|
||||
config={"mode": "lookup", "lookups": {"@misc": "empty_lookups"}},
|
||||
)
|
||||
# Can't initialize without required tables
|
||||
lemmatizer.initialize(lookups=Lookups())
|
||||
lookups = Lookups()
|
||||
lookups.add_table("lemma_lookup", {})
|
||||
lemmatizer.initialize(lookups=lookups)
|
||||
|
||||
|
||||
def test_lemmatizer_config(nlp, lemmatizer):
|
||||
def test_lemmatizer_config(nlp):
|
||||
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
|
||||
nlp.initialize()
|
||||
|
||||
doc = nlp.make_doc("coping")
|
||||
doc[0].pos_ = "VERB"
|
||||
assert doc[0].lemma_ == ""
|
||||
|
@ -78,20 +69,21 @@ def test_lemmatizer_config(nlp, lemmatizer):
|
|||
assert doc[0].lemma_ == "cope"
|
||||
|
||||
|
||||
def test_lemmatizer_serialize(nlp, lemmatizer):
|
||||
@registry.misc("cope_lookups")
|
||||
def test_lemmatizer_serialize(nlp):
|
||||
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
|
||||
nlp.initialize()
|
||||
|
||||
def cope_lookups():
|
||||
lookups = Lookups()
|
||||
lookups.add_table("lemma_lookup", {"cope": "cope"})
|
||||
lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"})
|
||||
lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
|
||||
lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
|
||||
lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
|
||||
return lookups
|
||||
|
||||
nlp2 = English()
|
||||
lemmatizer2 = nlp2.add_pipe(
|
||||
"lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
|
||||
)
|
||||
lemmatizer2 = nlp2.add_pipe("lemmatizer", config={"mode": "rule"})
|
||||
lemmatizer2.initialize(lookups=cope_lookups())
|
||||
lemmatizer2.from_bytes(lemmatizer.to_bytes())
|
||||
assert lemmatizer.to_bytes() == lemmatizer2.to_bytes()
|
||||
assert lemmatizer.lookups.tables == lemmatizer2.lookups.tables
|
||||
|
@ -100,9 +92,9 @@ def test_lemmatizer_serialize(nlp, lemmatizer):
|
|||
with make_tempdir() as tmp_dir:
|
||||
nlp.to_disk(tmp_dir)
|
||||
nlp2 = util.load_model_from_path(tmp_dir)
|
||||
doc2 = nlp2.make_doc("coping")
|
||||
doc2[0].pos_ = "VERB"
|
||||
assert doc2[0].lemma_ == ""
|
||||
doc2 = lemmatizer(doc2)
|
||||
assert doc2[0].text == "coping"
|
||||
assert doc2[0].lemma_ == "cope"
|
||||
doc2 = nlp2.make_doc("coping")
|
||||
doc2[0].pos_ = "VERB"
|
||||
assert doc2[0].lemma_ == ""
|
||||
doc2 = lemmatizer(doc2)
|
||||
assert doc2[0].text == "coping"
|
||||
assert doc2[0].lemma_ == "cope"
|
||||
|
|
|
@ -48,12 +48,11 @@ data format used by the lookup and rule-based lemmatizers, see
|
|||
> nlp.add_pipe("lemmatizer", config=config)
|
||||
> ```
|
||||
|
||||
| Setting | Description |
|
||||
| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `mode` | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ |
|
||||
| `lookups` | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `None`. ~~Optional[Lookups]~~ |
|
||||
| `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~ |
|
||||
| `model` | **Not yet implemented:** the model to use. ~~Model~~ |
|
||||
| Setting | Description |
|
||||
| ----------- | --------------------------------------------------------------------------------- |
|
||||
| `mode` | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ |
|
||||
| `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~ |
|
||||
| `model` | **Not yet implemented:** the model to use. ~~Model~~ |
|
||||
|
||||
```python
|
||||
%%GITHUB_SPACY/spacy/pipeline/lemmatizer.py
|
||||
|
@ -76,15 +75,14 @@ Create a new pipeline instance. In your application, you would normally use a
|
|||
shortcut for this and instantiate the component using its string name and
|
||||
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||
| `model` | **Not yet implemented:** The model to use. ~~Model~~ |
|
||||
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| mode | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ |
|
||||
| lookups | A lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. Defaults to `None`. ~~Optional[Lookups]~~ |
|
||||
| overwrite | Whether to overwrite existing lemmas. ~~bool~ |
|
||||
| Name | Description |
|
||||
| -------------- | --------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||
| `model` | **Not yet implemented:** The model to use. ~~Model~~ |
|
||||
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `mode` | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ |
|
||||
| `overwrite` | Whether to overwrite existing lemmas. ~~bool~~ |
|
||||
|
||||
## Lemmatizer.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
|
@ -127,6 +125,37 @@ applied to the `Doc` in order.
|
|||
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
|
||||
| **YIELDS** | The processed documents in order. ~~Doc~~ |
|
||||
|
||||
## Lemmatizer.initialize {#initialize tag="method"}
|
||||
|
||||
Initialize the lemmatizer and load any data resources. This method is typically
|
||||
called by [`Language.initialize`](/api/language#initialize) and lets you
|
||||
customize arguments it receives via the
|
||||
[`[initialize.components]`](/api/data-formats#config-initialize) block in the
|
||||
config. The loading only happens during initialization, typically before
|
||||
training. At runtime, all data is loaded from disk.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> lemmatizer = nlp.add_pipe("lemmatizer")
|
||||
> lemmatizer.initialize(lookups=lookups)
|
||||
> ```
|
||||
>
|
||||
> ```ini
|
||||
> ### config.cfg
|
||||
> [initialize.components.lemmatizer]
|
||||
>
|
||||
> [initialize.components.lemmatizer.lookups]
|
||||
> @misc = "load_my_lookups.v1"
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Defaults to `None`. ~~Optional[Callable[[], Iterable[Example]]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `nlp` | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
|
||||
| `lookups` | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `None`. ~~Optional[Lookups]~~ |
|
||||
|
||||
## Lemmatizer.lookup_lemmatize {#lookup_lemmatize tag="method"}
|
||||
|
||||
Lemmatize a token using a lookup-based approach. If no lemma is found, the
|
||||
|
|
Loading…
Reference in New Issue
Block a user