Mirror of https://github.com/explosion/spaCy.git

Make lemmatizers use initialize logic (#6182)

* Make lemmatizer use initialize logic and tidy up
* Fix typo
* Raise for uninitialized tables

This commit is contained in:
parent df06f7a792
commit f0b30aedad
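
In short, the lemmatizer no longer receives its lookup tables through the factory's `lookups` config setting; it starts empty and loads its tables when `initialize` is called. A minimal sketch of the resulting workflow, assuming `spacy-lookups-data` is installed for the language (variable names here are illustrative, not from the diff):

```python
import spacy

nlp = spacy.blank("en")
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
assert not lemmatizer.lookups.tables  # the component starts with empty lookups
nlp.initialize()     # loads required (and optional) tables from spacy-lookups-data
doc = nlp("coping")  # calling the pipeline before initialize() raises a ValueError
```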
				
			
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -477,6 +477,8 @@ class Errors:
     E201 = ("Span index out of range.")
 
     # TODO: fix numbering after merging develop into master
+    E912 = ("Failed to initialize lemmatizer. Missing lemmatizer table(s) found "
+            "for mode '{mode}'. Required tables: {tables}. Found: {found}.")
     E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
             "config.cfg or override it on the CLI?")
     E914 = ("Executing {name} callback failed. Expected the function to "
@@ -556,10 +558,10 @@ class Errors:
     E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}")
     E954 = ("The Tok2Vec listener did not receive any valid input from an upstream "
            "component.")
-    E955 = ("Can't find table(s) '{table}' for language '{lang}' in "
-            "spacy-lookups-data. If you want to initialize a blank nlp object, "
-            "make sure you have the spacy-lookups-data package installed or "
-            "remove the [initialize.lookups] block from your config.")
+    E955 = ("Can't find table(s) {table} for language '{lang}' in "
+            "spacy-lookups-data. Make sure you have the package installed or "
+            "provide your own lookup tables if no default lookups are available "
+            "for your language.")
     E956 = ("Can't find component '{name}' in [components] block in the config. "
             "Available components: {opts}")
     E957 = ("Writing directly to Language.factories isn't needed anymore in "
@@ -685,9 +687,8 @@ class Errors:
     E1002 = ("Span index out of range.")
     E1003 = ("Unsupported lemmatizer mode '{mode}'.")
     E1004 = ("Missing lemmatizer table(s) found for lemmatizer mode '{mode}'. "
-             "Required tables '{tables}', found '{found}'. If you are not "
-             "providing custom lookups, make sure you have the package "
-             "spacy-lookups-data installed.")
+             "Required tables: {tables}. Found: {found}. Maybe you forgot to "
+             "call nlp.initialize() to load in the data?")
     E1005 = ("Unable to set attribute '{attr}' in tokenizer exception for "
             "'{chunk}'. Tokenizer exceptions are only allowed to specify "
             "`ORTH` and `NORM`.")
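
Looking ahead to the pipeline changes further down in this diff: `E1004` is the message used when required tables are missing at call or initialization time, while `E912` is the default message of `_validate_tables` and so surfaces when a component restored via `from_disk`/`from_bytes` lacks its tables. A rough sketch of when each fires (illustrative, not part of the commit):

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("lemmatizer", config={"mode": "rule"})
# Calling the pipeline before initialization fails table validation and
# raises a ValueError with the E1004 message:
# nlp("coping")
nlp.initialize()  # loads tables; a table absent from spacy-lookups-data -> E955
```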
				
			
--- a/spacy/lang/bn/__init__.py
+++ b/spacy/lang/bn/__init__.py
@@ -4,7 +4,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from ...language import Language
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer
 
 
@@ -24,18 +23,11 @@ class Bengali(Language):
 @Bengali.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Bengali"]

--- a/spacy/lang/el/__init__.py
+++ b/spacy/lang/el/__init__.py
@@ -7,7 +7,6 @@ from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .lemmatizer import GreekLemmatizer
-from ...lookups import Lookups
 from ...language import Language
 
 
@@ -29,18 +28,11 @@ class Greek(Language):
 @Greek.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = GreekLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return GreekLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return GreekLemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Greek"]

--- a/spacy/lang/en/__init__.py
+++ b/spacy/lang/en/__init__.py
@@ -8,7 +8,6 @@ from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_INFIXES
 from .lemmatizer import EnglishLemmatizer
 from ...language import Language
-from ...lookups import Lookups
 
 
 class EnglishDefaults(Language.Defaults):
@@ -27,18 +26,11 @@ class English(Language):
 @English.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = EnglishLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return EnglishLemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["English"]

--- a/spacy/lang/fa/__init__.py
+++ b/spacy/lang/fa/__init__.py
@@ -6,7 +6,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_SUFFIXES
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer
 
 
@@ -27,18 +26,11 @@ class Persian(Language):
 @Persian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Persian"]

--- a/spacy/lang/fr/__init__.py
+++ b/spacy/lang/fr/__init__.py
@@ -9,7 +9,6 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from .lemmatizer import FrenchLemmatizer
-from ...lookups import Lookups
 from ...language import Language
 
 
@@ -32,18 +31,11 @@ class French(Language):
 @French.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = FrenchLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return FrenchLemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["French"]

--- a/spacy/lang/nb/__init__.py
+++ b/spacy/lang/nb/__init__.py
@@ -6,7 +6,6 @@ from .punctuation import TOKENIZER_SUFFIXES
 from .stop_words import STOP_WORDS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer
 
 
@@ -27,18 +26,11 @@ class Norwegian(Language):
 @Norwegian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Norwegian"]

--- a/spacy/lang/nl/__init__.py
+++ b/spacy/lang/nl/__init__.py
@@ -1,5 +1,4 @@
 from typing import Optional
-
 from thinc.api import Model
 
 from .stop_words import STOP_WORDS
@@ -8,7 +7,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
 from .lemmatizer import DutchLemmatizer
-from ...lookups import Lookups
 from ...language import Language
 
 
@@ -29,18 +27,11 @@ class Dutch(Language):
 @Dutch.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = DutchLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return DutchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return DutchLemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Dutch"]

--- a/spacy/lang/pl/__init__.py
+++ b/spacy/lang/pl/__init__.py
@@ -34,18 +34,11 @@ class Polish(Language):
 @Polish.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pos_lookup", "lookups": None},
+    default_config={"model": None, "mode": "pos_lookup"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = PolishLemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return PolishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return PolishLemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Polish"]

--- a/spacy/lang/ru/__init__.py
+++ b/spacy/lang/ru/__init__.py
@@ -6,7 +6,6 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import RussianLemmatizer
 from ...language import Language
-from ...lookups import Lookups
 
 
 class RussianDefaults(Language.Defaults):
@@ -23,17 +22,11 @@ class Russian(Language):
 @Russian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pymorphy2", "lookups": None},
+    default_config={"model": None, "mode": "pymorphy2"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    return RussianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return RussianLemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Russian"]

--- a/spacy/lang/sv/__init__.py
+++ b/spacy/lang/sv/__init__.py
@@ -5,7 +5,6 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .syntax_iterators import SYNTAX_ITERATORS
 from ...language import Language
-from ...lookups import Lookups
 from ...pipeline import Lemmatizer
 
 
@@ -30,18 +29,11 @@ class Swedish(Language):
 @Swedish.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "rule", "lookups": None},
+    default_config={"model": None, "mode": "rule"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return Lemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Swedish"]

--- a/spacy/lang/uk/__init__.py
+++ b/spacy/lang/uk/__init__.py
@@ -7,7 +7,6 @@ from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .lemmatizer import UkrainianLemmatizer
 from ...language import Language
-from ...lookups import Lookups
 
 
 class UkrainianDefaults(Language.Defaults):
@@ -24,17 +23,11 @@ class Ukrainian(Language):
 @Ukrainian.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={"model": None, "mode": "pymorphy2", "lookups": None},
+    default_config={"model": None, "mode": "pymorphy2"},
     default_score_weights={"lemma_acc": 1.0},
 )
-def make_lemmatizer(
-    nlp: Language,
-    model: Optional[Model],
-    name: str,
-    mode: str,
-    lookups: Optional[Lookups],
-):
-    return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str):
+    return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode)
 
 
 __all__ = ["Ukrainian"]

--- a/spacy/pipeline/lemmatizer.py
+++ b/spacy/pipeline/lemmatizer.py
@@ -1,26 +1,25 @@
-from typing import Optional, List, Dict, Any
+from typing import Optional, List, Dict, Any, Callable, Iterable, Iterator, Union
+from typing import Tuple
 from thinc.api import Model
+from pathlib import Path
 
 from .pipe import Pipe
 from ..errors import Errors
 from ..language import Language
+from ..training import Example
 from ..lookups import Lookups, load_lookups
 from ..scorer import Scorer
 from ..tokens import Doc, Token
 from ..vocab import Vocab
 from ..training import validate_examples
+from ..util import logger, SimpleFrozenList
 from .. import util
 
 
 @Language.factory(
     "lemmatizer",
     assigns=["token.lemma"],
-    default_config={
-        "model": None,
-        "mode": "lookup",
-        "lookups": None,
-        "overwrite": False,
-    },
+    default_config={"model": None, "mode": "lookup", "overwrite": False},
     default_score_weights={"lemma_acc": 1.0},
 )
 def make_lemmatizer(
@@ -28,13 +27,9 @@ def make_lemmatizer(
     model: Optional[Model],
     name: str,
     mode: str,
-    lookups: Optional[Lookups],
     overwrite: bool = False,
 ):
-    lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups)
-    return Lemmatizer(
-        nlp.vocab, model, name, mode=mode, lookups=lookups, overwrite=overwrite
-    )
+    return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
 
 
 class Lemmatizer(Pipe):
@@ -46,59 +41,19 @@ class Lemmatizer(Pipe):
     """
 
     @classmethod
-    def get_lookups_config(cls, mode: str) -> Dict:
+    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
         """Returns the lookups configuration settings for a given mode for use
         in Lemmatizer.load_lookups.
 
         mode (str): The lemmatizer mode.
-        RETURNS (dict): The lookups configuration settings for this mode.
-
-        DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config
+        RETURNS (Tuple[List[str], List[str]]): The required and optional
+            lookup tables for this mode.
         """
         if mode == "lookup":
-            return {
-                "required_tables": ["lemma_lookup"],
-            }
+            return (["lemma_lookup"], [])
         elif mode == "rule":
-            return {
-                "required_tables": ["lemma_rules"],
-                "optional_tables": ["lemma_exc", "lemma_index"],
-            }
-        return {}
-
-    @classmethod
-    def load_lookups(cls, lang: str, mode: str, lookups: Optional[Lookups]) -> Lookups:
-        """Load and validate lookups tables. If the provided lookups is None,
-        load the default lookups tables according to the language and mode
-        settings. Confirm that all required tables for the language and mode
-        are present.
-
-        lang (str): The language code.
-        mode (str): The lemmatizer mode.
-        lookups (Lookups): The provided lookups, may be None if the default
-            lookups should be loaded.
-        RETURNS (Lookups): The Lookups object.
-
-        DOCS: https://nightly.spacy.io/api/lemmatizer#get_lookups_config
-        """
-        config = cls.get_lookups_config(mode)
-        required_tables = config.get("required_tables", [])
-        optional_tables = config.get("optional_tables", [])
-        if lookups is None:
-            lookups = load_lookups(lang=lang, tables=required_tables)
-            optional_lookups = load_lookups(
-                lang=lang, tables=optional_tables, strict=False
-            )
-            for table in optional_lookups.tables:
-                lookups.set_table(table, optional_lookups.get_table(table))
-        for table in required_tables:
-            if table not in lookups:
-                raise ValueError(
-                    Errors.E1004.format(
-                        mode=mode, tables=required_tables, found=lookups.tables
-                    )
-                )
-        return lookups
+            return (["lemma_rules"], ["lemma_exc", "lemma_index"])
+        return ([], [])
 
     def __init__(
         self,
@@ -107,7 +62,6 @@ class Lemmatizer(Pipe):
         name: str = "lemmatizer",
         *,
         mode: str = "lookup",
-        lookups: Optional[Lookups] = None,
         overwrite: bool = False,
     ) -> None:
         """Initialize a Lemmatizer.
@@ -116,9 +70,6 @@ class Lemmatizer(Pipe):
         model (Model): A model (not yet implemented).
         name (str): The component name. Defaults to "lemmatizer".
         mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup".
-        lookups (Lookups): The lookups object containing the (optional) tables
-            such as "lemma_rules", "lemma_index", "lemma_exc" and
-            "lemma_lookup". Defaults to None
         overwrite (bool): Whether to overwrite existing lemmas. Defaults to
             `False`.
 
@@ -128,8 +79,9 @@ class Lemmatizer(Pipe):
         self.model = model
         self.name = name
         self._mode = mode
-        self.lookups = lookups if lookups is not None else Lookups()
+        self.lookups = Lookups()
         self.overwrite = overwrite
+        self._validated = False
         if self.mode == "lookup":
             self.lemmatize = self.lookup_lemmatize
         elif self.mode == "rule":
@@ -153,12 +105,56 @@ class Lemmatizer(Pipe):
 
         DOCS: https://nightly.spacy.io/api/lemmatizer#call
         """
+        if not self._validated:
+            self._validate_tables(Errors.E1004)
         for token in doc:
             if self.overwrite or token.lemma == 0:
                 token.lemma_ = self.lemmatize(token)[0]
         return doc
 
-    def pipe(self, stream, *, batch_size=128):
+    def initialize(
+        self,
+        get_examples: Optional[Callable[[], Iterable[Example]]] = None,
+        *,
+        nlp: Optional[Language] = None,
+        lookups: Optional[Lookups] = None,
+    ):
+        """Initialize the lemmatizer and load in data.
+
+        get_examples (Callable[[], Iterable[Example]]): Function that
+            returns a representative sample of gold-standard Example objects.
+        nlp (Language): The current nlp object the component is part of.
+        lookups (Lookups): The lookups object containing the (optional) tables
+            such as "lemma_rules", "lemma_index", "lemma_exc" and
+            "lemma_lookup". Defaults to None.
+        """
+        required_tables, optional_tables = self.get_lookups_config(self.mode)
+        if lookups is None:
+            logger.debug("Lemmatizer: loading tables from spacy-lookups-data")
+            lookups = load_lookups(lang=self.vocab.lang, tables=required_tables)
+            optional_lookups = load_lookups(
+                lang=self.vocab.lang, tables=optional_tables, strict=False
+            )
+            for table in optional_lookups.tables:
+                lookups.set_table(table, optional_lookups.get_table(table))
+        self.lookups = lookups
+        self._validate_tables(Errors.E1004)
+
+    def _validate_tables(self, error_message: str = Errors.E912) -> None:
+        """Check that the lookups are correct for the current mode."""
+        required_tables, optional_tables = self.get_lookups_config(self.mode)
+        for table in required_tables:
+            if table not in self.lookups:
+                raise ValueError(
+                    error_message.format(
+                        mode=self.mode,
+                        tables=required_tables,
+                        found=self.lookups.tables,
+                    )
+                )
+        self._validated = True
+
+    def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
         """Apply the pipe to a stream of documents. This usually happens under
         the hood when the nlp object is called on a text and all components are
         applied to the Doc.
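
The new `initialize` also accepts a `Lookups` object directly, which is how the tests further down supply custom tables. A minimal sketch (the table contents here are placeholders, not from the diff):

```python
import spacy
from spacy.lookups import Lookups

nlp = spacy.blank("en")
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
lookups = Lookups()
lookups.add_table("lemma_lookup", {"coping": "cope"})  # placeholder table data
lemmatizer.initialize(lookups=lookups)  # checked by _validate_tables(Errors.E1004)
assert nlp("coping")[0].lemma_ == "cope"
```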
				
			
@@ -263,7 +259,7 @@ class Lemmatizer(Pipe):
         """
         return False
 
-    def score(self, examples, **kwargs) -> Dict[str, Any]:
+    def score(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
         """Score a batch of examples.
 
         examples (Iterable[Example]): The examples to score.
@@ -274,58 +270,66 @@ class Lemmatizer(Pipe):
         validate_examples(examples, "Lemmatizer.score")
         return Scorer.score_token_attr(examples, "lemma", **kwargs)
 
-    def to_disk(self, path, *, exclude=tuple()):
-        """Save the current state to a directory.
+    def to_disk(
+        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
+    ):
+        """Serialize the pipe to disk.
 
-        path (unicode or Path): A path to a directory, which will be created if
-            it doesn't exist.
-        exclude (list): String names of serialization fields to exclude.
+        path (str / Path): Path to a directory.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
 
-        DOCS: https://nightly.spacy.io/api/vocab#to_disk
+        DOCS: https://nightly.spacy.io/api/lemmatizer#to_disk
         """
         serialize = {}
         serialize["vocab"] = lambda p: self.vocab.to_disk(p)
         serialize["lookups"] = lambda p: self.lookups.to_disk(p)
         util.to_disk(path, serialize, exclude)
 
-    def from_disk(self, path, *, exclude=tuple()):
-        """Loads state from a directory. Modifies the object in place and
-        returns it.
+    def from_disk(
+        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
+    ) -> "Lemmatizer":
+        """Load the pipe from disk. Modifies the object in place and returns it.
 
-        path (unicode or Path): A path to a directory.
-        exclude (list): String names of serialization fields to exclude.
-        RETURNS (Vocab): The modified `Vocab` object.
+        path (str / Path): Path to a directory.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (Lemmatizer): The modified Lemmatizer object.
 
-        DOCS: https://nightly.spacy.io/api/vocab#to_disk
+        DOCS: https://nightly.spacy.io/api/lemmatizer#from_disk
         """
         deserialize = {}
         deserialize["vocab"] = lambda p: self.vocab.from_disk(p)
         deserialize["lookups"] = lambda p: self.lookups.from_disk(p)
         util.from_disk(path, deserialize, exclude)
+        self._validate_tables()
         return self
 
-    def to_bytes(self, *, exclude=tuple()) -> bytes:
-        """Serialize the current state to a binary string.
+    def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
+        """Serialize the pipe to a bytestring.
 
-        exclude (list): String names of serialization fields to exclude.
-        RETURNS (bytes): The serialized form of the `Vocab` object.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (bytes): The serialized object.
 
-        DOCS: https://nightly.spacy.io/api/vocab#to_bytes
+        DOCS: https://nightly.spacy.io/api/lemmatizer#to_bytes
         """
         serialize = {}
         serialize["vocab"] = self.vocab.to_bytes
         serialize["lookups"] = self.lookups.to_bytes
         return util.to_bytes(serialize, exclude)
 
-    def from_bytes(self, bytes_data: bytes, *, exclude=tuple()):
-        """Load state from a binary string.
+    def from_bytes(
+        self, bytes_data: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
+    ) -> "Lemmatizer":
+        """Load the pipe from a bytestring.
 
-        bytes_data (bytes): The data to load from.
-        exclude (list): String names of serialization fields to exclude.
-        RETURNS (Vocab): The `Vocab` object.
+        bytes_data (bytes): The serialized pipe.
+        exclude (Iterable[str]): String names of serialization fields to exclude.
+        RETURNS (Lemmatizer): The loaded Lemmatizer.
 
-        DOCS: https://nightly.spacy.io/api/vocab#from_bytes
+        DOCS: https://nightly.spacy.io/api/lemmatizer#from_bytes
         """
         deserialize = {}
         deserialize["vocab"] = lambda b: self.vocab.from_bytes(b)
         deserialize["lookups"] = lambda b: self.lookups.from_bytes(b)
         util.from_bytes(bytes_data, deserialize, exclude)
+        self._validate_tables()
        return self
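
Note the `self._validate_tables()` calls added to `from_disk` and `from_bytes`: a deserialized lemmatizer is checked as soon as it loads, so missing tables fail fast with the E912 message rather than at call time. A sketch of the round trip, continuing the example above:

```python
data = lemmatizer.to_bytes()  # lookups serialize together with the component
nlp2 = spacy.blank("en")
lemmatizer2 = nlp2.add_pipe("lemmatizer", config={"mode": "lookup"})
lemmatizer2.from_bytes(data)  # runs _validate_tables() on load
```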
				
			
--- a/spacy/tests/lang/test_lemmatizers.py
+++ b/spacy/tests/lang/test_lemmatizers.py
@@ -17,16 +17,31 @@ def test_lemmatizer_initialize(lang, capfd):
     @registry.misc("lemmatizer_init_lookups")
     def lemmatizer_init_lookups():
         lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
+        lookups.add_table("lemma_lookup", {"cope": "cope", "x": "y"})
         lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
         lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
         lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
         return lookups
 
-    """Test that languages can be initialized."""
+    # Test that languages can be initialized
     nlp = get_lang_class(lang)()
-    nlp.add_pipe("lemmatizer", config={"lookups": {"@misc": "lemmatizer_init_lookups"}})
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
+    assert not lemmatizer.lookups.tables
+    nlp.config["initialize"]["components"]["lemmatizer"] = {
+        "lookups": {"@misc": "lemmatizer_init_lookups"}
+    }
+    with pytest.raises(ValueError):
+        nlp("x")
+    nlp.initialize()
+    assert lemmatizer.lookups.tables
+    doc = nlp("x")
     # Check for stray print statements (see #3342)
-    doc = nlp("test")  # noqa: F841
     captured = capfd.readouterr()
     assert not captured.out
+    assert doc[0].lemma_ == "y"
+
+    # Test initialization by calling .initialize() directly
+    nlp = get_lang_class(lang)()
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
+    lemmatizer.initialize(lookups=lemmatizer_init_lookups())
+    assert nlp("x")[0].lemma_ == "y"
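
The test above drives initialization through `nlp.config["initialize"]`, which is the in-code equivalent of an `[initialize.components.lemmatizer.lookups]` block in `config.cfg` (shown in the docs diff at the end of this commit). A standalone sketch of that registry-based route, reusing the test's function name:

```python
import spacy
from spacy.lookups import Lookups
from spacy.util import registry

@registry.misc("lemmatizer_init_lookups")
def lemmatizer_init_lookups():
    lookups = Lookups()
    lookups.add_table("lemma_lookup", {"x": "y"})
    return lookups

nlp = spacy.blank("en")
nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
nlp.config["initialize"]["components"]["lemmatizer"] = {
    "lookups": {"@misc": "lemmatizer_init_lookups"}
}
nlp.initialize()  # resolves the @misc reference and passes the Lookups in
assert nlp("x")[0].lemma_ == "y"
```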
				
			
--- a/spacy/tests/pipeline/test_lemmatizer.py
+++ b/spacy/tests/pipeline/test_lemmatizer.py
@@ -8,61 +8,52 @@ from ..util import make_tempdir
 @pytest.fixture
 def nlp():
-    return English()
-
-
-@pytest.fixture
-def lemmatizer(nlp):
     @registry.misc("cope_lookups")
     def cope_lookups():
         lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
+        lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"})
         lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
         lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
         lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
         return lookups
 
-    lemmatizer = nlp.add_pipe(
-        "lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
-    )
-    return lemmatizer
+    nlp = English()
+    nlp.config["initialize"]["components"]["lemmatizer"] = {
+        "lookups": {"@misc": "cope_lookups"}
+    }
+    return nlp
 
 
 def test_lemmatizer_init(nlp):
-    @registry.misc("cope_lookups")
-    def cope_lookups():
-        lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
-        lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
-        lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
-        lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
-        return lookups
-
-    lemmatizer = nlp.add_pipe(
-        "lemmatizer", config={"mode": "lookup", "lookups": {"@misc": "cope_lookups"}}
-    )
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
     assert isinstance(lemmatizer.lookups, Lookups)
+    assert not lemmatizer.lookups.tables
     assert lemmatizer.mode == "lookup"
+    with pytest.raises(ValueError):
+        nlp("test")
+    nlp.initialize()
+    assert lemmatizer.lookups.tables
+    assert nlp("cope")[0].lemma_ == "cope"
+    assert nlp("coped")[0].lemma_ == "cope"
     # replace any tables from spacy-lookups-data
     lemmatizer.lookups = Lookups()
     doc = nlp("coping")
     # lookup with no tables sets text as lemma
     assert doc[0].lemma_ == "coping"
-
-    assert nlp("cope")[0].lemma_ == "cope"
-    assert nlp("coped")[0].lemma_ == "coped"
     nlp.remove_pipe("lemmatizer")
-
-    @registry.misc("empty_lookups")
-    def empty_lookups():
-        return Lookups()
-
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
     with pytest.raises(ValueError):
-        nlp.add_pipe(
-            "lemmatizer",
-            config={"mode": "lookup", "lookups": {"@misc": "empty_lookups"}},
-        )
+        # Can't initialize without required tables
+        lemmatizer.initialize(lookups=Lookups())
+    lookups = Lookups()
+    lookups.add_table("lemma_lookup", {})
+    lemmatizer.initialize(lookups=lookups)
 
 
-def test_lemmatizer_config(nlp, lemmatizer):
+def test_lemmatizer_config(nlp):
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
+    nlp.initialize()
+
     doc = nlp.make_doc("coping")
     doc[0].pos_ = "VERB"
     assert doc[0].lemma_ == ""
@@ -78,20 +69,21 @@ def test_lemmatizer_config(nlp, lemmatizer):
     assert doc[0].lemma_ == "cope"
 
 
-def test_lemmatizer_serialize(nlp, lemmatizer):
-    @registry.misc("cope_lookups")
+def test_lemmatizer_serialize(nlp):
+    lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
+    nlp.initialize()
+
     def cope_lookups():
         lookups = Lookups()
-        lookups.add_table("lemma_lookup", {"cope": "cope"})
+        lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"})
         lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
         lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
         lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
         return lookups
 
     nlp2 = English()
-    lemmatizer2 = nlp2.add_pipe(
-        "lemmatizer", config={"mode": "rule", "lookups": {"@misc": "cope_lookups"}}
-    )
+    lemmatizer2 = nlp2.add_pipe("lemmatizer", config={"mode": "rule"})
+    lemmatizer2.initialize(lookups=cope_lookups())
     lemmatizer2.from_bytes(lemmatizer.to_bytes())
     assert lemmatizer.to_bytes() == lemmatizer2.to_bytes()
     assert lemmatizer.lookups.tables == lemmatizer2.lookups.tables

			
			@ -49,9 +49,8 @@ data format used by the lookup and rule-based lemmatizers, see
 | 
			
		|||
> ```
 | 
			
		||||
 | 
			
		||||
| Setting     | Description                                                                       |
 | 
			
		||||
| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | 
			
		||||
| ----------- | --------------------------------------------------------------------------------- |
 | 
			
		||||
| `mode`      | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ |
 | 
			
		||||
| `lookups`   | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `None`. ~~Optional[Lookups]~~ |
 | 
			
		||||
| `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~               |
 | 
			
		||||
| `model`     | **Not yet implemented:** the model to use. ~~Model~~                              |
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
@@ -77,13 +76,12 @@ shortcut for this and instantiate the component using its string name and
 [`nlp.add_pipe`](/api/language#add_pipe).
 
 | Name           | Description                                                                                         |
-| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
+| -------------- | --------------------------------------------------------------------------------------------------- |
 | `vocab`        | The shared vocabulary. ~~Vocab~~                                                                    |
 | `model`        | **Not yet implemented:** The model to use. ~~Model~~                                                |
 | `name`         | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
 | _keyword-only_ |                                                                                                     |
 | `mode`         | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~                   |
-| `lookups`      | A lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. Defaults to `None`. ~~Optional[Lookups]~~ |
 | `overwrite`    | Whether to overwrite existing lemmas. ~~bool~~                                                      |
@@ -127,6 +125,37 @@ applied to the `Doc` in order.
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |
 
+## Lemmatizer.initialize {#initialize tag="method"}
+
+Initialize the lemmatizer and load any data resources. This method is typically
+called by [`Language.initialize`](/api/language#initialize) and lets you
+customize arguments it receives via the
+[`[initialize.components]`](/api/data-formats#config-initialize) block in the
+config. The loading only happens during initialization, typically before
+training. At runtime, all data is loaded from disk.
+
+> #### Example
+>
+> ```python
+> lemmatizer = nlp.add_pipe("lemmatizer")
+> lemmatizer.initialize(lookups=lookups)
+> ```
+>
+> ```ini
+> ### config.cfg
+> [initialize.components.lemmatizer]
+>
+> [initialize.components.lemmatizer.lookups]
+> @misc = "load_my_lookups.v1"
+> ```
+
+| Name           | Description |
+| -------------- | ----------- |
+| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. Defaults to `None`. ~~Optional[Callable[[], Iterable[Example]]]~~ |
+| _keyword-only_ |             |
+| `nlp`          | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
+| `lookups`      | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `None`. ~~Optional[Lookups]~~ |
+
 ## Lemmatizer.lookup_lemmatize {#lookup_lemmatize tag="method"}
 
 Lemmatize a token using a lookup-based approach. If no lemma is found, the