From e962784531478841f902494f0f336104ba8a9a18 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Fri, 7 Aug 2020 15:27:13 +0200
Subject: [PATCH] Add Lemmatizer and simplify related components (#5848)

* Add Lemmatizer and simplify related components

* Add `Lemmatizer` pipe with `lookup` and `rule` modes using the `Lookups` tables.
* Reduce `Tagger` to a simple tagger that sets `Token.tag` (no pos or lemma)
* Reduce `Morphology` to only keep track of morph tags (no tag map, lemmatizer, or morph rules)
* Remove lemmatizer from `Vocab`
* Adjust many many tests

Differences:

* No default lookup lemmas
* No special treatment of TAG in `from_array` and similar required
* Easier to modify labels in a `Tagger`
* No extra strings added from morphology / tag map

* Fix test
* Initial fix for Lemmatizer config/serialization
* Adjust init test to be more generic
* Adjust init test to force empty Lookups
* Add simple cache to rule-based lemmatizer
* Convert language-specific lemmatizers

  Convert language-specific lemmatizers to component lemmatizers. Remove
  previous lemmatizer class.

* Fix French and Polish lemmatizers
* Remove outdated UPOS conversions
* Update Russian lemmatizer init in tests
* Add minimal init/run tests for custom lemmatizers
* Add option to overwrite existing lemmas
* Update mode setting, lookup loading, and caching
* Make `mode` an immutable property
* Only enforce strict `load_lookups` for known supported modes
* Move caching into individual `_lemmatize` methods
* Implement strict when lang is not found in lookups
* Fix tables/lookups in make_lemmatizer
* Reallow provided lookups and allow for stricter checks
* Add lookups asset to all Lemmatizer pipe tests
* Rename lookups in lemmatizer init test
* Clean up merge
* Refactor lookup table loading
* Add a `load_lemmatizer_lookups` helper that loads required and optional
  lookups tables based on settings provided by a config.

  Additional slight refactor of lookups:

* Add `Lookups.set_table` to set a table from a provided `Table`
* Reorder class definitions to be able to specify type as `Table`
* Move registry assets into test methods
* Refactor lookups tables config

  Use class methods within `Lemmatizer` to provide the config for particular
  modes and to load the lookups from a config.

* Add pipe and score to lemmatizer
* Simplify Tagger.score
* Add missing import
* Clean up imports and auto-format
* Remove unused kwarg
* Tidy up and auto-format
* Update docstrings for Lemmatizer

  Update docstrings for Lemmatizer. Additionally modify `is_base_form` API to
  take `Token` instead of individual features.

* Update docstrings
* Remove tag map values from Tagger.add_label
* Update API docs
* Fix relative link in Lemmatizer API docs
---
 spacy/default_config.cfg                      |   3 -
 spacy/errors.py                               |   7 +-
 spacy/lang/el/__init__.py                     |  47 ++-
 spacy/lang/el/lemmatizer.py                   |  37 +-
 spacy/lang/en/__init__.py                     |  49 ++-
 spacy/lang/en/lemmatizer.py                   |  69 ++--
 spacy/lang/fr/__init__.py                     |  48 ++-
 spacy/lang/fr/lemmatizer.py                   | 141 +++----
 spacy/lang/ja/__init__.py                     |   2 -
 spacy/lang/ko/__init__.py                     |   4 +-
 spacy/lang/nl/__init__.py                     |  46 ++-
 spacy/lang/nl/lemmatizer.py                   | 183 +++++-----
 spacy/lang/pl/__init__.py                     |  51 ++-
 spacy/lang/pl/lemmatizer.py                   |  48 +--
 spacy/lang/ru/__init__.py                     |  41 +--
 spacy/lang/ru/lemmatizer.py                   |  52 ++-
 spacy/lang/uk/__init__.py                     |  43 +--
 spacy/lang/uk/lemmatizer.py                   | 195 +----------
 spacy/language.py                             |  26 --
 spacy/lemmatizer.py                           | 145 --------
 spacy/lookups.py                              | 304 ++++++++--------
 spacy/morphology.pxd                          |   6 -
 spacy/morphology.pyx                          | 151 +-------
 spacy/pipeline/__init__.py                    |   4 +-
 spacy/pipeline/lemmatizer.py                  | 330 ++++++++++++++++++
 spacy/pipeline/tagger.pyx                     | 126 +------
 spacy/schemas.py                              |   1 -
 spacy/tests/conftest.py                       |   2 +-
 spacy/tests/doc/test_creation.py              |  21 +-
 spacy/tests/doc/test_morphanalysis.py         |  20 +-
 spacy/tests/doc/test_retokenize_merge.py      |   3 -
 spacy/tests/lang/en/test_tagger.py            |  21 --
 spacy/tests/lang/ru/test_lemmatizer.py        |  69 ++--
 spacy/tests/lang/test_lemmatizers.py          |  34 ++
 spacy/tests/morphology/test_morph_features.py |   5 +-
 spacy/tests/morphology/test_morph_pickle.py   |  15 +-
 spacy/tests/parser/test_parse.py              |   4 +-
 spacy/tests/pipeline/test_lemmatizer.py       | 109 ++++++
 spacy/tests/pipeline/test_tagger.py           |  16 +-
 spacy/tests/regression/test_issue1-1000.py    |   8 +-
 spacy/tests/regression/test_issue1001-1500.py |  16 +-
 spacy/tests/regression/test_issue1501-2000.py |   2 -
 spacy/tests/regression/test_issue2501-3000.py |   4 +-
 spacy/tests/regression/test_issue3001-3500.py |   4 +-
 spacy/tests/regression/test_issue3501-4000.py |   4 +-
 spacy/tests/regression/test_issue4001-4500.py |   1 +
 spacy/tests/regression/test_issue5230.py      |   3 +-
 .../serialize/test_serialize_pipeline.py      |   8 +-
 .../serialize/test_serialize_vocab_strings.py |  21 +-
 spacy/tests/test_lemmatizer.py                |  64 ----
 spacy/tests/tokenizer/test_tokenizer.py       |   7 +-
 spacy/tokens/_retokenize.pyx                  |   6 +-
 spacy/tokens/doc.pyx                          |  18 +-
 spacy/tokens/token.pyx                        |  13 +-
 spacy/vocab.pyx                               |  22 +-
 website/docs/api/lemmatizer.md                | 279 +++++++++++----
 website/docs/api/morphology.md                |  59 +---
 website/docs/api/tagger.md                    |  29 +-
 website/docs/api/vocab.md                     |   2 -
 59 files changed, 1439 insertions(+), 1609 deletions(-)
 delete mode 100644 spacy/lemmatizer.py
 create mode 100644 spacy/pipeline/lemmatizer.py
 delete mode 100644 spacy/tests/lang/en/test_tagger.py
 create mode 100644 spacy/tests/lang/test_lemmatizers.py
 create mode 100644 spacy/tests/pipeline/test_lemmatizer.py
 delete mode 100644 spacy/tests/test_lemmatizer.py

diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index 353924280..8aadad668 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -19,9 +19,6 @@ after_pipeline_creation = null
 [nlp.tokenizer]
 @tokenizers = "spacy.Tokenizer.v1"
 
-[nlp.lemmatizer]
-@lemmatizers = "spacy.Lemmatizer.v1"
-
 [components]
 
 # Training hyper-parameters and additional features.
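As a rough usage sketch (not something this patch itself contains): the lemmatizer is now created through a per-language "lemmatizer" factory instead of the removed [nlp.lemmatizer] block. The snippet assumes the base Lemmatizer constructor accepts the same (vocab, model, name, mode=..., lookups=...) arguments that the make_lemmatizer factories below pass through, that the pipe can be called directly on a Doc, and it uses invented table contents:

    # Hedged sketch: build the new component lemmatizer by hand with
    # invented lookup data instead of loading spacy-lookups-data.
    from spacy.lang.en import English
    from spacy.lang.en.lemmatizer import EnglishLemmatizer
    from spacy.lookups import Lookups, Table

    nlp = English()

    lookups = Lookups()
    lookups.add_table("lemma_lookup", {"dogs": "dog", "ran": "run"})
    lookups.set_table("lemma_exc", Table(name="lemma_exc", data={}))  # set_table is new in this patch

    # "lookup" mode appears to consult only the lookup table, so no model is needed.
    lemmatizer = EnglishLemmatizer(
        nlp.vocab, model=None, name="lemmatizer", mode="lookup", lookups=lookups
    )

    doc = lemmatizer(nlp("dogs ran"))  # assumption: the pipe's __call__ takes and returns a Doc
    print([token.lemma_ for token in doc])  # expected: ['dog', 'run']

In a trained pipeline the same component would normally be added via the config or nlp.add_pipe("lemmatizer"), in which case the factory loads the required tables from spacy-lookups-data or from lookups supplied in the config.
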
diff --git a/spacy/errors.py b/spacy/errors.py index 7f47dd332..8e9a8d4b4 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -510,7 +510,7 @@ class Errors: E952 = ("The section '{name}' is not a valid section in the provided config.") E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}") E954 = ("The Tok2Vec listener did not receive a valid input.") - E955 = ("Can't find table '{table}' for language '{lang}' in spacy-lookups-data.") + E955 = ("Can't find table(s) '{table}' for language '{lang}' in spacy-lookups-data.") E956 = ("Can't find component '{name}' in [components] block in the config. " "Available components: {opts}") E957 = ("Writing directly to Language.factories isn't needed anymore in " @@ -633,6 +633,11 @@ class Errors: E1001 = ("Target token outside of matched span for match with tokens " "'{span}' and offset '{index}' matched by patterns '{patterns}'.") E1002 = ("Span index out of range.") + E1003 = ("Unsupported lemmatizer mode '{mode}'.") + E1004 = ("Missing lemmatizer table(s) found for lemmatizer mode '{mode}'. " + "Required tables '{tables}', found '{found}'. If you are not " + "providing custom lookups, make sure you have the package " + "spacy-lookups-data installed.") @add_codes diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py index c766c375e..0c5e0672b 100644 --- a/spacy/lang/el/__init__.py +++ b/spacy/lang/el/__init__.py @@ -1,38 +1,17 @@ -from typing import Callable -from thinc.api import Config +from typing import Optional +from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from .lemmatizer import GreekLemmatizer from .syntax_iterators import SYNTAX_ITERATORS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES -from ...lookups import load_lookups +from .lemmatizer import GreekLemmatizer +from ...lookups import Lookups from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] - -[nlp.lemmatizer] -@lemmatizers = "spacy.el.GreekLemmatizer" -""" - - -@registry.lemmatizers("spacy.el.GreekLemmatizer") -def create_lemmatizer() -> Callable[[Language], GreekLemmatizer]: - tables = ["lemma_index", "lemma_exc", "lemma_rules"] - - def lemmatizer_factory(nlp: Language) -> GreekLemmatizer: - lookups = load_lookups(lang=nlp.lang, tables=tables) - return GreekLemmatizer(lookups=lookups) - - return lemmatizer_factory class GreekDefaults(Language.Defaults): - config = Config().from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES suffixes = TOKENIZER_SUFFIXES @@ -47,4 +26,22 @@ class Greek(Language): Defaults = GreekDefaults +@Greek.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "rule", "lookups": None}, + scores=["lemma_acc"], + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + lookups: Optional[Lookups], +): + lookups = GreekLemmatizer.load_lookups(nlp.lang, mode, lookups) + return GreekLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) + + __all__ = ["Greek"] diff --git a/spacy/lang/el/lemmatizer.py b/spacy/lang/el/lemmatizer.py index 809a23485..a049601dc 100644 --- a/spacy/lang/el/lemmatizer.py +++ b/spacy/lang/el/lemmatizer.py @@ -1,6 +1,7 @@ -from typing import Dict, List +from typing import List -from ...lemmatizer import Lemmatizer +from ...pipeline import Lemmatizer +from 
...tokens import Token class GreekLemmatizer(Lemmatizer): @@ -14,13 +15,27 @@ class GreekLemmatizer(Lemmatizer): not applicable for Greek language. """ - def lemmatize( - self, - string: str, - index: Dict[str, List[str]], - exceptions: Dict[str, Dict[str, List[str]]], - rules: Dict[str, List[List[str]]], - ) -> List[str]: + def rule_lemmatize(self, token: Token) -> List[str]: + """Lemmatize using a rule-based approach. + + token (Token): The token to lemmatize. + RETURNS (list): The available lemmas for the string. + """ + cache_key = (token.lower, token.pos) + if cache_key in self.cache: + return self.cache[cache_key] + string = token.text + univ_pos = token.pos_.lower() + if univ_pos in ("", "eol", "space"): + return [string.lower()] + + index_table = self.lookups.get_table("lemma_index", {}) + exc_table = self.lookups.get_table("lemma_exc", {}) + rules_table = self.lookups.get_table("lemma_rules", {}) + index = index_table.get(univ_pos, {}) + exceptions = exc_table.get(univ_pos, {}) + rules = rules_table.get(univ_pos, {}) + string = string.lower() forms = [] if string in index: @@ -42,4 +57,6 @@ class GreekLemmatizer(Lemmatizer): forms.extend(oov_forms) if not forms: forms.append(string) - return list(set(forms)) + forms = list(set(forms)) + self.cache[cache_key] = forms + return forms diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index 81200da27..1a595b6e7 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -1,39 +1,18 @@ -from typing import Callable -from thinc.api import Config +from typing import Optional + +from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .syntax_iterators import SYNTAX_ITERATORS -from .lemmatizer import is_base_form from .punctuation import TOKENIZER_INFIXES +from .lemmatizer import EnglishLemmatizer from ...language import Language -from ...lemmatizer import Lemmatizer -from ...lookups import load_lookups -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] - -[nlp.lemmatizer] -@lemmatizers = "spacy.en.EnglishLemmatizer" -""" - - -@registry.lemmatizers("spacy.en.EnglishLemmatizer") -def create_lemmatizer() -> Callable[[Language], Lemmatizer]: - tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"] - - def lemmatizer_factory(nlp: Language) -> Lemmatizer: - lookups = load_lookups(lang=nlp.lang, tables=tables) - return Lemmatizer(lookups=lookups, is_base_form=is_base_form) - - return lemmatizer_factory +from ...lookups import Lookups class EnglishDefaults(Language.Defaults): - config = Config().from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES lex_attr_getters = LEX_ATTRS @@ -46,4 +25,22 @@ class English(Language): Defaults = EnglishDefaults +@English.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "rule", "lookups": None}, + scores=["lemma_acc"], + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + lookups: Optional[Lookups], +): + lookups = EnglishLemmatizer.load_lookups(nlp.lang, mode, lookups) + return EnglishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) + + __all__ = ["English"] diff --git a/spacy/lang/en/lemmatizer.py b/spacy/lang/en/lemmatizer.py index 6d5db9e1e..b8bef39b9 100644 --- a/spacy/lang/en/lemmatizer.py +++ b/spacy/lang/en/lemmatizer.py @@ -1,36 +1,43 @@ from typing import Optional 
+from ...pipeline import Lemmatizer +from ...tokens import Token -def is_base_form(univ_pos: str, morphology: Optional[dict] = None) -> bool: - """ - Check whether we're dealing with an uninflected paradigm, so we can - avoid lemmatization entirely. - univ_pos (unicode / int): The token's universal part-of-speech tag. - morphology (dict): The token's morphological features following the - Universal Dependencies scheme. +class EnglishLemmatizer(Lemmatizer): + """English lemmatizer. Only overrides is_base_form. """ - if morphology is None: - morphology = {} - if univ_pos == "noun" and morphology.get("Number") == "sing": - return True - elif univ_pos == "verb" and morphology.get("VerbForm") == "inf": - return True - # This maps 'VBP' to base form -- probably just need 'IS_BASE' - # morphology - elif univ_pos == "verb" and ( - morphology.get("VerbForm") == "fin" - and morphology.get("Tense") == "pres" - and morphology.get("Number") is None - ): - return True - elif univ_pos == "adj" and morphology.get("Degree") == "pos": - return True - elif morphology.get("VerbForm") == "inf": - return True - elif morphology.get("VerbForm") == "none": - return True - elif morphology.get("Degree") == "pos": - return True - else: - return False + + def is_base_form(self, token: Token) -> bool: + """ + Check whether we're dealing with an uninflected paradigm, so we can + avoid lemmatization entirely. + + univ_pos (unicode / int): The token's universal part-of-speech tag. + morphology (dict): The token's morphological features following the + Universal Dependencies scheme. + """ + univ_pos = token.pos_.lower() + morphology = token.morph.to_dict() + if univ_pos == "noun" and morphology.get("Number") == "Sing": + return True + elif univ_pos == "verb" and morphology.get("VerbForm") == "Inf": + return True + # This maps 'VBP' to base form -- probably just need 'IS_BASE' + # morphology + elif univ_pos == "verb" and ( + morphology.get("VerbForm") == "Fin" + and morphology.get("Tense") == "Pres" + and morphology.get("Number") is None + ): + return True + elif univ_pos == "adj" and morphology.get("Degree") == "Pos": + return True + elif morphology.get("VerbForm") == "Inf": + return True + elif morphology.get("VerbForm") == "None": + return True + elif morphology.get("Degree") == "Pos": + return True + else: + return False diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py index a5350d422..42241cd8a 100644 --- a/spacy/lang/fr/__init__.py +++ b/spacy/lang/fr/__init__.py @@ -1,5 +1,6 @@ -from typing import Callable -from thinc.api import Config +from typing import Optional + +from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES @@ -7,33 +8,12 @@ from .punctuation import TOKENIZER_SUFFIXES from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .syntax_iterators import SYNTAX_ITERATORS -from .lemmatizer import FrenchLemmatizer, is_base_form -from ...lookups import load_lookups +from .lemmatizer import FrenchLemmatizer +from ...lookups import Lookups from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] - -[nlp.lemmatizer] -@lemmatizers = "spacy.fr.FrenchLemmatizer" -""" - - -@registry.lemmatizers("spacy.fr.FrenchLemmatizer") -def create_lemmatizer() -> Callable[[Language], FrenchLemmatizer]: - tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"] - - def lemmatizer_factory(nlp: Language) -> FrenchLemmatizer: - lookups = 
load_lookups(lang=nlp.lang, tables=tables) - return FrenchLemmatizer(lookups=lookups, is_base_form=is_base_form) - - return lemmatizer_factory class FrenchDefaults(Language.Defaults): - config = Config().from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES @@ -49,4 +29,22 @@ class French(Language): Defaults = FrenchDefaults +@French.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "rule", "lookups": None}, + scores=["lemma_acc"], + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + lookups: Optional[Lookups], +): + lookups = FrenchLemmatizer.load_lookups(nlp.lang, mode, lookups) + return FrenchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) + + __all__ = ["French"] diff --git a/spacy/lang/fr/lemmatizer.py b/spacy/lang/fr/lemmatizer.py index e46ec1682..0dd782cc4 100644 --- a/spacy/lang/fr/lemmatizer.py +++ b/spacy/lang/fr/lemmatizer.py @@ -1,8 +1,7 @@ -from typing import Optional, List, Dict +from typing import List, Dict -from ...lemmatizer import Lemmatizer -from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP -from ...symbols import SCONJ, CCONJ +from ...pipeline import Lemmatizer +from ...tokens import Token class FrenchLemmatizer(Lemmatizer): @@ -15,65 +14,55 @@ class FrenchLemmatizer(Lemmatizer): the lookup table. """ - def __call__( - self, string: str, univ_pos: str, morphology: Optional[dict] = None - ) -> List[str]: - lookup_table = self.lookups.get_table("lemma_lookup", {}) - if "lemma_rules" not in self.lookups: - return [lookup_table.get(string, string)] - if univ_pos in (NOUN, "NOUN", "noun"): - univ_pos = "noun" - elif univ_pos in (VERB, "VERB", "verb"): - univ_pos = "verb" - elif univ_pos in (ADJ, "ADJ", "adj"): - univ_pos = "adj" - elif univ_pos in (ADP, "ADP", "adp"): - univ_pos = "adp" - elif univ_pos in (ADV, "ADV", "adv"): - univ_pos = "adv" - elif univ_pos in (AUX, "AUX", "aux"): - univ_pos = "aux" - elif univ_pos in (CCONJ, "CCONJ", "cconj"): - univ_pos = "cconj" - elif univ_pos in (DET, "DET", "det"): - univ_pos = "det" - elif univ_pos in (PRON, "PRON", "pron"): - univ_pos = "pron" - elif univ_pos in (PUNCT, "PUNCT", "punct"): - univ_pos = "punct" - elif univ_pos in (SCONJ, "SCONJ", "sconj"): - univ_pos = "sconj" + @classmethod + def get_lookups_config(cls, mode: str) -> Dict: + if mode == "rule": + return { + "required_tables": [ + "lemma_lookup", + "lemma_rules", + "lemma_exc", + "lemma_index", + ], + "optional_tables": [], + } else: - return [self.lookup(string)] + return super().get_lookups_config(mode) + + def rule_lemmatize(self, token: Token) -> List[str]: + cache_key = (token.orth, token.pos) + if cache_key in self.cache: + return self.cache[cache_key] + string = token.text + univ_pos = token.pos_.lower() + if univ_pos in ("", "eol", "space"): + return [string.lower()] + elif "lemma_rules" not in self.lookups or univ_pos not in ( + "noun", + "verb", + "adj", + "adp", + "adv", + "aux", + "cconj", + "det", + "pron", + "punct", + "sconj", + ): + return self.lookup_lemmatize(token) index_table = self.lookups.get_table("lemma_index", {}) exc_table = self.lookups.get_table("lemma_exc", {}) rules_table = self.lookups.get_table("lemma_rules", {}) - lemmas = self.lemmatize( - string, - index_table.get(univ_pos, {}), - exc_table.get(univ_pos, {}), - rules_table.get(univ_pos, []), - ) - return lemmas - - def lookup(self, string: 
str, orth: Optional[int] = None) -> str: - lookup_table = self.lookups.get_table("lemma_lookup", {}) - if orth is not None and orth in lookup_table: - return lookup_table[orth][0] - return string - - def lemmatize( - self, - string: str, - index: Dict[str, List[str]], - exceptions: Dict[str, Dict[str, List[str]]], - rules: Dict[str, List[List[str]]], - ) -> List[str]: lookup_table = self.lookups.get_table("lemma_lookup", {}) + index = index_table.get(univ_pos, {}) + exceptions = exc_table.get(univ_pos, {}) + rules = rules_table.get(univ_pos, []) string = string.lower() forms = [] if string in index: forms.append(string) + self.cache[cache_key] = forms return forms forms.extend(exceptions.get(string, [])) oov_forms = [] @@ -90,45 +79,9 @@ class FrenchLemmatizer(Lemmatizer): if not forms: forms.extend(oov_forms) if not forms and string in lookup_table.keys(): - forms.append(lookup_table[string][0]) + forms.append(self.lookup_lemmatize(token)[0]) if not forms: forms.append(string) - return list(set(forms)) - - -def is_base_form(univ_pos: str, morphology: Optional[dict] = None) -> bool: - """ - Check whether we're dealing with an uninflected paradigm, so we can - avoid lemmatization entirely. - """ - morphology = {} if morphology is None else morphology - others = [ - key - for key in morphology - if key not in (POS, "Number", "POS", "VerbForm", "Tense") - ] - if univ_pos == "noun" and morphology.get("Number") == "sing": - return True - elif univ_pos == "verb" and morphology.get("VerbForm") == "inf": - return True - # This maps 'VBP' to base form -- probably just need 'IS_BASE' - # morphology - elif univ_pos == "verb" and ( - morphology.get("VerbForm") == "fin" - and morphology.get("Tense") == "pres" - and morphology.get("Number") is None - and not others - ): - return True - elif univ_pos == "adj" and morphology.get("Degree") == "pos": - return True - elif "VerbForm=inf" in morphology: - return True - elif "VerbForm=none" in morphology: - return True - elif "Number=sing" in morphology: - return True - elif "Degree=pos" in morphology: - return True - else: - return False + forms = list(set(forms)) + self.cache[cache_key] = forms + return forms diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 900db4e4c..051415455 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -38,8 +38,6 @@ def create_tokenizer(split_mode: Optional[str] = None): class JapaneseTokenizer(DummyTokenizer): def __init__(self, nlp: Language, split_mode: Optional[str] = None) -> None: self.vocab = nlp.vocab - # TODO: is this the right way to do it? - self.vocab.morphology.load_tag_map(TAG_MAP) self.split_mode = split_mode self.tokenizer = try_sudachi_import(self.split_mode) diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index f2954f461..47a3887a6 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -7,6 +7,7 @@ from .lex_attrs import LEX_ATTRS from ...language import Language from ...tokens import Doc from ...compat import copy_reg +from ...symbols import POS from ...util import DummyTokenizer, registry @@ -29,8 +30,6 @@ def create_tokenizer(): class KoreanTokenizer(DummyTokenizer): def __init__(self, nlp: Optional[Language] = None): self.vocab = nlp.vocab - # TODO: is this the right way to do it? 
- self.vocab.morphology.load_tag_map(TAG_MAP) MeCab = try_mecab_import() self.mecab_tokenizer = MeCab("-F%f[0],%f[7]") @@ -44,6 +43,7 @@ class KoreanTokenizer(DummyTokenizer): for token, dtoken in zip(doc, dtokens): first_tag, sep, eomi_tags = dtoken["tag"].partition("+") token.tag_ = first_tag # stem(어간) or pre-final(선어말 어미) + token.pos = TAG_MAP[token.tag_][POS] token.lemma_ = dtoken["lemma"] doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens] return doc diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py index d874ef7a1..1526e41f5 100644 --- a/spacy/lang/nl/__init__.py +++ b/spacy/lang/nl/__init__.py @@ -1,5 +1,6 @@ -from typing import Callable -from thinc.api import Config +from typing import Optional + +from thinc.api import Model from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS @@ -7,32 +8,11 @@ from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_SUFFIXES from .lemmatizer import DutchLemmatizer -from ...lookups import load_lookups +from ...lookups import Lookups from ...language import Language -from ...util import registry - - -DEFAULT_CONFIG = """ -[nlp] - -[nlp.lemmatizer] -@lemmatizers = "spacy.nl.DutchLemmatizer" -""" - - -@registry.lemmatizers("spacy.nl.DutchLemmatizer") -def create_lemmatizer() -> Callable[[Language], DutchLemmatizer]: - tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"] - - def lemmatizer_factory(nlp: Language) -> DutchLemmatizer: - lookups = load_lookups(lang=nlp.lang, tables=tables) - return DutchLemmatizer(lookups=lookups) - - return lemmatizer_factory class DutchDefaults(Language.Defaults): - config = Config().from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES @@ -46,4 +26,22 @@ class Dutch(Language): Defaults = DutchDefaults +@Dutch.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "rule", "lookups": None}, + scores=["lemma_acc"], + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + lookups: Optional[Lookups], +): + lookups = DutchLemmatizer.load_lookups(nlp.lang, mode, lookups) + return DutchLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) + + __all__ = ["Dutch"] diff --git a/spacy/lang/nl/lemmatizer.py b/spacy/lang/nl/lemmatizer.py index b01debaa9..42b97a862 100644 --- a/spacy/lang/nl/lemmatizer.py +++ b/spacy/lang/nl/lemmatizer.py @@ -1,44 +1,34 @@ -from typing import Optional, List, Dict, Tuple +from typing import List, Dict -from ...lemmatizer import Lemmatizer -from ...symbols import NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV +from ...pipeline import Lemmatizer +from ...tokens import Token class DutchLemmatizer(Lemmatizer): - # Note: CGN does not distinguish AUX verbs, so we treat AUX as VERB. 
- univ_pos_name_variants = { - NOUN: "noun", - "NOUN": "noun", - "noun": "noun", - VERB: "verb", - "VERB": "verb", - "verb": "verb", - AUX: "verb", - "AUX": "verb", - "aux": "verb", - ADJ: "adj", - "ADJ": "adj", - "adj": "adj", - ADV: "adv", - "ADV": "adv", - "adv": "adv", - PRON: "pron", - "PRON": "pron", - "pron": "pron", - DET: "det", - "DET": "det", - "det": "det", - ADP: "adp", - "ADP": "adp", - "adp": "adp", - NUM: "num", - "NUM": "num", - "num": "num", - } + @classmethod + def get_lookups_config(cls, mode: str) -> Dict: + if mode == "rule": + return { + "required_tables": [ + "lemma_lookup", + "lemma_rules", + "lemma_exc", + "lemma_index", + ], + } + else: + return super().get_lookups_config(mode) - def __call__( - self, string: str, univ_pos: str, morphology: Optional[dict] = None - ) -> List[str]: + def lookup_lemmatize(self, token: Token) -> List[str]: + """Overrides parent method so that a lowercased version of the string + is used to search the lookup table. This is necessary because our + lookup table consists entirely of lowercase keys.""" + lookup_table = self.lookups.get_table("lemma_lookup", {}) + string = token.text.lower() + return [lookup_table.get(string, string)] + + # Note: CGN does not distinguish AUX verbs, so we treat AUX as VERB. + def rule_lemmatize(self, token: Token) -> List[str]: # Difference 1: self.rules is assumed to be non-None, so no # 'is None' check required. # String lowercased from the get-go. All lemmatization results in @@ -46,74 +36,61 @@ class DutchLemmatizer(Lemmatizer): # any problems, and it keeps the exceptions indexes small. If this # creates problems for proper nouns, we can introduce a check for # univ_pos == "PROPN". - string = string.lower() - try: - univ_pos = self.univ_pos_name_variants[univ_pos] - except KeyError: - # Because PROPN not in self.univ_pos_name_variants, proper names - # are not lemmatized. They are lowercased, however. - return [string] - # if string in self.lemma_index.get(univ_pos) + cache_key = (token.lower, token.pos) + if cache_key in self.cache: + return self.cache[cache_key] + string = token.text + univ_pos = token.pos_.lower() + if univ_pos in ("", "eol", "space"): + forms = [string.lower()] + self.cache[cache_key] = forms + return forms + index_table = self.lookups.get_table("lemma_index", {}) + exc_table = self.lookups.get_table("lemma_exc", {}) + rules_table = self.lookups.get_table("lemma_rules", {}) + index = index_table.get(univ_pos, {}) + exceptions = exc_table.get(univ_pos, {}) + rules = rules_table.get(univ_pos, {}) + + string = string.lower() + if univ_pos not in ( + "noun", + "verb", + "aux", + "adj", + "adv", + "pron", + "det", + "adp", + "num", + ): + forms = [string] + self.cache[cache_key] = forms + return forms lemma_index = index_table.get(univ_pos, {}) # string is already lemma if string in lemma_index: - return [string] + forms = [string] + self.cache[cache_key] = forms + return forms exc_table = self.lookups.get_table("lemma_exc", {}) exceptions = exc_table.get(univ_pos, {}) # string is irregular token contained in exceptions index. 
try: - lemma = exceptions[string] - return [lemma[0]] + forms = [exceptions[string][0]] + self.cache[cache_key] = forms + return forms except KeyError: pass # string corresponds to key in lookup table lookup_table = self.lookups.get_table("lemma_lookup", {}) looked_up_lemma = lookup_table.get(string) if looked_up_lemma and looked_up_lemma in lemma_index: - return [looked_up_lemma] + forms = [looked_up_lemma] + self.cache[cache_key] = forms + return forms rules_table = self.lookups.get_table("lemma_rules", {}) - forms, is_known = self.lemmatize( - string, lemma_index, exceptions, rules_table.get(univ_pos, []) - ) - # Back-off through remaining return value candidates. - if forms: - if is_known: - return forms - else: - for form in forms: - if form in exceptions: - return [form] - if looked_up_lemma: - return [looked_up_lemma] - else: - return forms - elif looked_up_lemma: - return [looked_up_lemma] - else: - return [string] - - # Overrides parent method so that a lowercased version of the string is - # used to search the lookup table. This is necessary because our lookup - # table consists entirely of lowercase keys. - def lookup(self, string: str, orth: Optional[int] = None) -> str: - lookup_table = self.lookups.get_table("lemma_lookup", {}) - string = string.lower() - if orth is not None: - return lookup_table.get(orth, string) - else: - return lookup_table.get(string, string) - - # Reimplemented to focus more on application of suffix rules and to return - # as early as possible. - def lemmatize( - self, - string: str, - index: Dict[str, List[str]], - exceptions: Dict[str, Dict[str, List[str]]], - rules: Dict[str, List[List[str]]], - ) -> Tuple[List[str], bool]: - # returns (forms, is_known: bool) oov_forms = [] for old, new in rules: if string.endswith(old): @@ -121,7 +98,31 @@ class DutchLemmatizer(Lemmatizer): if not form: pass elif form in index: - return [form], True # True = Is known (is lemma) + forms = [form] + self.cache[cache_key] = forms + return forms else: oov_forms.append(form) - return list(set(oov_forms)), False + forms = list(set(oov_forms)) + # Back-off through remaining return value candidates. 
+ if forms: + for form in forms: + if form in exceptions: + forms = [form] + self.cache[cache_key] = forms + return forms + if looked_up_lemma: + forms = [looked_up_lemma] + self.cache[cache_key] = forms + return forms + else: + self.cache[cache_key] = forms + return forms + elif looked_up_lemma: + forms = [looked_up_lemma] + self.cache[cache_key] = forms + return forms + else: + forms = [string] + self.cache[cache_key] = forms + return forms diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py index 2393f1aea..a180fa6e9 100644 --- a/spacy/lang/pl/__init__.py +++ b/spacy/lang/pl/__init__.py @@ -1,5 +1,6 @@ -from typing import Callable -from thinc.api import Config +from typing import Optional + +from thinc.api import Model from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .punctuation import TOKENIZER_SUFFIXES @@ -7,42 +8,16 @@ from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .lemmatizer import PolishLemmatizer from ..tokenizer_exceptions import BASE_EXCEPTIONS -from ...lookups import load_lookups +from ...lookups import Lookups from ...language import Language -from ...util import registry -DEFAULT_CONFIG = """ -[nlp] - -[nlp.lemmatizer] -@lemmatizers = "spacy.pl.PolishLemmatizer" -""" - TOKENIZER_EXCEPTIONS = { exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".") } -@registry.lemmatizers("spacy.pl.PolishLemmatizer") -def create_lemmatizer() -> Callable[[Language], PolishLemmatizer]: - # fmt: off - tables = [ - "lemma_lookup_adj", "lemma_lookup_adp", "lemma_lookup_adv", - "lemma_lookup_aux", "lemma_lookup_noun", "lemma_lookup_num", - "lemma_lookup_part", "lemma_lookup_pron", "lemma_lookup_verb" - ] - # fmt: on - - def lemmatizer_factory(nlp: Language) -> PolishLemmatizer: - lookups = load_lookups(lang=nlp.lang, tables=tables) - return PolishLemmatizer(lookups=lookups) - - return lemmatizer_factory - - class PolishDefaults(Language.Defaults): - config = Config().from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS prefixes = TOKENIZER_PREFIXES infixes = TOKENIZER_INFIXES @@ -56,4 +31,22 @@ class Polish(Language): Defaults = PolishDefaults +@Polish.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "lookup", "lookups": None}, + scores=["lemma_acc"], + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + lookups: Optional[Lookups], +): + lookups = PolishLemmatizer.load_lookups(nlp.lang, mode, lookups) + return PolishLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) + + __all__ = ["Polish"] diff --git a/spacy/lang/pl/lemmatizer.py b/spacy/lang/pl/lemmatizer.py index 8e96dd75b..c4c6db06a 100644 --- a/spacy/lang/pl/lemmatizer.py +++ b/spacy/lang/pl/lemmatizer.py @@ -1,7 +1,7 @@ -from typing import Optional, List, Dict +from typing import List, Dict -from ...lemmatizer import Lemmatizer -from ...parts_of_speech import NAMES +from ...pipeline import Lemmatizer +from ...tokens import Token class PolishLemmatizer(Lemmatizer): @@ -9,12 +9,30 @@ class PolishLemmatizer(Lemmatizer): # dictionary (morfeusz.sgjp.pl/en) by Institute of Computer Science PAS. # It utilizes some prefix based improvements for verb and adjectives # lemmatization, as well as case-sensitive lemmatization for nouns. 
- def __call__( - self, string: str, univ_pos: str, morphology: Optional[dict] = None - ) -> List[str]: - if isinstance(univ_pos, int): - univ_pos = NAMES.get(univ_pos, "X") - univ_pos = univ_pos.upper() + + @classmethod + def get_lookups_config(cls, mode: str) -> Dict: + if mode == "lookup": + return { + "required_tables": [ + "lemma_lookup_adj", + "lemma_lookup_adp", + "lemma_lookup_adv", + "lemma_lookup_aux", + "lemma_lookup_noun", + "lemma_lookup_num", + "lemma_lookup_part", + "lemma_lookup_pron", + "lemma_lookup_verb", + ] + } + else: + return super().get_lookups_config(mode) + + def lookup_lemmatize(self, token: Token) -> List[str]: + string = token.text + univ_pos = token.pos_ + morphology = token.morph.to_dict() lookup_pos = univ_pos.lower() if univ_pos == "PROPN": lookup_pos = "noun" @@ -71,15 +89,3 @@ class PolishLemmatizer(Lemmatizer): return [lookup_table[string]] return [string.lower()] return [lookup_table.get(string, string)] - - def lookup(self, string: str, orth: Optional[int] = None) -> str: - return string.lower() - - def lemmatize( - self, - string: str, - index: Dict[str, List[str]], - exceptions: Dict[str, Dict[str, List[str]]], - rules: Dict[str, List[List[str]]], - ) -> List[str]: - raise NotImplementedError diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py index 5d2333edf..be770e3ec 100644 --- a/spacy/lang/ru/__init__.py +++ b/spacy/lang/ru/__init__.py @@ -1,32 +1,16 @@ -from typing import Callable -from thinc.api import Config +from typing import Optional + +from thinc.api import Model from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .lex_attrs import LEX_ATTRS from .lemmatizer import RussianLemmatizer -from ...util import registry from ...language import Language - - -DEFAULT_CONFIG = """ -[nlp] - -[nlp.lemmatizer] -@lemmatizers = "spacy.ru.RussianLemmatizer" -""" - - -@registry.lemmatizers("spacy.ru.RussianLemmatizer") -def create_lemmatizer() -> Callable[[Language], RussianLemmatizer]: - def lemmatizer_factory(nlp: Language) -> RussianLemmatizer: - return RussianLemmatizer() - - return lemmatizer_factory +from ...lookups import Lookups class RussianDefaults(Language.Defaults): - config = Config().from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS @@ -37,4 +21,21 @@ class Russian(Language): Defaults = RussianDefaults +@Russian.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "pymorphy2", "lookups": None}, + scores=["lemma_acc"], + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + lookups: Optional[Lookups], +): + return RussianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) + + __all__ = ["Russian"] diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py index 28767348d..8d7996c63 100644 --- a/spacy/lang/ru/lemmatizer.py +++ b/spacy/lang/ru/lemmatizer.py @@ -1,8 +1,12 @@ -from typing import Optional, Tuple, Dict, List +from typing import Optional, List, Dict, Tuple + +from thinc.api import Model -from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS -from ...lemmatizer import Lemmatizer from ...lookups import Lookups +from ...pipeline import Lemmatizer +from ...symbols import POS +from ...tokens import Token +from ...vocab import Vocab PUNCT_RULES = {"«": '"', "»": '"'} @@ -11,8 +15,17 @@ PUNCT_RULES = {"«": '"', "»": '"'} class 
RussianLemmatizer(Lemmatizer): _morph = None - def __init__(self, lookups: Optional[Lookups] = None) -> None: - super(RussianLemmatizer, self).__init__(lookups) + def __init__( + self, + vocab: Vocab, + model: Optional[Model], + name: str = "lemmatizer", + *, + mode: str = "pymorphy2", + lookups: Optional[Lookups] = None, + ) -> None: + super().__init__(vocab, model, name, mode=mode, lookups=lookups) + try: from pymorphy2 import MorphAnalyzer except ImportError: @@ -25,10 +38,10 @@ class RussianLemmatizer(Lemmatizer): if RussianLemmatizer._morph is None: RussianLemmatizer._morph = MorphAnalyzer() - def __call__( - self, string: str, univ_pos: str, morphology: Optional[dict] = None - ) -> List[str]: - univ_pos = self.normalize_univ_pos(univ_pos) + def pymorphy2_lemmatize(self, token: Token) -> List[str]: + string = token.text + univ_pos = token.pos_ + morphology = token.morph.to_dict() if univ_pos == "PUNCT": return [PUNCT_RULES.get(string, string)] if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"): @@ -81,25 +94,8 @@ class RussianLemmatizer(Lemmatizer): return [string.lower()] return list(set([analysis.normal_form for analysis in filtered_analyses])) - @staticmethod - def normalize_univ_pos(univ_pos: str) -> Optional[str]: - if isinstance(univ_pos, str): - return univ_pos.upper() - symbols_to_str = { - ADJ: "ADJ", - DET: "DET", - NOUN: "NOUN", - NUM: "NUM", - PRON: "PRON", - PROPN: "PROPN", - PUNCT: "PUNCT", - VERB: "VERB", - } - if univ_pos in symbols_to_str: - return symbols_to_str[univ_pos] - return None - - def lookup(self, string: str, orth: Optional[int] = None) -> str: + def lookup_lemmatize(self, token: Token) -> List[str]: + string = token.text analyses = self._morph.parse(string) if len(analyses) == 1: return analyses[0].normal_form diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py index 6b44a7144..e9936cf7d 100644 --- a/spacy/lang/uk/__init__.py +++ b/spacy/lang/uk/__init__.py @@ -1,32 +1,16 @@ -from typing import Callable -from thinc.api import Config +from typing import Optional + +from thinc.api import Model from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS -from ...util import registry -from ...language import Language from .lemmatizer import UkrainianLemmatizer - - -DEFAULT_CONFIG = """ -[nlp] - -[nlp.lemmatizer] -@lemmatizers = "spacy.uk.UkrainianLemmatizer" -""" - - -@registry.lemmatizers("spacy.uk.UkrainianLemmatizer") -def create_ukrainian_lemmatizer() -> Callable[[Language], UkrainianLemmatizer]: - def lemmatizer_factory(nlp: Language) -> UkrainianLemmatizer: - return UkrainianLemmatizer() - - return lemmatizer_factory +from ...language import Language +from ...lookups import Lookups class UkrainianDefaults(Language.Defaults): - config = Config().from_str(DEFAULT_CONFIG) tokenizer_exceptions = TOKENIZER_EXCEPTIONS lex_attr_getters = LEX_ATTRS stop_words = STOP_WORDS @@ -37,4 +21,21 @@ class Ukrainian(Language): Defaults = UkrainianDefaults +@Ukrainian.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "pymorphy2", "lookups": None}, + scores=["lemma_acc"], + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + lookups: Optional[Lookups], +): + return UkrainianLemmatizer(nlp.vocab, model, name, mode=mode, lookups=lookups) + + __all__ = ["Ukrainian"] diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py index 
cf89d1a12..0d6febce6 100644 --- a/spacy/lang/uk/lemmatizer.py +++ b/spacy/lang/uk/lemmatizer.py @@ -1,187 +1,30 @@ -from typing import Optional, List, Tuple, Dict +from typing import Optional -from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS +from thinc.api import Model + +from ..ru.lemmatizer import RussianLemmatizer from ...lookups import Lookups -from ...lemmatizer import Lemmatizer +from ...vocab import Vocab -PUNCT_RULES = {"«": '"', "»": '"'} - - -class UkrainianLemmatizer(Lemmatizer): - _morph = None - - def __init__(self, lookups: Optional[Lookups] = None) -> None: - super(UkrainianLemmatizer, self).__init__(lookups) +class UkrainianLemmatizer(RussianLemmatizer): + def __init__( + self, + vocab: Vocab, + model: Optional[Model], + name: str = "lemmatizer", + *, + mode: str = "pymorphy2", + lookups: Optional[Lookups] = None, + ) -> None: + super().__init__(vocab, model, name, mode=mode, lookups=lookups) try: from pymorphy2 import MorphAnalyzer - - if UkrainianLemmatizer._morph is None: - UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk") - except (ImportError, TypeError): + except ImportError: raise ImportError( "The Ukrainian lemmatizer requires the pymorphy2 library and " 'dictionaries: try to fix it with "pip uninstall pymorphy2" and' '"pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"' ) from None - - def __call__( - self, string: str, univ_pos: str, morphology: Optional[dict] = None - ) -> List[str]: - univ_pos = self.normalize_univ_pos(univ_pos) - if univ_pos == "PUNCT": - return [PUNCT_RULES.get(string, string)] - if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"): - # Skip unchangeable pos - return [string.lower()] - analyses = self._morph.parse(string) - filtered_analyses = [] - for analysis in analyses: - if not analysis.is_known: - # Skip suggested parse variant for unknown word for pymorphy - continue - analysis_pos, _ = oc2ud(str(analysis.tag)) - if analysis_pos == univ_pos or ( - analysis_pos in ("NOUN", "PROPN") and univ_pos in ("NOUN", "PROPN") - ): - filtered_analyses.append(analysis) - if not len(filtered_analyses): - return [string.lower()] - if morphology is None or (len(morphology) == 1 and POS in morphology): - return list(set([analysis.normal_form for analysis in filtered_analyses])) - if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"): - features_to_compare = ["Case", "Number", "Gender"] - elif univ_pos == "NUM": - features_to_compare = ["Case", "Gender"] - elif univ_pos == "PRON": - features_to_compare = ["Case", "Number", "Gender", "Person"] - else: # VERB - features_to_compare = [ - "Aspect", - "Gender", - "Mood", - "Number", - "Tense", - "VerbForm", - "Voice", - ] - analyses, filtered_analyses = filtered_analyses, [] - for analysis in analyses: - _, analysis_morph = oc2ud(str(analysis.tag)) - for feature in features_to_compare: - if ( - feature in morphology - and feature in analysis_morph - and morphology[feature].lower() != analysis_morph[feature].lower() - ): - break - else: - filtered_analyses.append(analysis) - if not len(filtered_analyses): - return [string.lower()] - return list(set([analysis.normal_form for analysis in filtered_analyses])) - - @staticmethod - def normalize_univ_pos(univ_pos: str) -> Optional[str]: - if isinstance(univ_pos, str): - return univ_pos.upper() - symbols_to_str = { - ADJ: "ADJ", - DET: "DET", - NOUN: "NOUN", - NUM: "NUM", - PRON: "PRON", - PROPN: "PROPN", - PUNCT: "PUNCT", - VERB: "VERB", - } - if univ_pos in symbols_to_str: - return 
symbols_to_str[univ_pos] - return None - - def lookup(self, string: str, orth: Optional[int] = None) -> str: - analyses = self._morph.parse(string) - if len(analyses) == 1: - return analyses[0].normal_form - return string - - -def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]: - gram_map = { - "_POS": { - "ADJF": "ADJ", - "ADJS": "ADJ", - "ADVB": "ADV", - "Apro": "DET", - "COMP": "ADJ", # Can also be an ADV - unchangeable - "CONJ": "CCONJ", # Can also be a SCONJ - both unchangeable ones - "GRND": "VERB", - "INFN": "VERB", - "INTJ": "INTJ", - "NOUN": "NOUN", - "NPRO": "PRON", - "NUMR": "NUM", - "NUMB": "NUM", - "PNCT": "PUNCT", - "PRCL": "PART", - "PREP": "ADP", - "PRTF": "VERB", - "PRTS": "VERB", - "VERB": "VERB", - }, - "Animacy": {"anim": "Anim", "inan": "Inan"}, - "Aspect": {"impf": "Imp", "perf": "Perf"}, - "Case": { - "ablt": "Ins", - "accs": "Acc", - "datv": "Dat", - "gen1": "Gen", - "gen2": "Gen", - "gent": "Gen", - "loc2": "Loc", - "loct": "Loc", - "nomn": "Nom", - "voct": "Voc", - }, - "Degree": {"COMP": "Cmp", "Supr": "Sup"}, - "Gender": {"femn": "Fem", "masc": "Masc", "neut": "Neut"}, - "Mood": {"impr": "Imp", "indc": "Ind"}, - "Number": {"plur": "Plur", "sing": "Sing"}, - "NumForm": {"NUMB": "Digit"}, - "Person": {"1per": "1", "2per": "2", "3per": "3", "excl": "2", "incl": "1"}, - "Tense": {"futr": "Fut", "past": "Past", "pres": "Pres"}, - "Variant": {"ADJS": "Brev", "PRTS": "Brev"}, - "VerbForm": { - "GRND": "Conv", - "INFN": "Inf", - "PRTF": "Part", - "PRTS": "Part", - "VERB": "Fin", - }, - "Voice": {"actv": "Act", "pssv": "Pass"}, - "Abbr": {"Abbr": "Yes"}, - } - pos = "X" - morphology = dict() - unmatched = set() - grams = oc_tag.replace(" ", ",").split(",") - for gram in grams: - match = False - for categ, gmap in sorted(gram_map.items()): - if gram in gmap: - match = True - if categ == "_POS": - pos = gmap[gram] - else: - morphology[categ] = gmap[gram] - if not match: - unmatched.add(gram) - while len(unmatched) > 0: - gram = unmatched.pop() - if gram in ("Name", "Patr", "Surn", "Geox", "Orgn"): - pos = "PROPN" - elif gram == "Auxt": - pos = "AUX" - elif gram == "Pltm": - morphology["Number"] = "Ptan" - return pos, morphology + if UkrainianLemmatizer._morph is None: + UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk") diff --git a/spacy/language.py b/spacy/language.py index 9018af73c..96661915a 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -29,7 +29,6 @@ from .lang.punctuation import TOKENIZER_INFIXES from .tokens import Doc from .lookups import load_lookups from .tokenizer import Tokenizer -from .lemmatizer import Lemmatizer from .errors import Errors, Warnings from .schemas import ConfigSchema from .git_info import GIT_VERSION @@ -87,22 +86,6 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]: return tokenizer_factory -@registry.lemmatizers("spacy.Lemmatizer.v1") -def create_lemmatizer() -> Callable[["Language"], "Lemmatizer"]: - """Registered function to create a lemmatizer. Returns a factory that takes - the nlp object and returns a Lemmatizer instance with data loaded in from - spacy-lookups-data, if the package is installed. - """ - # TODO: Will be replaced when the lemmatizer becomes a pipeline component - tables = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"] - - def lemmatizer_factory(nlp: "Language") -> "Lemmatizer": - lookups = load_lookups(lang=nlp.lang, tables=tables, strict=False) - return Lemmatizer(lookups=lookups) - - return lemmatizer_factory - - class Language: """A text-processing pipeline. 
Usually you'll load this once per process, and pass the instance around your application. @@ -128,7 +111,6 @@ class Language: max_length: int = 10 ** 6, meta: Dict[str, Any] = {}, create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None, - create_lemmatizer: Optional[Callable[["Language"], Callable]] = None, **kwargs, ) -> None: """Initialise a Language object. @@ -146,8 +128,6 @@ class Language: 100,000 characters in one text. create_tokenizer (Callable): Function that takes the nlp object and returns a tokenizer. - create_lemmatizer (Callable): Function that takes the nlp object and - returns a lemmatizer. DOCS: https://spacy.io/api/language#init """ @@ -166,13 +146,9 @@ class Language: if vocab is True: vectors_name = meta.get("vectors", {}).get("name") - if not create_lemmatizer: - lemma_cfg = {"lemmatizer": self._config["nlp"]["lemmatizer"]} - create_lemmatizer = registry.make_from_config(lemma_cfg)["lemmatizer"] vocab = create_vocab( self.lang, self.Defaults, - lemmatizer=create_lemmatizer(self), vectors_name=vectors_name, load_data=self._config["nlp"]["load_vocab_data"], ) @@ -1451,7 +1427,6 @@ class Language: filled["components"] = orig_pipeline config["components"] = orig_pipeline create_tokenizer = resolved["nlp"]["tokenizer"] - create_lemmatizer = resolved["nlp"]["lemmatizer"] before_creation = resolved["nlp"]["before_creation"] after_creation = resolved["nlp"]["after_creation"] after_pipeline_creation = resolved["nlp"]["after_pipeline_creation"] @@ -1467,7 +1442,6 @@ class Language: nlp = lang_cls( vocab=vocab, create_tokenizer=create_tokenizer, - create_lemmatizer=create_lemmatizer, ) if after_creation is not None: nlp = after_creation(nlp) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py deleted file mode 100644 index adba79686..000000000 --- a/spacy/lemmatizer.py +++ /dev/null @@ -1,145 +0,0 @@ -from typing import Optional, Callable, List, Dict - -from .lookups import Lookups -from .parts_of_speech import NAMES as UPOS_NAMES - - -class Lemmatizer: - """ - The Lemmatizer supports simple part-of-speech-sensitive suffix rules and - lookup tables. - - DOCS: https://spacy.io/api/lemmatizer - """ - - def __init__( - self, - lookups: Optional[Lookups] = None, - is_base_form: Optional[Callable] = None, - ) -> None: - """Initialize a Lemmatizer. - - lookups (Lookups): The lookups object containing the (optional) tables - "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup". - """ - self.lookups = lookups if lookups is not None else Lookups() - self.is_base_form = is_base_form - - def __call__( - self, string: str, univ_pos: str, morphology: Optional[dict] = None - ) -> List[str]: - """Lemmatize a string. - - string (str): The string to lemmatize, e.g. the token text. - univ_pos (str / int): The token's universal part-of-speech tag. - morphology (dict): The token's morphological features following the - Universal Dependencies scheme. - RETURNS (list): The available lemmas for the string. - """ - lookup_table = self.lookups.get_table("lemma_lookup", {}) - if "lemma_rules" not in self.lookups: - return [lookup_table.get(string, string)] - if isinstance(univ_pos, int): - univ_pos = UPOS_NAMES.get(univ_pos, "X") - univ_pos = univ_pos.lower() - if univ_pos in ("", "eol", "space"): - return [string.lower()] - # See Issue #435 for example of where this logic is requied. 
- if callable(self.is_base_form) and self.is_base_form(univ_pos, morphology): - return [string.lower()] - index_table = self.lookups.get_table("lemma_index", {}) - exc_table = self.lookups.get_table("lemma_exc", {}) - rules_table = self.lookups.get_table("lemma_rules", {}) - if not any( - ( - index_table.get(univ_pos), - exc_table.get(univ_pos), - rules_table.get(univ_pos), - ) - ): - if univ_pos == "propn": - return [string] - else: - return [string.lower()] - lemmas = self.lemmatize( - string, - index_table.get(univ_pos, {}), - exc_table.get(univ_pos, {}), - rules_table.get(univ_pos, []), - ) - return lemmas - - def noun(self, string: str, morphology: Optional[dict] = None) -> List[str]: - return self(string, "noun", morphology) - - def verb(self, string: str, morphology: Optional[dict] = None) -> List[str]: - return self(string, "verb", morphology) - - def adj(self, string: str, morphology: Optional[dict] = None) -> List[str]: - return self(string, "adj", morphology) - - def det(self, string: str, morphology: Optional[dict] = None) -> List[str]: - return self(string, "det", morphology) - - def pron(self, string: str, morphology: Optional[dict] = None) -> List[str]: - return self(string, "pron", morphology) - - def adp(self, string: str, morphology: Optional[dict] = None) -> List[str]: - return self(string, "adp", morphology) - - def num(self, string: str, morphology: Optional[dict] = None) -> List[str]: - return self(string, "num", morphology) - - def punct(self, string: str, morphology: Optional[dict] = None) -> List[str]: - return self(string, "punct", morphology) - - def lookup(self, string: str, orth: Optional[int] = None) -> str: - """Look up a lemma in the table, if available. If no lemma is found, - the original string is returned. - - string (str): The original string. - orth (int): Optional hash of the string to look up. If not set, the - string will be used and hashed. - RETURNS (str): The lemma if the string was found, otherwise the - original string. - """ - lookup_table = self.lookups.get_table("lemma_lookup", {}) - key = orth if orth is not None else string - if key in lookup_table: - return lookup_table[key] - return string - - def lemmatize( - self, - string: str, - index: Dict[str, List[str]], - exceptions: Dict[str, Dict[str, List[str]]], - rules: Dict[str, List[List[str]]], - ) -> List[str]: - orig = string - string = string.lower() - forms = [] - oov_forms = [] - for old, new in rules: - if string.endswith(old): - form = string[: len(string) - len(old)] + new - if not form: - pass - elif form in index or not form.isalpha(): - forms.append(form) - else: - oov_forms.append(form) - # Remove duplicates but preserve the ordering of applied "rules" - forms = list(dict.fromkeys(forms)) - # Put exceptions at the front of the list, so they get priority. - # This is a dodgy heuristic -- but it's the best we can do until we get - # frequencies on this. We can at least prune out problematic exceptions, - # if they shadow more frequent analyses. - for form in exceptions.get(string, []): - if form not in forms: - forms.insert(0, form) - if not forms: - forms.extend(oov_forms) - if not forms: - forms.append(orig) - return forms diff --git a/spacy/lookups.py b/spacy/lookups.py index 7862b9805..d79a5b950 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -28,6 +28,8 @@ def load_lookups( # TODO: import spacy_lookups_data instead of going via entry points here? 
lookups = Lookups() if lang not in registry.lookups: + if strict and len(tables) > 0: + raise ValueError(Errors.E955.format(table=", ".join(tables), lang=lang)) return lookups data = registry.lookups.get(lang) for table in tables: @@ -41,152 +43,6 @@ def load_lookups( return lookups -class Lookups: - """Container for large lookup tables and dictionaries, e.g. lemmatization - data or tokenizer exception lists. Lookups are available via vocab.lookups, - so they can be accessed before the pipeline components are applied (e.g. - in the tokenizer and lemmatizer), as well as within the pipeline components - via doc.vocab.lookups. - """ - - def __init__(self) -> None: - """Initialize the Lookups object. - - DOCS: https://spacy.io/api/lookups#init - """ - self._tables = {} - - def __contains__(self, name: str) -> bool: - """Check if the lookups contain a table of a given name. Delegates to - Lookups.has_table. - - name (str): Name of the table. - RETURNS (bool): Whether a table of that name is in the lookups. - """ - return self.has_table(name) - - def __len__(self) -> int: - """RETURNS (int): The number of tables in the lookups.""" - return len(self._tables) - - @property - def tables(self) -> List[str]: - """RETURNS (List[str]): Names of all tables in the lookups.""" - return list(self._tables.keys()) - - def add_table(self, name: str, data: dict = SimpleFrozenDict()) -> "Table": - """Add a new table to the lookups. Raises an error if the table exists. - - name (str): Unique name of table. - data (dict): Optional data to add to the table. - RETURNS (Table): The newly added table. - - DOCS: https://spacy.io/api/lookups#add_table - """ - if name in self.tables: - raise ValueError(Errors.E158.format(name=name)) - table = Table(name=name, data=data) - self._tables[name] = table - return table - - def get_table(self, name: str, default: Any = UNSET) -> "Table": - """Get a table. Raises an error if the table doesn't exist and no - default value is provided. - - name (str): Name of the table. - default (Any): Optional default value to return if table doesn't exist. - RETURNS (Table): The table. - - DOCS: https://spacy.io/api/lookups#get_table - """ - if name not in self._tables: - if default == UNSET: - raise KeyError(Errors.E159.format(name=name, tables=self.tables)) - return default - return self._tables[name] - - def remove_table(self, name: str) -> "Table": - """Remove a table. Raises an error if the table doesn't exist. - - name (str): Name of the table to remove. - RETURNS (Table): The removed table. - - DOCS: https://spacy.io/api/lookups#remove_table - """ - if name not in self._tables: - raise KeyError(Errors.E159.format(name=name, tables=self.tables)) - return self._tables.pop(name) - - def has_table(self, name: str) -> bool: - """Check if the lookups contain a table of a given name. - - name (str): Name of the table. - RETURNS (bool): Whether a table of that name exists. - - DOCS: https://spacy.io/api/lookups#has_table - """ - return name in self._tables - - def to_bytes(self, **kwargs) -> bytes: - """Serialize the lookups to a bytestring. - - RETURNS (bytes): The serialized Lookups. - - DOCS: https://spacy.io/api/lookups#to_bytes - """ - return srsly.msgpack_dumps(self._tables) - - def from_bytes(self, bytes_data: bytes, **kwargs) -> "Lookups": - """Load the lookups from a bytestring. - - bytes_data (bytes): The data to load. - RETURNS (Lookups): The loaded Lookups. 
- - DOCS: https://spacy.io/api/lookups#from_bytes - """ - self._tables = {} - for key, value in srsly.msgpack_loads(bytes_data).items(): - self._tables[key] = Table(key, value) - return self - - def to_disk( - self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs - ) -> None: - """Save the lookups to a directory as lookups.bin. Expects a path to a - directory, which will be created if it doesn't exist. - - path (str / Path): The file path. - - DOCS: https://spacy.io/api/lookups#to_disk - """ - if len(self._tables): - path = ensure_path(path) - if not path.exists(): - path.mkdir() - filepath = path / filename - with filepath.open("wb") as file_: - file_.write(self.to_bytes()) - - def from_disk( - self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs - ) -> "Lookups": - """Load lookups from a directory containing a lookups.bin. Will skip - loading if the file doesn't exist. - - path (str / Path): The directory path. - RETURNS (Lookups): The loaded lookups. - - DOCS: https://spacy.io/api/lookups#from_disk - """ - path = ensure_path(path) - filepath = path / filename - if filepath.exists(): - with filepath.open("rb") as file_: - data = file_.read() - return self.from_bytes(data) - return self - - class Table(OrderedDict): """A table in the lookups. Subclass of builtin dict that implements a slightly more consistent and unified API. @@ -303,3 +159,159 @@ class Table(OrderedDict): self.clear() self.update(data) return self + + +class Lookups: + """Container for large lookup tables and dictionaries, e.g. lemmatization + data or tokenizer exception lists. Lookups are available via vocab.lookups, + so they can be accessed before the pipeline components are applied (e.g. + in the tokenizer and lemmatizer), as well as within the pipeline components + via doc.vocab.lookups. + """ + + def __init__(self) -> None: + """Initialize the Lookups object. + + DOCS: https://spacy.io/api/lookups#init + """ + self._tables = {} + + def __contains__(self, name: str) -> bool: + """Check if the lookups contain a table of a given name. Delegates to + Lookups.has_table. + + name (str): Name of the table. + RETURNS (bool): Whether a table of that name is in the lookups. + """ + return self.has_table(name) + + def __len__(self) -> int: + """RETURNS (int): The number of tables in the lookups.""" + return len(self._tables) + + @property + def tables(self) -> List[str]: + """RETURNS (List[str]): Names of all tables in the lookups.""" + return list(self._tables.keys()) + + def add_table(self, name: str, data: dict = SimpleFrozenDict()) -> Table: + """Add a new table to the lookups. Raises an error if the table exists. + + name (str): Unique name of table. + data (dict): Optional data to add to the table. + RETURNS (Table): The newly added table. + + DOCS: https://spacy.io/api/lookups#add_table + """ + if name in self.tables: + raise ValueError(Errors.E158.format(name=name)) + table = Table(name=name, data=data) + self._tables[name] = table + return table + + def set_table(self, name: str, table: Table) -> None: + """Set a table. + + name (str): Name of the table to set. + table (Table): The Table to set. + + DOCS: https://spacy.io/api/lookups#set_table + """ + self._tables[name] = table + + def get_table(self, name: str, default: Any = UNSET) -> Table: + """Get a table. Raises an error if the table doesn't exist and no + default value is provided. + + name (str): Name of the table. + default (Any): Optional default value to return if table doesn't exist. + RETURNS (Table): The table. 
+ + DOCS: https://spacy.io/api/lookups#get_table + """ + if name not in self._tables: + if default == UNSET: + raise KeyError(Errors.E159.format(name=name, tables=self.tables)) + return default + return self._tables[name] + + def remove_table(self, name: str) -> Table: + """Remove a table. Raises an error if the table doesn't exist. + + name (str): Name of the table to remove. + RETURNS (Table): The removed table. + + DOCS: https://spacy.io/api/lookups#remove_table + """ + if name not in self._tables: + raise KeyError(Errors.E159.format(name=name, tables=self.tables)) + return self._tables.pop(name) + + def has_table(self, name: str) -> bool: + """Check if the lookups contain a table of a given name. + + name (str): Name of the table. + RETURNS (bool): Whether a table of that name exists. + + DOCS: https://spacy.io/api/lookups#has_table + """ + return name in self._tables + + def to_bytes(self, **kwargs) -> bytes: + """Serialize the lookups to a bytestring. + + RETURNS (bytes): The serialized Lookups. + + DOCS: https://spacy.io/api/lookups#to_bytes + """ + return srsly.msgpack_dumps(self._tables) + + def from_bytes(self, bytes_data: bytes, **kwargs) -> "Lookups": + """Load the lookups from a bytestring. + + bytes_data (bytes): The data to load. + RETURNS (Lookups): The loaded Lookups. + + DOCS: https://spacy.io/api/lookups#from_bytes + """ + self._tables = {} + for key, value in srsly.msgpack_loads(bytes_data).items(): + self._tables[key] = Table(key, value) + return self + + def to_disk( + self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs + ) -> None: + """Save the lookups to a directory as lookups.bin. Expects a path to a + directory, which will be created if it doesn't exist. + + path (str / Path): The file path. + + DOCS: https://spacy.io/api/lookups#to_disk + """ + if len(self._tables): + path = ensure_path(path) + if not path.exists(): + path.mkdir() + filepath = path / filename + with filepath.open("wb") as file_: + file_.write(self.to_bytes()) + + def from_disk( + self, path: Union[str, Path], filename: str = "lookups.bin", **kwargs + ) -> "Lookups": + """Load lookups from a directory containing a lookups.bin. Will skip + loading if the file doesn't exist. + + path (str / Path): The directory path. + RETURNS (Lookups): The loaded lookups. 
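# A short sketch of the relocated Lookups/Table API, including the new
# set_table, which installs an existing Table rather than building one from a
# dict the way add_table does. Table names and contents are illustrative.
from spacy.lookups import Lookups, Table

lookups = Lookups()
lookups.add_table("lemma_lookup", {"dogs": "dog"})  # builds a Table from a dict
lookups.set_table("lemma_exc", Table("lemma_exc", {"verb": {"coping": ["cope"]}}))
assert lookups.tables == ["lemma_lookup", "lemma_exc"]
assert lookups.get_table("lemma_lookup")["dogs"] == "dog"

# A bytes round trip keeps both tables.
reloaded = Lookups().from_bytes(lookups.to_bytes())
assert reloaded.has_table("lemma_exc")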
+ + DOCS: https://spacy.io/api/lookups#from_disk + """ + path = ensure_path(path) + filepath = path / filename + if filepath.exists(): + with filepath.open("rb") as file_: + data = file_.read() + return self.from_bytes(data) + return self diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 3dec1bc70..4fe8f7428 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -27,12 +27,6 @@ cdef class Morphology: cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except * cdef int insert(self, MorphAnalysisC tag) except -1 - cdef int assign_untagged(self, TokenC* token) except -1 - cdef int assign_tag(self, TokenC* token, tag) except -1 - cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1 - - cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1 - cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil cdef list list_features(const MorphAnalysisC* morph) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index b2ba32a59..fcfe216ba 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -31,43 +31,15 @@ cdef class Morphology: VALUE_SEP = "," EMPTY_MORPH = "_" # not an empty string so that the PreshMap key is not 0 - def __init__(self, StringStore strings, tag_map, lemmatizer, exc=None): + def __init__(self, StringStore strings): self.mem = Pool() self.strings = strings self.tags = PreshMap() - self.load_tag_map(tag_map) - self.lemmatizer = lemmatizer - - self._cache = PreshMapArray(self.n_tags) - self._exc = {} - if exc is not None: - self.load_morph_exceptions(exc) - - def load_tag_map(self, tag_map): - self.tag_map = {} - self.reverse_index = {} - # Add special space symbol. We prefix with underscore, to make sure it - # always sorts to the end. - if '_SP' in tag_map: - space_attrs = tag_map.get('_SP') - else: - space_attrs = tag_map.get('SP', {POS: SPACE}) - if '_SP' not in tag_map: - self.strings.add('_SP') - tag_map = dict(tag_map) - tag_map['_SP'] = space_attrs - for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): - attrs = self.normalize_attrs(attrs) - self.add(attrs) - self.tag_map[tag_str] = dict(attrs) - self.reverse_index[self.strings.add(tag_str)] = i - self.tag_names = tuple(sorted(self.tag_map.keys())) - self.n_tags = len(self.tag_map) - self._cache = PreshMapArray(self.n_tags) def __reduce__(self): - return (Morphology, (self.strings, self.tag_map, self.lemmatizer, - self.exc), None, None) + tags = set([self.get(self.strings[s]) for s in self.strings]) + tags -= set([""]) + return (unpickle_morphology, (self.strings, sorted(tags)), None, None) def add(self, features): """Insert a morphological analysis in the morphology table, if not @@ -185,115 +157,6 @@ cdef class Morphology: else: return self.strings[tag.key] - def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology): - if orth not in self.strings: - return orth - cdef unicode py_string = self.strings[orth] - if self.lemmatizer is None: - return self.strings.add(py_string.lower()) - cdef list lemma_strings - cdef unicode lemma_string - # Normalize features into a dict keyed by the field, to make life easier - # for the lemmatizer. Handles string-to-int conversion too. 
- string_feats = {} - for key, value in morphology.items(): - if value is True: - name, value = self.strings.as_string(key).split('_', 1) - string_feats[name] = value - else: - string_feats[self.strings.as_string(key)] = self.strings.as_string(value) - lemma_strings = self.lemmatizer(py_string, univ_pos, string_feats) - lemma_string = lemma_strings[0] - lemma = self.strings.add(lemma_string) - return lemma - - def add_special_case(self, unicode tag_str, unicode orth_str, attrs, - force=False): - """Add a special-case rule to the morphological analyser. Tokens whose - tag and orth match the rule will receive the specified properties. - - tag (str): The part-of-speech tag to key the exception. - orth (str): The word-form to key the exception. - """ - attrs = dict(attrs) - attrs = self.normalize_attrs(attrs) - self.add(attrs) - attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) - self._exc[(tag_str, self.strings.add(orth_str))] = attrs - - cdef int assign_untagged(self, TokenC* token) except -1: - """Set morphological attributes on a token without a POS tag. Uses - the lemmatizer's lookup() method, which looks up the string in the - table provided by the language data as lemma_lookup (if available). - """ - if token.lemma == 0: - orth_str = self.strings[token.lex.orth] - lemma = self.lemmatizer.lookup(orth_str, orth=token.lex.orth) - token.lemma = self.strings.add(lemma) - - cdef int assign_tag(self, TokenC* token, tag_str) except -1: - cdef attr_t tag = self.strings.as_int(tag_str) - if tag in self.reverse_index: - tag_id = self.reverse_index[tag] - self.assign_tag_id(token, tag_id) - else: - token.tag = tag - - cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1: - if tag_id > self.n_tags: - raise ValueError(Errors.E014.format(tag=tag_id)) - # Ensure spaces get tagged as space. - # It seems pretty arbitrary to put this logic here, but there's really - # nowhere better. I guess the justification is that this is where the - # specific word and the tag interact. Still, we should have a better - # way to enforce this rule, or figure out why the statistical model fails. 
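# The tag-map and lemmatizer plumbing removed above is replaced by a
# morphology table that only stores FEATS strings; a minimal sketch mirroring
# the updated pickle test further down (the feature values are illustrative):
import pickle
from spacy.morphology import Morphology
from spacy.strings import StringStore

morphology = Morphology(StringStore())
morphology.add("Case=Nom|Number=Sing")
key = morphology.strings["Case=Nom|Number=Sing"]
assert morphology.get(key) == "Case=Nom|Number=Sing"

# __reduce__ now rebuilds the table from the stored feature strings.
reloaded = pickle.loads(pickle.dumps(morphology))
assert reloaded.get(reloaded.strings["Case=Nom|Number=Sing"]) == "Case=Nom|Number=Sing"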
- # Related to Issue #220 - if Lexeme.c_check_flag(token.lex, IS_SPACE): - tag_id = self.reverse_index[self.strings.add('_SP')] - tag_str = self.tag_names[tag_id] - features = dict(self.tag_map.get(tag_str, {})) - if features: - pos = self.strings.as_int(features.pop(POS)) - else: - pos = 0 - cdef attr_t lemma = self._cache.get(tag_id, token.lex.orth) - if lemma == 0: - # Ugh, self.lemmatize has opposite arg order from self.lemmatizer :( - lemma = self.lemmatize(pos, token.lex.orth, features) - self._cache.set(tag_id, token.lex.orth, lemma) - token.lemma = lemma - token.pos = pos - token.tag = self.strings[tag_str] - token.morph = self.add(features) - if (self.tag_names[tag_id], token.lex.orth) in self._exc: - self._assign_tag_from_exceptions(token, tag_id) - - cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1: - key = (self.tag_names[tag_id], token.lex.orth) - cdef dict attrs - attrs = self._exc[key] - token.pos = attrs.get(POS, token.pos) - token.lemma = attrs.get(LEMMA, token.lemma) - - def load_morph_exceptions(self, dict morph_rules): - self._exc = {} - # Map (form, pos) to attributes - for tag, exc in morph_rules.items(): - for orth, attrs in exc.items(): - attrs = self.normalize_attrs(attrs) - self.add_special_case(self.strings.as_string(tag), self.strings.as_string(orth), attrs) - - @property - def exc(self): - # generate the serializable exc in the MORPH_RULES format from the - # internal tuple-key format - morph_rules = {} - for (tag, orth) in sorted(self._exc): - if not tag in morph_rules: - morph_rules[tag] = {} - morph_rules[tag][self.strings[orth]] = self._exc[(tag, orth)] - return morph_rules - @staticmethod def feats_to_dict(feats): if not feats or feats == Morphology.EMPTY_MORPH: @@ -338,3 +201,9 @@ cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t fie results[n_results] = morph.features[i] n_results += 1 return n_results + +def unpickle_morphology(strings, tags): + cdef Morphology morphology = Morphology(strings) + for tag in tags: + morphology.add(tag) + return morphology diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py index 7f395b5f2..793aa83c3 100644 --- a/spacy/pipeline/__init__.py +++ b/spacy/pipeline/__init__.py @@ -3,9 +3,10 @@ from .dep_parser import DependencyParser from .entity_linker import EntityLinker from .ner import EntityRecognizer from .entityruler import EntityRuler +from .lemmatizer import Lemmatizer from .morphologizer import Morphologizer from .pipe import Pipe -from spacy.pipeline.senter import SentenceRecognizer +from .senter import SentenceRecognizer from .sentencizer import Sentencizer from .simple_ner import SimpleNER from .tagger import Tagger @@ -20,6 +21,7 @@ __all__ = [ "EntityRecognizer", "EntityRuler", "Morphologizer", + "Lemmatizer", "Pipe", "SentenceRecognizer", "Sentencizer", diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py new file mode 100644 index 000000000..f2028772f --- /dev/null +++ b/spacy/pipeline/lemmatizer.py @@ -0,0 +1,330 @@ +from typing import Optional, List, Dict, Any + +from thinc.api import Model + +from .pipe import Pipe +from ..errors import Errors +from ..language import Language +from ..lookups import Lookups, load_lookups +from ..scorer import Scorer +from ..tokens import Doc, Token +from ..vocab import Vocab +from .. 
import util + + +@Language.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={ + "model": None, + "mode": "lookup", + "lookups": None, + "overwrite": False, + }, + scores=["lemma_acc"], + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, + model: Optional[Model], + name: str, + mode: str, + lookups: Optional[Lookups], + overwrite: bool = False, +): + lookups = Lemmatizer.load_lookups(nlp.lang, mode, lookups) + return Lemmatizer( + nlp.vocab, model, name, mode=mode, lookups=lookups, overwrite=overwrite + ) + + +class Lemmatizer(Pipe): + """ + The Lemmatizer supports simple part-of-speech-sensitive suffix rules and + lookup tables. + + DOCS: https://spacy.io/api/lemmatizer + """ + + @classmethod + def get_lookups_config(cls, mode: str) -> Dict: + """Returns the lookups configuration settings for a given mode for use + in Lemmatizer.load_lookups. + + mode (str): The lemmatizer mode. + RETURNS (dict): The lookups configuration settings for this mode. + + DOCS: https://spacy.io/api/lemmatizer#get_lookups_config + """ + if mode == "lookup": + return { + "required_tables": ["lemma_lookup"], + } + elif mode == "rule": + return { + "required_tables": ["lemma_rules"], + "optional_tables": ["lemma_exc", "lemma_index"], + } + return {} + + @classmethod + def load_lookups(cls, lang: str, mode: str, lookups: Optional[Lookups],) -> Lookups: + """Load and validate lookups tables. If the provided lookups is None, + load the default lookups tables according to the language and mode + settings. Confirm that all required tables for the language and mode + are present. + + lang (str): The language code. + mode (str): The lemmatizer mode. + lookups (Lookups): The provided lookups, may be None if the default + lookups should be loaded. + RETURNS (Lookups): The Lookups object. + + DOCS: https://spacy.io/api/lemmatizer#get_lookups_config + """ + config = cls.get_lookups_config(mode) + required_tables = config.get("required_tables", []) + optional_tables = config.get("optional_tables", []) + if lookups is None: + lookups = load_lookups(lang=lang, tables=required_tables) + optional_lookups = load_lookups( + lang=lang, tables=optional_tables, strict=False + ) + for table in optional_lookups.tables: + lookups.set_table(table, optional_lookups.get_table(table)) + for table in required_tables: + if table not in lookups: + raise ValueError( + Errors.E1004.format( + mode=mode, tables=required_tables, found=lookups.tables + ) + ) + return lookups + + def __init__( + self, + vocab: Vocab, + model: Optional[Model], + name: str = "lemmatizer", + *, + mode: str = "lookup", + lookups: Optional[Lookups] = None, + overwrite: bool = False, + ) -> None: + """Initialize a Lemmatizer. + + vocab (Vocab): The vocab. + model (Model): A model (not yet implemented). + name (str): The component name. Defaults to "lemmatizer". + mode (str): The lemmatizer mode: "lookup", "rule". Defaults to "lookup". + lookups (Lookups): The lookups object containing the (optional) tables + such as "lemma_rules", "lemma_index", "lemma_exc" and + "lemma_lookup". Defaults to None + overwrite (bool): Whether to overwrite existing lemmas. Defaults to + `False`. 
+ + DOCS: https://spacy.io/api/lemmatizer#init + """ + self.vocab = vocab + self.model = model + self._mode = mode + self.lookups = lookups if lookups is not None else Lookups() + self.overwrite = overwrite + if self.mode == "lookup": + self.lemmatize = self.lookup_lemmatize + elif self.mode == "rule": + self.lemmatize = self.rule_lemmatize + else: + try: + self.lemmatize = getattr(self, f"{self.mode}_lemmatize") + except AttributeError: + raise ValueError(Errors.E1003.format(mode=mode)) + self.cache = {} + + @property + def mode(self): + return self._mode + + def __call__(self, doc: Doc) -> Doc: + """Apply the lemmatizer to one document. + + doc (Doc): The Doc to process. + RETURNS (Doc): The processed Doc. + + DOCS: https://spacy.io/api/lemmatizer#call + """ + for token in doc: + if self.overwrite or token.lemma == 0: + token.lemma_ = self.lemmatize(token)[0] + return doc + + def pipe(self, stream, *, batch_size=128): + """Apply the pipe to a stream of documents. This usually happens under + the hood when the nlp object is called on a text and all components are + applied to the Doc. + + stream (Iterable[Doc]): A stream of documents. + batch_size (int): The number of documents to buffer. + YIELDS (Doc): Processed documents in order. + + DOCS: https://spacy.io/api/lemmatizer#pipe + """ + for doc in stream: + doc = self(doc) + yield doc + + def lookup_lemmatize(self, token: Token) -> List[str]: + """Lemmatize using a lookup-based approach. + + token (Token): The token to lemmatize. + RETURNS (list): The available lemmas for the string. + + DOCS: https://spacy.io/api/lemmatizer#lookup_lemmatize + """ + lookup_table = self.lookups.get_table("lemma_lookup", {}) + result = lookup_table.get(token.text, token.text) + if isinstance(result, str): + result = [result] + return result + + def rule_lemmatize(self, token: Token) -> List[str]: + """Lemmatize using a rule-based approach. + + token (Token): The token to lemmatize. + RETURNS (list): The available lemmas for the string. + + DOCS: https://spacy.io/api/lemmatizer#rule_lemmatize + """ + cache_key = (token.orth, token.pos, token.morph) + if cache_key in self.cache: + return self.cache[cache_key] + string = token.text + univ_pos = token.pos_.lower() + if univ_pos in ("", "eol", "space"): + return [string.lower()] + # See Issue #435 for example of where this logic is requied. + if self.is_base_form(token): + return [string.lower()] + index_table = self.lookups.get_table("lemma_index", {}) + exc_table = self.lookups.get_table("lemma_exc", {}) + rules_table = self.lookups.get_table("lemma_rules", {}) + if not any( + ( + index_table.get(univ_pos), + exc_table.get(univ_pos), + rules_table.get(univ_pos), + ) + ): + if univ_pos == "propn": + return [string] + else: + return [string.lower()] + + index = index_table.get(univ_pos, {}) + exceptions = exc_table.get(univ_pos, {}) + rules = rules_table.get(univ_pos, {}) + orig = string + string = string.lower() + forms = [] + oov_forms = [] + for old, new in rules: + if string.endswith(old): + form = string[: len(string) - len(old)] + new + if not form: + pass + elif form in index or not form.isalpha(): + forms.append(form) + else: + oov_forms.append(form) + # Remove duplicates but preserve the ordering of applied "rules" + forms = list(dict.fromkeys(forms)) + # Put exceptions at the front of the list, so they get priority. + # This is a dodgy heuristic -- but it's the best we can do until we get + # frequencies on this. 
We can at least prune out problematic exceptions, + # if they shadow more frequent analyses. + for form in exceptions.get(string, []): + if form not in forms: + forms.insert(0, form) + if not forms: + forms.extend(oov_forms) + if not forms: + forms.append(orig) + self.cache[cache_key] = forms + return forms + + def is_base_form(self, token: Token) -> bool: + """Check whether the token is a base form that does not need further + analysis for lemmatization. + + token (Token): The token. + RETURNS (bool): Whether the token is a base form. + + DOCS: https://spacy.io/api/lemmatizer#is_base_form + """ + return False + + def score(self, examples, **kwargs) -> Dict[str, Any]: + """Score a batch of examples. + + examples (Iterable[Example]): The examples to score. + RETURNS (Dict[str, Any]): The scores. + + DOCS: https://spacy.io/api/lemmatizer#score + """ + return Scorer.score_token_attr(examples, "lemma", **kwargs) + + def to_disk(self, path, *, exclude=tuple()): + """Save the current state to a directory. + + path (unicode or Path): A path to a directory, which will be created if + it doesn't exist. + exclude (list): String names of serialization fields to exclude. + + DOCS: https://spacy.io/api/vocab#to_disk + """ + serialize = {} + serialize["vocab"] = lambda p: self.vocab.to_disk(p) + serialize["lookups"] = lambda p: self.lookups.to_disk(p) + util.to_disk(path, serialize, exclude) + + def from_disk(self, path, *, exclude=tuple()): + """Loads state from a directory. Modifies the object in place and + returns it. + + path (unicode or Path): A path to a directory. + exclude (list): String names of serialization fields to exclude. + RETURNS (Vocab): The modified `Vocab` object. + + DOCS: https://spacy.io/api/vocab#to_disk + """ + deserialize = {} + deserialize["vocab"] = lambda p: self.vocab.from_disk(p) + deserialize["lookups"] = lambda p: self.lookups.from_disk(p) + util.from_disk(path, deserialize, exclude) + + def to_bytes(self, *, exclude=tuple()) -> bytes: + """Serialize the current state to a binary string. + + exclude (list): String names of serialization fields to exclude. + RETURNS (bytes): The serialized form of the `Vocab` object. + + DOCS: https://spacy.io/api/vocab#to_bytes + """ + serialize = {} + serialize["vocab"] = self.vocab.to_bytes + serialize["lookups"] = self.lookups.to_bytes + return util.to_bytes(serialize, exclude) + + def from_bytes(self, bytes_data: bytes, *, exclude=tuple()): + """Load state from a binary string. + + bytes_data (bytes): The data to load from. + exclude (list): String names of serialization fields to exclude. + RETURNS (Vocab): The `Vocab` object. 
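# A usage sketch of the lemmatizer component defined above, run in "rule" mode
# with a small lookups asset registered the same way as in the pipeline tests
# further down. The asset name and table contents are purely illustrative.
from spacy import registry
from spacy.lang.en import English
from spacy.lookups import Lookups


@registry.assets("example_cope_lookups")
def example_cope_lookups():
    lookups = Lookups()
    lookups.add_table("lemma_rules", {"verb": [["ing", ""]]})
    lookups.add_table("lemma_index", {"verb": ("cope", "cop")})
    lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}})
    return lookups


nlp = English()
lemmatizer = nlp.add_pipe(
    "lemmatizer",
    config={"mode": "rule", "lookups": {"@assets": "example_cope_lookups"}},
)
doc = nlp.make_doc("coping")
doc[0].pos_ = "VERB"  # rule mode is POS-sensitive; there is no tagger here
doc = lemmatizer(doc)
assert doc[0].lemma_ == "cope"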
+ + DOCS: https://spacy.io/api/vocab#from_bytes + """ + deserialize = {} + deserialize["vocab"] = lambda b: self.vocab.from_bytes(b) + deserialize["lookups"] = lambda b: self.lookups.from_bytes(b) + util.from_bytes(bytes_data, deserialize, exclude) diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index da1b3d3aa..f1515889b 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -39,12 +39,12 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"] @Language.factory( "tagger", assigns=["token.tag"], - default_config={"model": DEFAULT_TAGGER_MODEL, "set_morphology": False}, - scores=["tag_acc", "pos_acc", "lemma_acc"], + default_config={"model": DEFAULT_TAGGER_MODEL}, + scores=["tag_acc"], default_score_weights={"tag_acc": 1.0}, ) -def make_tagger(nlp: Language, name: str, model: Model, set_morphology: bool): - return Tagger(nlp.vocab, model, name, set_morphology=set_morphology) +def make_tagger(nlp: Language, name: str, model: Model): + return Tagger(nlp.vocab, model, name) class Tagger(Pipe): @@ -52,13 +52,14 @@ class Tagger(Pipe): DOCS: https://spacy.io/api/tagger """ - def __init__(self, vocab, model, name="tagger", *, set_morphology=False): + def __init__(self, vocab, model, name="tagger", *, labels=None): """Initialize a part-of-speech tagger. vocab (Vocab): The shared vocabulary. model (thinc.api.Model): The Thinc Model powering the pipeline component. name (str): The component instance name, used to add entries to the losses during training. + labels (List): The set of labels. Defaults to None. set_morphology (bool): Whether to set morphological features. DOCS: https://spacy.io/api/tagger#init @@ -67,7 +68,7 @@ class Tagger(Pipe): self.model = model self.name = name self._rehearsal_model = None - cfg = {"set_morphology": set_morphology} + cfg = {"labels": labels or []} self.cfg = dict(sorted(cfg.items())) @property @@ -80,7 +81,7 @@ class Tagger(Pipe): DOCS: https://spacy.io/api/tagger#labels """ - return tuple(self.vocab.morphology.tag_names) + return tuple(self.cfg["labels"]) def __call__(self, doc): """Apply the pipe to a Doc. 
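# With the tag map gone, Tagger labels live in the component config; the
# add_label/begin_training hunks below reduce label handling to the pattern
# used in the updated regression tests (the tag names here are illustrative):
from spacy.lang.en import English

nlp = English()
tagger = nlp.add_pipe("tagger")
tagger.add_label("N")  # just the tag string, no tag-map values
tagger.add_label("V")
assert tagger.labels == ("N", "V")
optimizer = nlp.begin_training()  # sizes the model output to len(labels)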
@@ -150,9 +151,7 @@ class Tagger(Pipe): if isinstance(docs, Doc): docs = [docs] cdef Doc doc - cdef int idx = 0 cdef Vocab vocab = self.vocab - assign_morphology = self.cfg.get("set_morphology", True) for i, doc in enumerate(docs): doc_tag_ids = batch_tag_ids[i] if hasattr(doc_tag_ids, "get"): @@ -160,15 +159,7 @@ class Tagger(Pipe): for j, tag_id in enumerate(doc_tag_ids): # Don't clobber preset POS tags if doc.c[j].tag == 0: - if doc.c[j].pos == 0 and assign_morphology: - # Don't clobber preset lemmas - lemma = doc.c[j].lemma - vocab.morphology.assign_tag_id(&doc.c[j], tag_id) - if lemma != 0 and lemma != doc.c[j].lex.orth: - doc.c[j].lemma = lemma - else: - doc.c[j].tag = self.vocab.strings[self.labels[tag_id]] - idx += 1 + doc.c[j].tag = self.vocab.strings[self.labels[tag_id]] doc.is_tagged = True def update(self, examples, *, drop=0., sgd=None, losses=None, set_annotations=False): @@ -279,55 +270,26 @@ class Tagger(Pipe): DOCS: https://spacy.io/api/tagger#begin_training """ - lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"] - if not any(table in self.vocab.lookups for table in lemma_tables): - warnings.warn(Warnings.W022) - lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {}) - if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS: - langs = ", ".join(util.LEXEME_NORM_LANGS) - warnings.warn(Warnings.W033.format(model="part-of-speech tagger", langs=langs)) - orig_tag_map = dict(self.vocab.morphology.tag_map) - new_tag_map = {} + tags = set() for example in get_examples(): try: y = example.y except AttributeError: raise TypeError(Errors.E978.format(name="Tagger", method="begin_training", types=type(example))) from None for token in y: - tag = token.tag_ - if tag in orig_tag_map: - new_tag_map[tag] = orig_tag_map[tag] - else: - new_tag_map[tag] = {POS: X} - - cdef Vocab vocab = self.vocab - if new_tag_map: - if "_SP" in orig_tag_map: - new_tag_map["_SP"] = orig_tag_map["_SP"] - vocab.morphology.load_tag_map(new_tag_map) + tags.add(token.tag_) + for tag in sorted(tags): + self.add_label(tag) self.set_output(len(self.labels)) - doc_sample = [Doc(self.vocab, words=["hello", "world"])] - if pipeline is not None: - for name, component in pipeline: - if component is self: - break - if hasattr(component, "pipe"): - doc_sample = list(component.pipe(doc_sample)) - else: - doc_sample = [component(doc) for doc in doc_sample] - self.model.initialize(X=doc_sample) - # Get batch of example docs, example outputs to call begin_training(). - # This lets the model infer shapes. + self.model.initialize() if sgd is None: sgd = self.create_optimizer() return sgd - def add_label(self, label, values=None): + def add_label(self, label): """Add a new label to the pipe. label (str): The label to add. - values (Dict[int, str]): Optional values to map to the label, e.g. a - tag map dictionary. RETURNS (int): 0 if label is already present, otherwise 1. DOCS: https://spacy.io/api/tagger#add_label @@ -336,22 +298,8 @@ class Tagger(Pipe): raise ValueError(Errors.E187) if label in self.labels: return 0 - if self.model.has_dim("nO"): - # Here's how the model resizing will work, once the - # neuron-to-tag mapping is no longer controlled by - # the Morphology class, which sorts the tag names. - # The sorting makes adding labels difficult. 
- # smaller = self.model._layers[-1] - # larger = Softmax(len(self.labels)+1, smaller.nI) - # copy_array(larger.W[:smaller.nO], smaller.W) - # copy_array(larger.b[:smaller.nO], smaller.b) - # self.model._layers[-1] = larger - raise ValueError(TempErrors.T003) - tag_map = dict(self.vocab.morphology.tag_map) - if values is None: - values = {POS: "X"} - tag_map[label] = values - self.vocab.morphology.load_tag_map(tag_map) + self.cfg["labels"].append(label) + self.vocab.strings.add(label) return 1 def score(self, examples, **kwargs): @@ -363,11 +311,7 @@ class Tagger(Pipe): DOCS: https://spacy.io/api/tagger#score """ - scores = {} - scores.update(Scorer.score_token_attr(examples, "tag", **kwargs)) - scores.update(Scorer.score_token_attr(examples, "pos", **kwargs)) - scores.update(Scorer.score_token_attr(examples, "lemma", **kwargs)) - return scores + return Scorer.score_token_attr(examples, "tag", **kwargs) def to_bytes(self, *, exclude=tuple()): """Serialize the pipe to a bytestring. @@ -381,10 +325,6 @@ class Tagger(Pipe): serialize["model"] = self.model.to_bytes serialize["vocab"] = self.vocab.to_bytes serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) - tag_map = dict(sorted(self.vocab.morphology.tag_map.items())) - serialize["tag_map"] = lambda: srsly.msgpack_dumps(tag_map) - morph_rules = dict(self.vocab.morphology.exc) - serialize["morph_rules"] = lambda: srsly.msgpack_dumps(morph_rules) return util.to_bytes(serialize, exclude) def from_bytes(self, bytes_data, *, exclude=tuple()): @@ -402,21 +342,8 @@ class Tagger(Pipe): except AttributeError: raise ValueError(Errors.E149) from None - def load_tag_map(b): - tag_map = srsly.msgpack_loads(b) - self.vocab.morphology.load_tag_map(tag_map) - - def load_morph_rules(b): - morph_rules = srsly.msgpack_loads(b) - self.vocab.morphology.load_morph_exceptions(morph_rules) - - self.vocab.morphology = Morphology(self.vocab.strings, dict(), - lemmatizer=self.vocab.morphology.lemmatizer) - deserialize = { "vocab": lambda b: self.vocab.from_bytes(b), - "tag_map": load_tag_map, - "morph_rules": load_morph_rules, "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), "model": lambda b: load_model(b), } @@ -431,12 +358,8 @@ class Tagger(Pipe): DOCS: https://spacy.io/api/tagger#to_disk """ - tag_map = dict(sorted(self.vocab.morphology.tag_map.items())) - morph_rules = dict(self.vocab.morphology.exc) serialize = { "vocab": lambda p: self.vocab.to_disk(p), - "tag_map": lambda p: srsly.write_msgpack(p, tag_map), - "morph_rules": lambda p: srsly.write_msgpack(p, morph_rules), "model": lambda p: self.model.to_disk(p), "cfg": lambda p: srsly.write_json(p, self.cfg), } @@ -458,22 +381,9 @@ class Tagger(Pipe): except AttributeError: raise ValueError(Errors.E149) from None - def load_tag_map(p): - tag_map = srsly.read_msgpack(p) - self.vocab.morphology.load_tag_map(tag_map) - - def load_morph_rules(p): - morph_rules = srsly.read_msgpack(p) - self.vocab.morphology.load_morph_exceptions(morph_rules) - - self.vocab.morphology = Morphology(self.vocab.strings, dict(), - lemmatizer=self.vocab.morphology.lemmatizer) - deserialize = { "vocab": lambda p: self.vocab.from_disk(p), "cfg": lambda p: self.cfg.update(deserialize_config(p)), - "tag_map": load_tag_map, - "morph_rules": load_morph_rules, "model": load_model, } util.from_disk(path, deserialize, exclude) diff --git a/spacy/schemas.py b/spacy/schemas.py index d599ccbb2..0f2a35c60 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -220,7 +220,6 @@ class ConfigSchemaNlp(BaseModel): lang: StrictStr = Field(..., 
title="The base language to use") pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order") tokenizer: Callable = Field(..., title="The tokenizer to use") - lemmatizer: Callable = Field(..., title="The lemmatizer to use") load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data") before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization") after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed") diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index cfdb8e4ff..1c0595672 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -201,7 +201,7 @@ def ru_tokenizer(): @pytest.fixture def ru_lemmatizer(): pytest.importorskip("pymorphy2") - return get_lang_class("ru")().vocab.morphology.lemmatizer + return get_lang_class("ru")().add_pipe("lemmatizer") @pytest.fixture(scope="session") diff --git a/spacy/tests/doc/test_creation.py b/spacy/tests/doc/test_creation.py index 3ee833aa8..0dc6c4866 100644 --- a/spacy/tests/doc/test_creation.py +++ b/spacy/tests/doc/test_creation.py @@ -1,21 +1,12 @@ import pytest from spacy.vocab import Vocab from spacy.tokens import Doc -from spacy.lemmatizer import Lemmatizer -from spacy.lookups import Lookups from spacy import util @pytest.fixture -def lemmatizer(): - lookups = Lookups() - lookups.add_table("lemma_lookup", {"dogs": "dog", "boxen": "box", "mice": "mouse"}) - return Lemmatizer(lookups) - - -@pytest.fixture -def vocab(lemmatizer): - return Vocab(lemmatizer=lemmatizer) +def vocab(): + return Vocab() def test_empty_doc(vocab): @@ -30,14 +21,6 @@ def test_single_word(vocab): assert doc.text == "a" -def test_lookup_lemmatization(vocab): - doc = Doc(vocab, words=["dogs", "dogses"]) - assert doc[0].text == "dogs" - assert doc[0].lemma_ == "dog" - assert doc[1].text == "dogses" - assert doc[1].lemma_ == "dogses" - - def test_create_from_words_and_text(vocab): # no whitespace in words words = ["'", "dogs", "'", "run"] diff --git a/spacy/tests/doc/test_morphanalysis.py b/spacy/tests/doc/test_morphanalysis.py index 88557d100..6bfc198fd 100644 --- a/spacy/tests/doc/test_morphanalysis.py +++ b/spacy/tests/doc/test_morphanalysis.py @@ -1,23 +1,17 @@ import pytest -from spacy.symbols import POS, PRON, VERB @pytest.fixture def i_has(en_tokenizer): doc = en_tokenizer("I has") - tag_map = { - "PRP": {POS: PRON, "PronType": "prs"}, - "VBZ": { - POS: VERB, - "VerbForm": "fin", - "Tense": "pres", - "Number": "sing", - "Person": "three", - }, + doc[0].morph_ = {"PronType": "prs"} + doc[1].morph_ = { + "VerbForm": "fin", + "Tense": "pres", + "Number": "sing", + "Person": "three", } - en_tokenizer.vocab.morphology.load_tag_map(tag_map) - doc[0].tag_ = "PRP" - doc[1].tag_ = "VBZ" + return doc diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py index e941b48ed..bc9567b2a 100644 --- a/spacy/tests/doc/test_retokenize_merge.py +++ b/spacy/tests/doc/test_retokenize_merge.py @@ -124,7 +124,6 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer): assert doc[0].text == "The players" assert doc[0].tag_ == "NN" assert doc[0].pos_ == "NOUN" - assert doc[0].lemma_ == "The players" doc = get_doc( tokens.vocab, words=[t.text for t in tokens], @@ -143,11 +142,9 @@ def 
test_doc_retokenize_spans_merge_tokens_default_attrs(en_tokenizer): assert doc[0].text == "The players" assert doc[0].tag_ == "NN" assert doc[0].pos_ == "NOUN" - assert doc[0].lemma_ == "The players" assert doc[1].text == "start ." assert doc[1].tag_ == "VBZ" assert doc[1].pos_ == "VERB" - assert doc[1].lemma_ == "start ." def test_doc_retokenize_spans_merge_heads(en_tokenizer): diff --git a/spacy/tests/lang/en/test_tagger.py b/spacy/tests/lang/en/test_tagger.py deleted file mode 100644 index c4dc18bba..000000000 --- a/spacy/tests/lang/en/test_tagger.py +++ /dev/null @@ -1,21 +0,0 @@ -from spacy.symbols import POS, PRON, VERB, DET, NOUN, PUNCT -from ...util import get_doc - - -def test_en_tagger_load_morph_exc(en_tokenizer): - text = "I like his style." - tags = ["PRP", "VBP", "PRP$", "NN", "."] - tag_map = { - "PRP": {POS: PRON}, - "VBP": {POS: VERB}, - "PRP$": {POS: DET}, - "NN": {POS: NOUN}, - ".": {POS: PUNCT}, - } - morph_exc = {"VBP": {"like": {"lemma": "luck"}}} - en_tokenizer.vocab.morphology.load_tag_map(tag_map) - en_tokenizer.vocab.morphology.load_morph_exceptions(morph_exc) - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags) - assert doc[1].tag_ == "VBP" - assert doc[1].lemma_ == "luck" diff --git a/spacy/tests/lang/ru/test_lemmatizer.py b/spacy/tests/lang/ru/test_lemmatizer.py index 8a87a7506..bcf103b65 100644 --- a/spacy/tests/lang/ru/test_lemmatizer.py +++ b/spacy/tests/lang/ru/test_lemmatizer.py @@ -3,15 +3,16 @@ import pytest from ...util import get_doc -@pytest.mark.xfail(reason="TODO: investigate why lemmatizer fails here") -def test_ru_doc_lemmatization(ru_tokenizer): +def test_ru_doc_lemmatization(ru_lemmatizer): words = ["мама", "мыла", "раму"] - tags = [ - "NOUN__Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing", - "VERB__Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act", - "NOUN__Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing", + pos = ["NOUN", "VERB", "NOUN"] + morphs = [ + "Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing", + "Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act", + "Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing", ] - doc = get_doc(ru_tokenizer.vocab, words=words, tags=tags) + doc = get_doc(ru_lemmatizer.vocab, words=words, pos=pos, morphs=morphs) + doc = ru_lemmatizer(doc) lemmas = [token.lemma_ for token in doc] assert lemmas == ["мама", "мыть", "рама"] @@ -27,43 +28,51 @@ def test_ru_doc_lemmatization(ru_tokenizer): ], ) def test_ru_lemmatizer_noun_lemmas(ru_lemmatizer, text, lemmas): - assert sorted(ru_lemmatizer.noun(text)) == lemmas + doc = get_doc(ru_lemmatizer.vocab, words=[text], pos=["NOUN"]) + result_lemmas = ru_lemmatizer.pymorphy2_lemmatize(doc[0]) + assert sorted(result_lemmas) == lemmas @pytest.mark.parametrize( - "text,pos,morphology,lemma", + "text,pos,morph,lemma", [ - ("рой", "NOUN", None, "рой"), - ("рой", "VERB", None, "рыть"), - ("клей", "NOUN", None, "клей"), - ("клей", "VERB", None, "клеить"), - ("три", "NUM", None, "три"), - ("кос", "NOUN", {"Number": "Sing"}, "кос"), - ("кос", "NOUN", {"Number": "Plur"}, "коса"), - ("кос", "ADJ", None, "косой"), - ("потом", "NOUN", None, "пот"), - ("потом", "ADV", None, "потом"), + ("рой", "NOUN", "", "рой"), + ("рой", "VERB", "", "рыть"), + ("клей", "NOUN", "", "клей"), + ("клей", "VERB", "", "клеить"), + ("три", "NUM", "", "три"), + ("кос", "NOUN", "Number=Sing", "кос"), + ("кос", "NOUN", "Number=Plur", "коса"), + ("кос", "ADJ", "", "косой"), + ("потом", "NOUN", "", "пот"), + ("потом", "ADV", "", 
"потом"), ], ) def test_ru_lemmatizer_works_with_different_pos_homonyms( - ru_lemmatizer, text, pos, morphology, lemma + ru_lemmatizer, text, pos, morph, lemma ): - assert ru_lemmatizer(text, pos, morphology) == [lemma] + doc = get_doc(ru_lemmatizer.vocab, words=[text], pos=[pos], morphs=[morph]) + result_lemmas = ru_lemmatizer.pymorphy2_lemmatize(doc[0]) + assert result_lemmas == [lemma] @pytest.mark.parametrize( - "text,morphology,lemma", + "text,morph,lemma", [ - ("гвоздики", {"Gender": "Fem"}, "гвоздика"), - ("гвоздики", {"Gender": "Masc"}, "гвоздик"), - ("вина", {"Gender": "Fem"}, "вина"), - ("вина", {"Gender": "Neut"}, "вино"), + ("гвоздики", "Gender=Fem", "гвоздика"), + ("гвоздики", "Gender=Masc", "гвоздик"), + ("вина", "Gender=Fem", "вина"), + ("вина", "Gender=Neut", "вино"), ], ) -def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morphology, lemma): - assert ru_lemmatizer.noun(text, morphology) == [lemma] +def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morph, lemma): + doc = get_doc(ru_lemmatizer.vocab, words=[text], pos=["NOUN"], morphs=[morph]) + result_lemmas = ru_lemmatizer.pymorphy2_lemmatize(doc[0]) + assert result_lemmas == [lemma] def test_ru_lemmatizer_punct(ru_lemmatizer): - assert ru_lemmatizer.punct("«") == ['"'] - assert ru_lemmatizer.punct("»") == ['"'] + doc = get_doc(ru_lemmatizer.vocab, words=["«"], pos=["PUNCT"]) + assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"'] + doc = get_doc(ru_lemmatizer.vocab, words=["»"], pos=["PUNCT"]) + assert ru_lemmatizer.pymorphy2_lemmatize(doc[0]) == ['"'] diff --git a/spacy/tests/lang/test_lemmatizers.py b/spacy/tests/lang/test_lemmatizers.py new file mode 100644 index 000000000..8c235c86e --- /dev/null +++ b/spacy/tests/lang/test_lemmatizers.py @@ -0,0 +1,34 @@ +import pytest +from spacy import registry +from spacy.lookups import Lookups +from spacy.util import get_lang_class + + +# fmt: off +# Only include languages with no external dependencies +# excluded: ru, uk +# excluded for custom tables: pl +LANGUAGES = ["el", "en", "fr", "nl"] +# fmt: on + + +@pytest.mark.parametrize("lang", LANGUAGES) +def test_lemmatizer_initialize(lang, capfd): + @registry.assets("lemmatizer_init_lookups") + def lemmatizer_init_lookups(): + lookups = Lookups() + lookups.add_table("lemma_lookup", {"cope": "cope"}) + lookups.add_table("lemma_index", {"verb": ("cope", "cop")}) + lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}}) + lookups.add_table("lemma_rules", {"verb": [["ing", ""]]}) + return lookups + + """Test that languages can be initialized.""" + nlp = get_lang_class(lang)() + nlp.add_pipe( + "lemmatizer", config={"lookups": {"@assets": "lemmatizer_init_lookups"}} + ) + # Check for stray print statements (see #3342) + doc = nlp("test") # noqa: F841 + captured = capfd.readouterr() + assert not captured.out diff --git a/spacy/tests/morphology/test_morph_features.py b/spacy/tests/morphology/test_morph_features.py index f644a5867..0693da690 100644 --- a/spacy/tests/morphology/test_morph_features.py +++ b/spacy/tests/morphology/test_morph_features.py @@ -1,14 +1,11 @@ import pytest from spacy.morphology import Morphology from spacy.strings import StringStore, get_string_id -from spacy.lemmatizer import Lemmatizer -from spacy.lookups import Lookups @pytest.fixture def morphology(): - lemmatizer = Lemmatizer(Lookups()) - return Morphology(StringStore(), {}, lemmatizer) + return Morphology(StringStore()) def test_init(morphology): diff --git a/spacy/tests/morphology/test_morph_pickle.py 
b/spacy/tests/morphology/test_morph_pickle.py index 2c374e11f..0758a6c01 100644 --- a/spacy/tests/morphology/test_morph_pickle.py +++ b/spacy/tests/morphology/test_morph_pickle.py @@ -2,21 +2,18 @@ import pytest import pickle from spacy.morphology import Morphology from spacy.strings import StringStore -from spacy.lemmatizer import Lemmatizer -from spacy.lookups import Lookups @pytest.fixture def morphology(): - tag_map = {"A": {"POS": "X"}, "B": {"POS": "NOUN"}} - exc = {"A": {"a": {"POS": "VERB"}}} - lemmatizer = Lemmatizer(Lookups()) - return Morphology(StringStore(), tag_map, lemmatizer, exc=exc) + morphology = Morphology(StringStore()) + morphology.add("Feat1=Val1|Feat2=Val2") + morphology.add("Feat3=Val3|Feat4=Val4") + return morphology def test_morphology_pickle_roundtrip(morphology): b = pickle.dumps(morphology) reloaded_morphology = pickle.loads(b) - - assert morphology.tag_map == reloaded_morphology.tag_map - assert morphology.exc == reloaded_morphology.exc + assert reloaded_morphology.get(morphology.strings["Feat1=Val1|Feat2=Val2"]) == "Feat1=Val1|Feat2=Val2" + assert reloaded_morphology.get(morphology.strings["Feat3=Val3|Feat4=Val4"]) == "Feat3=Val3|Feat4=Val4" diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 45ae09702..8265a8a45 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -82,10 +82,10 @@ def test_parser_merge_pp(en_tokenizer): text = "A phrase with another phrase occurs" heads = [1, 4, -1, 1, -2, 0] deps = ["det", "nsubj", "prep", "det", "pobj", "ROOT"] - tags = ["DT", "NN", "IN", "DT", "NN", "VBZ"] + pos = ["DET", "NOUN", "ADP", "DET", "NOUN", "VERB"] tokens = en_tokenizer(text) doc = get_doc( - tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, tags=tags + tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, pos=pos, ) with doc.retokenize() as retokenizer: for np in doc.noun_chunks: diff --git a/spacy/tests/pipeline/test_lemmatizer.py b/spacy/tests/pipeline/test_lemmatizer.py new file mode 100644 index 000000000..644fa0f01 --- /dev/null +++ b/spacy/tests/pipeline/test_lemmatizer.py @@ -0,0 +1,109 @@ +import pytest + +from spacy import util, registry +from spacy.lang.en import English +from spacy.lookups import Lookups, load_lookups + +from ..util import make_tempdir + + +@pytest.fixture +def nlp(): + return English() + + +@pytest.fixture +def lemmatizer(nlp): + @registry.assets("cope_lookups") + def cope_lookups(): + lookups = Lookups() + lookups.add_table("lemma_lookup", {"cope": "cope"}) + lookups.add_table("lemma_index", {"verb": ("cope", "cop")}) + lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}}) + lookups.add_table("lemma_rules", {"verb": [["ing", ""]]}) + return lookups + + lemmatizer = nlp.add_pipe( + "lemmatizer", config={"mode": "rule", "lookups": {"@assets": "cope_lookups"}} + ) + return lemmatizer + + +def test_lemmatizer_init(nlp): + @registry.assets("cope_lookups") + def cope_lookups(): + lookups = Lookups() + lookups.add_table("lemma_lookup", {"cope": "cope"}) + lookups.add_table("lemma_index", {"verb": ("cope", "cop")}) + lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}}) + lookups.add_table("lemma_rules", {"verb": [["ing", ""]]}) + return lookups + + lemmatizer = nlp.add_pipe( + "lemmatizer", config={"mode": "lookup", "lookups": {"@assets": "cope_lookups"}} + ) + assert isinstance(lemmatizer.lookups, Lookups) + assert lemmatizer.mode == "lookup" + # replace any tables from spacy-lookups-data + lemmatizer.lookups = 
Lookups() + doc = nlp("coping") + # lookup with no tables sets text as lemma + assert doc[0].lemma_ == "coping" + + nlp.remove_pipe("lemmatizer") + + @registry.assets("empty_lookups") + def empty_lookups(): + return Lookups() + + with pytest.raises(ValueError): + nlp.add_pipe( + "lemmatizer", + config={"mode": "lookup", "lookups": {"@assets": "empty_lookups"}}, + ) + + +def test_lemmatizer_config(nlp, lemmatizer): + doc = nlp.make_doc("coping") + doc[0].pos_ = "VERB" + assert doc[0].lemma_ == "" + doc = lemmatizer(doc) + assert doc[0].text == "coping" + assert doc[0].lemma_ == "cope" + + doc = nlp.make_doc("coping") + doc[0].pos_ = "VERB" + assert doc[0].lemma_ == "" + doc = lemmatizer(doc) + assert doc[0].text == "coping" + assert doc[0].lemma_ == "cope" + + +def test_lemmatizer_serialize(nlp, lemmatizer): + @registry.assets("cope_lookups") + def cope_lookups(): + lookups = Lookups() + lookups.add_table("lemma_lookup", {"cope": "cope"}) + lookups.add_table("lemma_index", {"verb": ("cope", "cop")}) + lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}}) + lookups.add_table("lemma_rules", {"verb": [["ing", ""]]}) + return lookups + + nlp2 = English() + lemmatizer2 = nlp2.add_pipe( + "lemmatizer", config={"mode": "rule", "lookups": {"@assets": "cope_lookups"}} + ) + lemmatizer2.from_bytes(lemmatizer.to_bytes()) + assert lemmatizer.to_bytes() == lemmatizer2.to_bytes() + assert lemmatizer.lookups.tables == lemmatizer2.lookups.tables + + # Also test the results are still the same after IO + with make_tempdir() as tmp_dir: + nlp.to_disk(tmp_dir) + nlp2 = util.load_model_from_path(tmp_dir) + doc2 = nlp2.make_doc("coping") + doc2[0].pos_ = "VERB" + assert doc2[0].lemma_ == "" + doc2 = lemmatizer(doc2) + assert doc2[0].text == "coping" + assert doc2[0].lemma_ == "cope" diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index dd6739e17..5f27a0afa 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -23,13 +23,12 @@ def test_tagger_begin_training_tag_map(): nlp = Language() tagger = nlp.add_pipe("tagger") orig_tag_count = len(tagger.labels) - tagger.add_label("A", {"POS": "NOUN"}) + tagger.add_label("A") nlp.begin_training() - assert nlp.vocab.morphology.tag_map["A"] == {POS: NOUN} assert orig_tag_count + 1 == len(nlp.get_pipe("tagger").labels) -TAG_MAP = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}, "J": {"pos": "ADJ"}} +TAGS = ("N", "V", "J") MORPH_RULES = {"V": {"like": {"lemma": "luck"}}} @@ -42,15 +41,12 @@ TRAIN_DATA = [ def test_overfitting_IO(): # Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly nlp = English() - nlp.vocab.morphology.load_tag_map(TAG_MAP) - nlp.vocab.morphology.load_morph_exceptions(MORPH_RULES) - tagger = nlp.add_pipe("tagger", config={"set_morphology": True}) - nlp.vocab.morphology.load_tag_map(TAG_MAP) + tagger = nlp.add_pipe("tagger") train_examples = [] for t in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) - for tag, values in TAG_MAP.items(): - tagger.add_label(tag, values) + for tag in TAGS: + tagger.add_label(tag) optimizer = nlp.begin_training() for i in range(50): @@ -65,7 +61,6 @@ def test_overfitting_IO(): assert doc[1].tag_ is "V" assert doc[2].tag_ is "J" assert doc[3].tag_ is "N" - assert doc[1].lemma_ == "luck" # Also test the results are still the same after IO with make_tempdir() as tmp_dir: @@ -76,4 +71,3 @@ def test_overfitting_IO(): assert doc2[1].tag_ is "V" assert doc2[2].tag_ is "J" assert 
doc2[3].tag_ is "N" - assert doc[1].lemma_ == "luck" diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py index c1b83c6c4..b642ca229 100644 --- a/spacy/tests/regression/test_issue1-1000.py +++ b/spacy/tests/regression/test_issue1-1000.py @@ -8,10 +8,8 @@ from spacy.attrs import IS_PUNCT, ORTH, LOWER from spacy.symbols import POS, VERB from spacy.vocab import Vocab from spacy.lang.en import English -from spacy.lemmatizer import Lemmatizer from spacy.lookups import Lookups from spacy.tokens import Doc, Span -from spacy.lang.en.lemmatizer import is_base_form from ..util import get_doc, make_tempdir @@ -157,16 +155,15 @@ def test_issue590(en_vocab): assert len(matches) == 2 +@pytest.mark.skip(reason="Old vocab-based lemmatization") def test_issue595(): """Test lemmatization of base forms""" words = ["Do", "n't", "feed", "the", "dog"] - tag_map = {"VB": {POS: VERB, "VerbForm": "inf"}} lookups = Lookups() lookups.add_table("lemma_rules", {"verb": [["ed", "e"]]}) lookups.add_table("lemma_index", {"verb": {}}) lookups.add_table("lemma_exc", {"verb": {}}) - lemmatizer = Lemmatizer(lookups, is_base_form=is_base_form) - vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map) + vocab = Vocab() doc = Doc(vocab, words=words) doc[2].tag_ = "VB" assert doc[2].text == "feed" @@ -389,6 +386,7 @@ def test_issue891(en_tokenizer, text): assert tokens[1].text == "/" +@pytest.mark.skip(reason="Old vocab-based lemmatization") @pytest.mark.parametrize( "text,tag,lemma", [("anus", "NN", "anus"), ("princess", "NN", "princess"), ("inner", "JJ", "inner")], diff --git a/spacy/tests/regression/test_issue1001-1500.py b/spacy/tests/regression/test_issue1001-1500.py index d612150de..0ac895546 100644 --- a/spacy/tests/regression/test_issue1001-1500.py +++ b/spacy/tests/regression/test_issue1001-1500.py @@ -6,7 +6,6 @@ from spacy.lang.en import English from spacy.lang.lex_attrs import LEX_ATTRS from spacy.matcher import Matcher from spacy.tokenizer import Tokenizer -from spacy.lemmatizer import Lemmatizer from spacy.lookups import Lookups from spacy.symbols import ORTH, LEMMA, POS, VERB @@ -57,6 +56,7 @@ def test_issue1242(): assert len(docs[1]) == 1 +@pytest.mark.skip(reason="v3 no longer supports LEMMA/POS in tokenizer special cases") def test_issue1250(): """Test cached special cases.""" special_case = [{ORTH: "reimbur", LEMMA: "reimburse", POS: "VERB"}] @@ -87,20 +87,6 @@ def test_issue1375(): assert doc[1].nbor(1).text == "2" -def test_issue1387(): - tag_map = {"VBG": {POS: VERB, "VerbForm": "part"}} - lookups = Lookups() - lookups.add_table("lemma_index", {"verb": ("cope", "cop")}) - lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}}) - lookups.add_table("lemma_rules", {"verb": [["ing", ""]]}) - lemmatizer = Lemmatizer(lookups) - vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map) - doc = Doc(vocab, words=["coping"]) - doc[0].tag_ = "VBG" - assert doc[0].text == "coping" - assert doc[0].lemma_ == "cope" - - def test_issue1434(): """Test matches occur when optional element at end of short doc.""" pattern = [{"ORTH": "Hello"}, {"IS_ALPHA": True, "OP": "?"}] diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index b5d586ec6..83afb11f3 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -130,8 +130,6 @@ def test_issue1727(): vectors = Vectors(data=data, keys=["I", "am", "Matt"]) tagger = nlp.create_pipe("tagger") tagger.add_label("PRP") - with 
pytest.warns(UserWarning): - tagger.begin_training() assert tagger.cfg.get("pretrained_dims", 0) == 0 tagger.vocab.vectors = vectors with make_tempdir() as path: diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index cf4e402e2..cf43e1a17 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -19,8 +19,8 @@ def test_issue2564(): """Test the tagger sets is_tagged correctly when used via Language.pipe.""" nlp = Language() tagger = nlp.add_pipe("tagger") - with pytest.warns(UserWarning): - tagger.begin_training() # initialise weights + tagger.add_label("A") + tagger.begin_training() doc = nlp("hello world") assert doc.is_tagged docs = nlp.pipe(["hello", "world"]) diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index e93c27a59..98a6b9aa0 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -241,11 +241,11 @@ def test_issue3449(): assert t3[5].text == "I" -@pytest.mark.filterwarnings("ignore::UserWarning") def test_issue3456(): # this crashed because of a padding error in layer.ops.unflatten in thinc nlp = English() - nlp.add_pipe("tagger") + tagger = nlp.add_pipe("tagger") + tagger.add_label("A") nlp.begin_training() list(nlp.pipe(["hi", ""])) diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py index 6426c6c24..e42779ad7 100644 --- a/spacy/tests/regression/test_issue3501-4000.py +++ b/spacy/tests/regression/test_issue3501-4000.py @@ -149,13 +149,15 @@ def test_issue3540(en_vocab): gold_text = ["I", "live", "in", "NewYork", "right", "now"] assert [token.text for token in doc] == gold_text gold_lemma = ["I", "live", "in", "NewYork", "right", "now"] + for i, lemma in enumerate(gold_lemma): + doc[i].lemma_ = lemma assert [token.lemma_ for token in doc] == gold_lemma vectors_1 = [token.vector for token in doc] assert len(vectors_1) == len(doc) with doc.retokenize() as retokenizer: heads = [(doc[3], 1), doc[2]] - attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]} + attrs = {"POS": ["PROPN", "PROPN"], "LEMMA": ["New", "York"], "DEP": ["pobj", "compound"]} retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs) gold_text = ["I", "live", "in", "New", "York", "right", "now"] diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py index 27464a39a..e1d03eaf5 100644 --- a/spacy/tests/regression/test_issue4001-4500.py +++ b/spacy/tests/regression/test_issue4001-4500.py @@ -271,6 +271,7 @@ def test_issue4267(): assert token.ent_iob == 2 +@pytest.mark.skip(reason="lemmatizer lookups no longer in vocab") def test_issue4272(): """Test that lookup table can be accessed from Token.lemma if no POS tags are available.""" diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 31292b700..4c6504f6b 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -62,8 +62,7 @@ def tagger(): # need to add model for two reasons: # 1. no model leads to error in serialization, # 2. 
the affected line is the one for model serialization - with pytest.warns(UserWarning): - tagger.begin_training(pipeline=nlp.pipeline) + tagger.begin_training(pipeline=nlp.pipeline) return tagger diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index 17d5a3a1e..7ba4815ee 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -44,8 +44,8 @@ def blank_parser(en_vocab): def taggers(en_vocab): cfg = {"model": DEFAULT_TAGGER_MODEL} model = registry.make_from_config(cfg, validate=True)["model"] - tagger1 = Tagger(en_vocab, model, set_morphology=True) - tagger2 = Tagger(en_vocab, model, set_morphology=True) + tagger1 = Tagger(en_vocab, model) + tagger2 = Tagger(en_vocab, model) return tagger1, tagger2 @@ -125,8 +125,8 @@ def test_serialize_tagger_roundtrip_disk(en_vocab, taggers): tagger2.to_disk(file_path2) cfg = {"model": DEFAULT_TAGGER_MODEL} model = registry.make_from_config(cfg, validate=True)["model"] - tagger1_d = Tagger(en_vocab, model, set_morphology=True).from_disk(file_path1) - tagger2_d = Tagger(en_vocab, model, set_morphology=True).from_disk(file_path2) + tagger1_d = Tagger(en_vocab, model).from_disk(file_path1) + tagger2_d = Tagger(en_vocab, model).from_disk(file_path2) assert tagger1_d.to_bytes() == tagger2_d.to_bytes() diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py index 44930247a..45a546203 100644 --- a/spacy/tests/serialize/test_serialize_vocab_strings.py +++ b/spacy/tests/serialize/test_serialize_vocab_strings.py @@ -8,7 +8,6 @@ from ..util import make_tempdir test_strings = [([], []), (["rats", "are", "cute"], ["i", "like", "rats"])] test_strings_attrs = [(["rats", "are", "cute"], "Hello")] -default_strings = ("_SP", "POS=SPACE") @pytest.mark.parametrize("text", ["rat"]) @@ -34,10 +33,8 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2): assert vocab1.to_bytes() == vocab1_b new_vocab1 = Vocab().from_bytes(vocab1_b) assert new_vocab1.to_bytes() == vocab1_b - assert len(new_vocab1.strings) == len(strings1) + 2 # adds _SP and POS=SPACE - assert sorted([s for s in new_vocab1.strings]) == sorted( - strings1 + list(default_strings) - ) + assert len(new_vocab1.strings) == len(strings1) + assert sorted([s for s in new_vocab1.strings]) == sorted(strings1) @pytest.mark.parametrize("strings1,strings2", test_strings) @@ -52,16 +49,12 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2): vocab1_d = Vocab().from_disk(file_path1) vocab2_d = Vocab().from_disk(file_path2) # check strings rather than lexemes, which are only reloaded on demand - assert strings1 == [s for s in vocab1_d.strings if s not in default_strings] - assert strings2 == [s for s in vocab2_d.strings if s not in default_strings] + assert strings1 == [s for s in vocab1_d.strings] + assert strings2 == [s for s in vocab2_d.strings] if strings1 == strings2: - assert [s for s in vocab1_d.strings if s not in default_strings] == [ - s for s in vocab2_d.strings if s not in default_strings - ] + assert [s for s in vocab1_d.strings] == [s for s in vocab2_d.strings] else: - assert [s for s in vocab1_d.strings if s not in default_strings] != [ - s for s in vocab2_d.strings if s not in default_strings - ] + assert [s for s in vocab1_d.strings] != [s for s in vocab2_d.strings] @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) @@ -80,7 +73,7 @@ def test_deserialize_vocab_seen_entries(strings, lex_attr): 
# Reported in #2153 vocab = Vocab(strings=strings) vocab.from_bytes(vocab.to_bytes()) - assert len(vocab.strings) == len(strings) + 2 # adds _SP and POS=SPACE + assert len(vocab.strings) == len(strings) @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) diff --git a/spacy/tests/test_lemmatizer.py b/spacy/tests/test_lemmatizer.py deleted file mode 100644 index 3c904cb01..000000000 --- a/spacy/tests/test_lemmatizer.py +++ /dev/null @@ -1,64 +0,0 @@ -import pytest -from spacy.tokens import Doc -from spacy.language import Language -from spacy.lookups import Lookups -from spacy.lemmatizer import Lemmatizer - - -@pytest.mark.skip(reason="We probably don't want to support this anymore in v3?") -def test_lemmatizer_reflects_lookups_changes(): - """Test for an issue that'd cause lookups available in a model loaded from - disk to not be reflected in the lemmatizer.""" - nlp = Language() - assert Doc(nlp.vocab, words=["foo"])[0].lemma_ == "foo" - table = nlp.vocab.lookups.add_table("lemma_lookup") - table["foo"] = "bar" - assert Doc(nlp.vocab, words=["foo"])[0].lemma_ == "bar" - table = nlp.vocab.lookups.get_table("lemma_lookup") - table["hello"] = "world" - # The update to the table should be reflected in the lemmatizer - assert Doc(nlp.vocab, words=["hello"])[0].lemma_ == "world" - new_nlp = Language() - table = new_nlp.vocab.lookups.add_table("lemma_lookup") - table["hello"] = "hi" - assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "hi" - nlp_bytes = nlp.to_bytes() - new_nlp.from_bytes(nlp_bytes) - # Make sure we have the previously saved lookup table - assert "lemma_lookup" in new_nlp.vocab.lookups - assert len(new_nlp.vocab.lookups.get_table("lemma_lookup")) == 2 - assert new_nlp.vocab.lookups.get_table("lemma_lookup")["hello"] == "world" - assert Doc(new_nlp.vocab, words=["foo"])[0].lemma_ == "bar" - assert Doc(new_nlp.vocab, words=["hello"])[0].lemma_ == "world" - - -def test_tagger_warns_no_lookups(): - nlp = Language() - nlp.vocab.lookups = Lookups() - assert not len(nlp.vocab.lookups) - tagger = nlp.add_pipe("tagger") - with pytest.warns(UserWarning): - tagger.begin_training() - with pytest.warns(UserWarning): - nlp.begin_training() - nlp.vocab.lookups.add_table("lemma_lookup") - nlp.vocab.lookups.add_table("lexeme_norm") - nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A" - with pytest.warns(None) as record: - nlp.begin_training() - assert not record.list - - -def test_lemmatizer_without_is_base_form_implementation(): - # Norwegian example from #5658 - lookups = Lookups() - lookups.add_table("lemma_rules", {"noun": []}) - lookups.add_table("lemma_index", {"noun": {}}) - lookups.add_table("lemma_exc", {"noun": {"formuesskatten": ["formuesskatt"]}}) - - lemmatizer = Lemmatizer(lookups, is_base_form=None) - assert lemmatizer( - "Formuesskatten", - "noun", - {"Definite": "def", "Gender": "masc", "Number": "sing"}, - ) == ["formuesskatt"] diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index c035559b4..b89c0627f 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -112,16 +112,15 @@ def test_tokenizer_validate_special_case(tokenizer, text, tokens): @pytest.mark.parametrize( - "text,tokens", [("lorem", [{"orth": "lo", "tag": "NN"}, {"orth": "rem"}])] + "text,tokens", [("lorem", [{"orth": "lo", "norm": "LO"}, {"orth": "rem"}])] ) def test_tokenizer_add_special_case_tag(text, tokens): - vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}}) + vocab = Vocab() tokenizer = Tokenizer(vocab, 
{}, None, None, None) tokenizer.add_special_case(text, tokens) doc = tokenizer(text) assert doc[0].text == tokens[0]["orth"] - assert doc[0].tag_ == tokens[0]["tag"] - assert doc[0].pos_ == "NOUN" + assert doc[0].norm_ == tokens[0]["norm"] assert doc[1].text == tokens[1]["orth"] diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index 61f7c3db0..8d57b791f 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -11,7 +11,7 @@ from .span cimport Span from .token cimport Token from ..lexeme cimport Lexeme, EMPTY_LEXEME from ..structs cimport LexemeC, TokenC -from ..attrs cimport TAG, MORPH +from ..attrs cimport MORPH from ..vocab cimport Vocab from .underscore import is_writable_attr @@ -365,8 +365,6 @@ def _split(Doc doc, int token_index, orths, heads, attrs): doc[token_index + i]._.set(ext_attr_key, ext_attr_value) # NB: We need to call get_string_id here because only the keys are # "intified" (since we support "KEY": [value, value] syntax here). - elif attr_name == TAG: - doc.vocab.morphology.assign_tag(token, get_string_id(attr_value)) else: # Set attributes on both token and lexeme to take care of token # attribute vs. lexical attribute without having to enumerate @@ -431,8 +429,6 @@ def set_token_attrs(Token py_token, attrs): if attr_name == "_": # Set extension attributes for ext_attr_key, ext_attr_value in attr_value.items(): py_token._.set(ext_attr_key, ext_attr_value) - elif attr_name == TAG: - doc.vocab.morphology.assign_tag(token, attr_value) else: # Set attributes on both token and lexeme to take care of token # attribute vs. lexical attribute without having to enumerate diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 935af88d1..15dafb86d 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -832,13 +832,6 @@ cdef class Doc: rel_head_index=abs_head_index-i ) ) - # Do TAG first. This lets subsequent loop override stuff like POS, LEMMA - if TAG in attrs: - col = attrs.index(TAG) - for i in range(length): - value = values[col * stride + i] - if value != 0: - self.vocab.morphology.assign_tag(&tokens[i], value) # Verify ENT_IOB are proper integers if ENT_IOB in attrs: iob_strings = Token.iob_strings() @@ -857,12 +850,11 @@ cdef class Doc: for i in range(length): token = &self.c[i] for j in range(n_attrs): - if attr_ids[j] != TAG: - value = values[j * stride + i] - if attr_ids[j] == MORPH: - # add morph to morphology table - self.vocab.morphology.add(self.vocab.strings[value]) - Token.set_struct_attr(token, attr_ids[j], value) + value = values[j * stride + i] + if attr_ids[j] == MORPH: + # add morph to morphology table + self.vocab.morphology.add(self.vocab.strings[value]) + Token.set_struct_attr(token, attr_ids[j], value) # Set flags self.is_parsed = bool(self.is_parsed or HEAD in attrs) self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index a187c9722..9ad57e21b 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -332,11 +332,7 @@ cdef class Token: inflectional suffixes. 
""" def __get__(self): - if self.c.lemma == 0: - lemma_ = self.vocab.morphology.lemmatizer.lookup(self.orth_, orth=self.orth) - return self.vocab.strings[lemma_] - else: - return self.c.lemma + return self.c.lemma def __set__(self, attr_t lemma): self.c.lemma = lemma @@ -355,7 +351,7 @@ cdef class Token: return self.c.tag def __set__(self, attr_t tag): - self.vocab.morphology.assign_tag(self.c, tag) + self.c.tag = tag property dep: """RETURNS (uint64): ID of syntactic dependency label.""" @@ -888,10 +884,7 @@ cdef class Token: with no inflectional suffixes. """ def __get__(self): - if self.c.lemma == 0: - return self.vocab.morphology.lemmatizer.lookup(self.orth_, orth=self.orth) - else: - return self.vocab.strings[self.c.lemma] + return self.vocab.strings[self.c.lemma] def __set__(self, unicode lemma_): self.c.lemma = self.vocab.strings.add(lemma_) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index ce95786f2..9e14f37d2 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -9,11 +9,10 @@ from .lexeme cimport EMPTY_LEXEME, OOV_RANK from .lexeme cimport Lexeme from .typedefs cimport attr_t from .tokens.token cimport Token -from .attrs cimport LANG, ORTH, TAG, POS +from .attrs cimport LANG, ORTH from .compat import copy_reg from .errors import Errors -from .lemmatizer import Lemmatizer from .attrs import intify_attrs, NORM, IS_STOP from .vectors import Vectors from .util import registry @@ -23,7 +22,7 @@ from .lang.norm_exceptions import BASE_NORMS from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang -def create_vocab(lang, defaults, lemmatizer=None, vectors_name=None, load_data=True): +def create_vocab(lang, defaults, vectors_name=None, load_data=True): # If the spacy-lookups-data package is installed, we pre-populate the lookups # with lexeme data, if available if load_data: @@ -43,7 +42,6 @@ def create_vocab(lang, defaults, lemmatizer=None, vectors_name=None, load_data=T ) return Vocab( lex_attr_getters=lex_attrs, - lemmatizer=lemmatizer, lookups=lookups, writing_system=defaults.writing_system, get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"), @@ -58,17 +56,13 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab """ - def __init__(self, lex_attr_getters=None, lemmatizer=None, - strings=tuple(), lookups=None, tag_map={}, + def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None, oov_prob=-20., vectors_name=None, writing_system={}, get_noun_chunks=None, **deprecated_kwargs): """Create the vocabulary. lex_attr_getters (dict): A dictionary mapping attribute IDs to functions to compute them. Defaults to `None`. - tag_map (dict): Dictionary mapping fine-grained tags to coarse-grained - parts-of-speech, and optionally morphological attributes. - lemmatizer (object): A lemmatizer. Defaults to `None`. strings (StringStore): StringStore that maps strings to integers, and vice versa. lookups (Lookups): Container for large lookup tables and dictionaries. 
@@ -78,8 +72,6 @@ cdef class Vocab: lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {} if lookups in (None, True, False): lookups = Lookups() - if lemmatizer in (None, True, False): - lemmatizer = Lemmatizer(lookups) self.cfg = {'oov_prob': oov_prob} self.mem = Pool() self._by_orth = PreshMap() @@ -89,7 +81,7 @@ cdef class Vocab: for string in strings: _ = self[string] self.lex_attr_getters = lex_attr_getters - self.morphology = Morphology(self.strings, tag_map, lemmatizer) + self.morphology = Morphology(self.strings) self.vectors = Vectors(name=vectors_name) self.lookups = lookups self.writing_system = writing_system @@ -268,12 +260,6 @@ cdef class Vocab: # Set the special tokens up to have arbitrary attributes lex = self.get_by_orth(self.mem, props[ORTH]) token.lex = lex - if TAG in props: - self.morphology.assign_tag(token, props[TAG]) - elif POS in props: - # Don't allow POS to be set without TAG -- this causes problems, - # see #1773 - props.pop(POS) for attr_id, value in props.items(): Token.set_struct_attr(token, attr_id, value) # NORM is the only one that overlaps between the two diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md index 73f8aa71f..b6a9c80b5 100644 --- a/website/docs/api/lemmatizer.md +++ b/website/docs/api/lemmatizer.md @@ -1,102 +1,263 @@ --- title: Lemmatizer -teaser: Assign the base forms of words tag: class -source: spacy/lemmatizer.py +source: spacy/pipeline/lemmatizer.py +new: 3 +teaser: 'Pipeline component for lemmatization' +api_base_class: /api/pipe +api_string_name: lemmatizer +api_trainable: false --- - +## Config and implementation -The `Lemmatizer` supports simple part-of-speech-sensitive suffix rules and -lookup tables. +The default config is defined by the pipeline component factory and describes +how the component should be configured. You can override its settings via the +`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your +[`config.cfg` for training](/usage/training#config). + +For examples of the lookups data formats used by the lookup and rule-based +lemmatizers, see the +[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) repo. + +> #### Example +> +> ```python +> config = {"mode": "rule"} +> nlp.add_pipe("lemmatizer", config=config) +> ``` + +| Setting | Type | Description | Default | +| ----------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ---------- | +| `mode` | str | The lemmatizer mode, e.g. "lookup" or "rule". | `"lookup"` | +| `lookups` | [`Lookups`](/api/lookups) | The lookups object containing the tables such as "lemma_rules", "lemma_index", "lemma_exc" and "lemma_lookup". If `None`, default tables are loaded from `spacy-lookups-data`. | `None` | +| `overwrite` | bool | Whether to overwrite existing lemmas. | `False` | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Not yet implemented:** the model to use. | `None` | + +```python +https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/lemmatizer.py +``` ## Lemmatizer.\_\_init\_\_ {#init tag="method"} -Initialize a `Lemmatizer`. Typically, this happens under the hood within spaCy -when a `Language` subclass and its `Vocab` is initialized. 
-
> #### Example
>
> ```python
-> from spacy.lemmatizer import Lemmatizer
-> from spacy.lookups import Lookups
-> lookups = Lookups()
-> lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
-> lemmatizer = Lemmatizer(lookups)
-> ```
+> # Construction via add_pipe with default model
+> lemmatizer = nlp.add_pipe("lemmatizer")
>
-> For examples of the data format, see the
-> [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) repo.
+> # Construction via add_pipe with custom settings
+> config = {"mode": "rule", "overwrite": True}
+> lemmatizer = nlp.add_pipe("lemmatizer", config=config)
+> ```

-| Name | Type | Description |
-| -------------------------------------- | ------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
-| `lookups` 2.2 | [`Lookups`](/api/lookups) | The lookups object containing the (optional) tables `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. |
+Create a new pipeline instance. In your application, you would normally use a
+shortcut for this and instantiate the component using its string name and
+[`nlp.add_pipe`](/api/language#add_pipe).
+
+| Name | Type | Description |
+| -------------- | ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------- |
+| `vocab` | [`Vocab`](/api/vocab) | The shared vocabulary. |
+| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model (not yet implemented). |
+| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
+| _keyword-only_ | | |
+| `mode` | str | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. |
+| `lookups` | [`Lookups`](/api/lookups) | A lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. Defaults to `None`. |
+| `overwrite` | bool | Whether to overwrite existing lemmas. Defaults to `False`. |

## Lemmatizer.\_\_call\_\_ {#call tag="method"}

-Lemmatize a string.
+Apply the pipe to one document. The document is modified in place, and returned.
+This usually happens under the hood when the `nlp` object is called on a text
+and all pipeline components are applied to the `Doc` in order.

> #### Example
>
> ```python
-> from spacy.lemmatizer import Lemmatizer
-> from spacy.lookups import Lookups
-> lookups = Lookups()
-> lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
-> lemmatizer = Lemmatizer(lookups)
-> lemmas = lemmatizer("ducks", "NOUN")
-> assert lemmas == ["duck"]
+> doc = nlp("This is a sentence.")
+> lemmatizer = nlp.add_pipe("lemmatizer")
+> # This usually happens under the hood
+> processed = lemmatizer(doc)
> ```

-| Name | Type | Description |
-| ------------ | ------------- | -------------------------------------------------------------------------------------------------------- |
-| `string` | str | The string to lemmatize, e.g. the token text. |
-| `univ_pos` | str / int | The token's universal part-of-speech tag. |
-| `morphology` | dict / `None` | Morphological features following the [Universal Dependencies](http://universaldependencies.org/) scheme. |
-| **RETURNS** | list | The available lemmas for the string. |
+| Name | Type | Description |
+| ----------- | ----- | ------------------------ |
+| `doc` | `Doc` | The document to process. |
+| **RETURNS** | `Doc` | The processed document. 
| -## Lemmatizer.lookup {#lookup tag="method" new="2"} +## Lemmatizer.pipe {#pipe tag="method"} -Look up a lemma in the lookup table, if available. If no lemma is found, the +Apply the pipe to a stream of documents. This usually happens under the hood +when the `nlp` object is called on a text and all pipeline components are +applied to the `Doc` in order. + +> #### Example +> +> ```python +> lemmatizer = nlp.add_pipe("lemmatizer") +> for doc in lemmatizer.pipe(docs, batch_size=50): +> pass +> ``` + +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------ | +| `stream` | `Iterable[Doc]` | A stream of documents. | +| _keyword-only_ | | | +| `batch_size` | int | The number of texts to buffer. Defaults to `128`. | +| **YIELDS** | `Doc` | Processed documents in the order of the original text. | + +## Lemmatizer.lookup_lemmatize {#lookup_lemmatize tag="method"} + +Lemmatize a token using a lookup-based approach. If no lemma is found, the original string is returned. Languages can provide a [lookup table](/usage/adding-languages#lemmatizer) via the `Lookups`. -> #### Example -> -> ```python -> lookups = Lookups() -> lookups.add_table("lemma_lookup", {"going": "go"}) -> assert lemmatizer.lookup("going") == "go" -> ``` +| Name | Type | Description | +| ----------- | --------------------- | ------------------------------------- | +| `token` | [`Token`](/api/token) | The token to lemmatize. | +| **RETURNS** | `List[str]` | A list containing one or more lemmas. | -| Name | Type | Description | -| ----------- | ---- | ----------------------------------------------------------------------------------------------------------- | -| `string` | str | The string to look up. | -| `orth` | int | Optional hash of the string to look up. If not set, the string will be used and hashed. Defaults to `None`. | -| **RETURNS** | str | The lemma if the string was found, otherwise the original string. | +## Lemmatizer.rule_lemmatize {#rule_lemmatize tag="method"} + +Lemmatize a token using a rule-based approach. Typically relies on POS tags. + +| Name | Type | Description | +| ----------- | --------------------- | ------------------------------------- | +| `token` | [`Token`](/api/token) | The token to lemmatize. | +| **RETURNS** | `List[str]` | A list containing one or more lemmas. | ## Lemmatizer.is_base_form {#is_base_form tag="method"} Check whether we're dealing with an uninflected paradigm, so we can avoid lemmatization entirely. +| Name | Type | Description | +| ----------- | --------------------- | ------------------------------------------------------------------------------------------------------- | +| `token` | [`Token`](/api/token) | The token to analyze. | +| **RETURNS** | bool | Whether the token's attributes (e.g., part-of-speech tag, morphological features) describe a base form. | + +## Lemmatizer.get_lookups_config {#get_lookups_config tag="classmethod"} + +Returns the lookups configuration settings for a given mode for use in +[`Lemmatizer.load_lookups`](#load_lookups). + +| Name | Type | Description | +| ----------- | ---- | ------------------------------------------------- | +| `mode` | str | The lemmatizer mode. | +| **RETURNS** | dict | The lookups configuration settings for this mode. | + +## Lemmatizer.load_lookups {#load_lookups tag="classmethod"} + +Load and validate lookups tables. If the provided lookups is `None`, load the +default lookups tables according to the language and mode settings. 
Confirm that +all required tables for the language and mode are present. + +| Name | Type | Description | +| ----------- | ------------------------- | ---------------------------------------------------------------------------- | +| `lang` | str | The language. | +| `mode` | str | The lemmatizer mode. | +| `lookups` | [`Lookups`](/api/lookups) | The provided lookups, may be `None` if the default lookups should be loaded. | +| **RETURNS** | [`Lookups`](/api/lookups) | The lookups object. | + +## Lemmatizer.to_disk {#to_disk tag="method"} + +Serialize the pipe to disk. + > #### Example > > ```python -> pos = "verb" -> morph = {"VerbForm": "inf"} -> is_base_form = lemmatizer.is_base_form(pos, morph) -> assert is_base_form == True +> lemmatizer = nlp.add_pipe("lemmatizer") +> lemmatizer.to_disk("/path/to/lemmatizer") > ``` -| Name | Type | Description | -| ------------ | --------- | --------------------------------------------------------------------------------------- | -| `univ_pos` | str / int | The token's universal part-of-speech tag. | -| `morphology` | dict | The token's morphological features. | -| **RETURNS** | bool | Whether the token's part-of-speech tag and morphological features describe a base form. | +| Name | Type | Description | +| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | + +## Lemmatizer.from_disk {#from_disk tag="method"} + +Load the pipe from disk. Modifies the object in place and returns it. + +> #### Example +> +> ```python +> lemmatizer = nlp.add_pipe("lemmatizer") +> lemmatizer.from_disk("/path/to/lemmatizer") +> ``` + +| Name | Type | Description | +| -------------- | --------------- | -------------------------------------------------------------------------- | +| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | `Lemmatizer` | The modified `Lemmatizer` object. | + +## Lemmatizer.to_bytes {#to_bytes tag="method"} + +> #### Example +> +> ```python +> lemmatizer = nlp.add_pipe("lemmatizer") +> lemmatizer_bytes = lemmatizer.to_bytes() +> ``` + +Serialize the pipe to a bytestring. + +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| _keyword-only_ | | | +| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | +| **RETURNS** | bytes | The serialized form of the `Lemmatizer` object. | + +## Lemmatizer.from_bytes {#from_bytes tag="method"} + +Load the pipe from a bytestring. Modifies the object in place and returns it. + +> #### Example +> +> ```python +> lemmatizer_bytes = lemmatizer.to_bytes() +> lemmatizer = nlp.add_pipe("lemmatizer") +> lemmatizer.from_bytes(lemmatizer_bytes) +> ``` + +| Name | Type | Description | +| -------------- | --------------- | ------------------------------------------------------------------------- | +| `bytes_data` | bytes | The data to load from. 
|
+| _keyword-only_ | | |
+| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
+| **RETURNS** | `Lemmatizer` | The `Lemmatizer` object. |
+
+## Lemmatizer.mode {#mode tag="property"}
+
+The lemmatizer mode.
+
+| Name | Type | Description |
+| ----------- | ----- | -------------------- |
+| **RETURNS** | `str` | The lemmatizer mode. |

## Attributes {#attributes}

-| Name | Type | Description |
-| -------------------------------------- | ------------------------- | --------------------------------------------------------------- |
-| `lookups` 2.2 | [`Lookups`](/api/lookups) | The lookups object containing the rules and data, if available. |
+| Name | Type | Description |
+| --------- | ------------------------- | ---------------------- |
+| `vocab` | [`Vocab`](/api/vocab) | The shared vocabulary. |
+| `lookups` | [`Lookups`](/api/lookups) | The lookups object. |
+
+## Serialization fields {#serialization-fields}
+
+During serialization, spaCy will export several data fields used to restore
+different aspects of the object. If needed, you can exclude them from
+serialization by passing in the string names via the `exclude` argument.
+
+> #### Example
+>
+> ```python
+> data = lemmatizer.to_disk("/path", exclude=["vocab"])
+> ```
+
+| Name | Description |
+| --------- | ---------------------------------------------------- |
+| `vocab` | The shared [`Vocab`](/api/vocab). |
+| `lookups` | The lookups. You usually don't want to exclude this. |
diff --git a/website/docs/api/morphology.md b/website/docs/api/morphology.md
index 8fb89c15f..3c5bf6fe4 100644
--- a/website/docs/api/morphology.md
+++ b/website/docs/api/morphology.md
@@ -11,22 +11,19 @@ this class.

## Morphology.\_\_init\_\_ {#init tag="method"}

-Create a Morphology object using the tag map, lemmatizer and exceptions.
+Create a Morphology object.

> #### Example
>
> ```python
> from spacy.morphology import Morphology
>
-> morphology = Morphology(strings, tag_map, lemmatizer)
+> morphology = Morphology(strings)
> ```

-| Name | Type | Description |
-| ------------ | ----------------- | ---------------------------------------------------------------------------------------------------------- |
-| `strings` | `StringStore` | The string store. |
-| `tag_map` | `Dict[str, Dict]` | The tag map. |
-| `lemmatizer` | `Lemmatizer` | The lemmatizer. |
-| `exc` | `Dict[str, Dict]` | A dictionary of exceptions in the format `{tag: {orth: {"POS": "X", "Feat1": "Val1, "Feat2": "Val2", ...}` |
+| Name | Type | Description |
+| --------- | ------------- | ----------------- |
+| `strings` | `StringStore` | The string store. |

## Morphology.add {#add tag="method"}

@@ -62,52 +59,6 @@ Get the FEATS string for the hash of the morphological analysis.
| ------- | ---- | --------------------------------------- |
| `morph` | int | The hash of the morphological analysis. |

-## Morphology.load_tag_map {#load_tag_map tag="method"}
-
-Replace the current tag map with the provided tag map.
-
-| Name | Type | Description |
-| --------- | ----------------- | ------------ |
-| `tag_map` | `Dict[str, Dict]` | The tag map. |
-
-## Morphology.load_morph_exceptions {#load_morph_exceptions tag="method"}
-
-Replace the current morphological exceptions with the provided exceptions.
-
-| Name | Type | Description |
-| ------------- | ----------------- | ----------------------------- |
-| `morph_rules` | `Dict[str, Dict]` | The morphological exceptions.
| - -## Morphology.add_special_case {#add_special_case tag="method"} - -Add a special-case rule to the morphological analyzer. Tokens whose tag and orth -match the rule will receive the specified properties. - -> #### Example -> -> ```python -> attrs = {"POS": "DET", "Definite": "Def"} -> morphology.add_special_case("DT", "the", attrs) -> ``` - -| Name | Type | Description | -| ---------- | ---- | ---------------------------------------------- | -| `tag_str` | str | The fine-grained tag. | -| `orth_str` | str | The token text. | -| `attrs` | dict | The features to assign for this token and tag. | - -## Morphology.exc {#exc tag="property"} - -The current morphological exceptions. - -| Name | Type | Description | -| ---------- | ---- | --------------------------------------------------- | -| **YIELDS** | dict | The current dictionary of morphological exceptions. | - -## Morphology.lemmatize {#lemmatize tag="method"} - -TODO - ## Morphology.feats_to_dict {#feats_to_dict tag="staticmethod"} Convert a string FEATS representation to a dictionary of features and values in diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md index 37ef13453..d9b8f4caf 100644 --- a/website/docs/api/tagger.md +++ b/website/docs/api/tagger.md @@ -47,7 +47,7 @@ https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/tagger.pyx > > # Construction via create_pipe with custom model > config = {"model": {"@architectures": "my_tagger"}} -> parser = nlp.add_pipe("tagger", config=config) +> tagger = nlp.add_pipe("tagger", config=config) > > # Construction from class > from spacy.pipeline import Tagger @@ -285,16 +285,14 @@ Add a new label to the pipe. > #### Example > > ```python -> from spacy.symbols import POS > tagger = nlp.add_pipe("tagger") -> tagger.add_label("MY_LABEL", {POS: "NOUN"}) +> tagger.add_label("MY_LABEL") > ``` -| Name | Type | Description | -| ----------- | ---------------- | --------------------------------------------------------------- | -| `label` | str | The label to add. | -| `values` | `Dict[int, str]` | Optional values to map to the label, e.g. a tag map dictionary. | -| **RETURNS** | int | `0` if the label is already present, otherwise `1`. | +| Name | Type | Description | +| ----------- | ---- | --------------------------------------------------- | +| `label` | str | The label to add. | +| **RETURNS** | int | `0` if the label is already present, otherwise `1`. | ## Tagger.to_disk {#to_disk tag="method"} @@ -369,9 +367,7 @@ Load the pipe from a bytestring. Modifies the object in place and returns it. ## Tagger.labels {#labels tag="property"} -The labels currently added to the component. Note that even for a blank -component, this will always include the built-in coarse-grained part-of-speech -tags by default, e.g. `VERB`, `NOUN` and so on. +The labels currently added to the component. > #### Example > @@ -396,9 +392,8 @@ serialization by passing in the string names via the `exclude` argument. > data = tagger.to_disk("/path", exclude=["vocab"]) > ``` -| Name | Description | -| --------- | ------------------------------------------------------------------------------------------ | -| `vocab` | The shared [`Vocab`](/api/vocab). | -| `cfg` | The config file. You usually don't want to exclude this. | -| `model` | The binary model data. You usually don't want to exclude this. | -| `tag_map` | The [tag map](/usage/adding-languages#tag-map) mapping fine-grained to coarse-grained tag. 
| +| Name | Description | +| ------- | -------------------------------------------------------------- | +| `vocab` | The shared [`Vocab`](/api/vocab). | +| `cfg` | The config file. You usually don't want to exclude this. | +| `model` | The binary model data. You usually don't want to exclude this. | diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md index d5c9b0ff0..7e77762bb 100644 --- a/website/docs/api/vocab.md +++ b/website/docs/api/vocab.md @@ -24,8 +24,6 @@ Create the vocabulary. | Name | Type | Description | | -------------------------------------------- | -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | | `lex_attr_getters` | dict | A dictionary mapping attribute IDs to functions to compute them. Defaults to `None`. | -| `tag_map` | dict | A dictionary mapping fine-grained tags to coarse-grained parts-of-speech, and optionally morphological attributes. | -| `lemmatizer` | object | A lemmatizer. Defaults to `None`. | | `strings` | `StringStore` / list | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. | | `lookups` | `Lookups` | A [`Lookups`](/api/lookups) that stores the `lemma_\*`, `lexeme_norm` and other large lookup tables. Defaults to `None`. | | `lookups_extra` 2.3 | `Lookups` | A [`Lookups`](/api/lookups) that stores the optional `lexeme_cluster`/`lexeme_prob`/`lexeme_sentiment`/`lexeme_settings` lookup tables. Defaults to `None`. |
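Editor's note: the sketch below is not part of the patch. It illustrates how the reworked pieces fit together once the patch is applied, assuming the constructor signature documented in the new `lemmatizer.md` above (`vocab`, `model`, `name`, plus keyword-only `mode`, `lookups` and `overwrite`) and the table names used in the adjusted tests. The tiny lookup table is an illustrative stand-in for the data normally loaded from `spacy-lookups-data`.

```python
# Illustrative sketch only -- not part of this patch.
from spacy.lang.en import English
from spacy.lookups import Lookups
from spacy.pipeline.lemmatizer import Lemmatizer  # module added by this patch

nlp = English()

# Tagger labels are now added without tag map values (see tagger.md above).
tagger = nlp.add_pipe("tagger")
tagger.add_label("NN")

# Hand-rolled lookup table; real pipelines would normally load the
# "lemma_lookup" table from spacy-lookups-data instead.
lookups = Lookups()
lookups.add_table("lemma_lookup", {"dogs": "dog"})

# Construct the component directly in "lookup" mode. The same component can
# also be added via nlp.add_pipe("lemmatizer", config={"mode": "lookup"}),
# in which case the default tables for the pipeline's language are loaded.
lemmatizer = Lemmatizer(
    nlp.vocab, model=None, name="lemmatizer", mode="lookup", lookups=lookups
)

doc = lemmatizer(nlp.make_doc("dogs"))
print(doc[0].lemma_)  # expected: "dog"
```

In `"rule"` mode the component instead expects the `lemma_rules`, `lemma_index` and `lemma_exc` tables (as in the converted regression tests above) and, per `rule_lemmatize` and `is_base_form` in the new API docs, relies on the token's part-of-speech tag and morphological features, so it is typically placed after a component that sets `Token.pos`.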