spaCy/spacy/lang/nl/lemmatizer.py
Adriane Boyd e962784531
Add Lemmatizer and simplify related components (#5848)
* Add Lemmatizer and simplify related components

* Add `Lemmatizer` pipe with `lookup` and `rule` modes using the
`Lookups` tables (see the sketch after this list)
* Reduce `Tagger` to a simple tagger that sets `Token.tag` (no pos or lemma)
* Reduce `Morphology` to only keep track of morph tags (no tag map, lemmatizer,
or morph rules)
* Remove lemmatizer from `Vocab`
* Adjust many many tests
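
A minimal sketch of what the simplified setup might look like from user
code (the factory names and the `mode` config key follow the description
above; treat the exact keys as assumptions rather than the final API):

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("tagger")  # now a simple tagger that only sets Token.tag
nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
```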

Differences:

* No default lookup lemmas
* No special treatment of TAG in `from_array` and similar required
* Easier to modify labels in a `Tagger` (see the sketch after this list)
* No extra strings added from morphology / tag map
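
For example, with the tag map gone, adding a tagger label should reduce to
a one-liner (a sketch, assuming a pipeline that already has a tagger):

```python
tagger = nlp.get_pipe("tagger")
tagger.add_label("VBZ")  # no tag map entry required anymore
```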

* Fix test

* Initial fix for Lemmatizer config/serialization

* Adjust init test to be more generic

* Adjust init test to force empty Lookups

* Add simple cache to rule-based lemmatizer

* Convert language-specific lemmatizers

Convert language-specific lemmatizers to component lemmatizers. Remove
previous lemmatizer class.

* Fix French and Polish lemmatizers

* Remove outdated UPOS conversions

* Update Russian lemmatizer init in tests

* Add minimal init/run tests for custom lemmatizers

* Add option to overwrite existing lemmas

* Update mode setting, lookup loading, and caching

* Make `mode` an immutable property
* Only enforce strict `load_lookups` for known supported modes
* Move caching into individual `_lemmatize` methods

* Implement `strict` when lang is not found in lookups

* Fix tables/lookups in make_lemmatizer

* Reallow provided lookups and allow for stricter checks

* Add lookups asset to all Lemmatizer pipe tests

* Rename lookups in lemmatizer init test

* Clean up merge

* Refactor lookup table loading

* Add helper `load_lemmatizer_lookups` that loads required and
optional lookup tables based on settings provided by a config.

Additional slight refactor of lookups:

* Add `Lookups.set_table` to set a table from a provided `Table` (sketch below)
* Reorder class definitions to be able to specify type as `Table`
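
A sketch of the new `set_table` usage (the table contents here are
hypothetical):

```python
from spacy.lookups import Lookups, Table

lookups = Lookups()
table = Table(name="lemma_lookup", data={"geiten": "geit"})
lookups.set_table("lemma_lookup", table)
```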

* Move registry assets into test methods

* Refactor lookups tables config

Use class methods within `Lemmatizer` to provide the config for
particular modes and to load the lookups from a config.
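
The Dutch subclass in this file shows the pattern; its class method reports
the tables required for rule mode:

```python
from spacy.lang.nl.lemmatizer import DutchLemmatizer

DutchLemmatizer.get_lookups_config("rule")
# {"required_tables": ["lemma_lookup", "lemma_rules",
#                      "lemma_exc", "lemma_index"]}
```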

* Add pipe and score to lemmatizer

* Simplify Tagger.score

* Add missing import

* Clean up imports and auto-format

* Remove unused kwarg

* Tidy up and auto-format

* Update docstrings for Lemmatizer

Additionally modify `is_base_form` API to take `Token` instead of
individual features.
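
A hedged sketch of the new hook signature (the rule itself is illustrative
only, not spaCy's actual base-form logic):

```python
from spacy.pipeline import Lemmatizer
from spacy.tokens import Token

class CustomLemmatizer(Lemmatizer):
    def is_base_form(self, token: Token) -> bool:
        # The hook now receives the whole Token rather than
        # individual (pos, morphology) features.
        return token.pos_ == "NOUN" and token.tag_ == "NN"
```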

* Update docstrings

* Remove tag map values from Tagger.add_label

* Update API docs

* Fix relative link in Lemmatizer API docs
2020-08-07 15:27:13 +02:00

from typing import List, Dict

from ...pipeline import Lemmatizer
from ...tokens import Token


class DutchLemmatizer(Lemmatizer):
@classmethod
def get_lookups_config(cls, mode: str) -> Dict:
if mode == "rule":
return {
"required_tables": [
"lemma_lookup",
"lemma_rules",
"lemma_exc",
"lemma_index",
],
}
else:
return super().get_lookups_config(mode)

    def lookup_lemmatize(self, token: Token) -> List[str]:
"""Overrides parent method so that a lowercased version of the string
is used to search the lookup table. This is necessary because our
lookup table consists entirely of lowercase keys."""
lookup_table = self.lookups.get_table("lemma_lookup", {})
string = token.text.lower()
return [lookup_table.get(string, string)]
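
    # Example with hypothetical data: if the "lemma_lookup" table contained
    # {"geiten": "geit"}, this method would return ["geit"] for a token with
    # text "Geiten"; strings missing from the table fall back to the
    # lowercased text itself.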

    # Note: CGN does not distinguish AUX verbs, so we treat AUX as VERB.
    def rule_lemmatize(self, token: Token) -> List[str]:
        # Unlike the previous lemmatizer class, the required tables are
        # guaranteed to be loaded, so no 'is None' checks are needed.
# String lowercased from the get-go. All lemmatization results in
# lowercased strings. For most applications, this shouldn't pose
# any problems, and it keeps the exceptions indexes small. If this
# creates problems for proper nouns, we can introduce a check for
# univ_pos == "PROPN".
cache_key = (token.lower, token.pos)
if cache_key in self.cache:
return self.cache[cache_key]
string = token.text
univ_pos = token.pos_.lower()
if univ_pos in ("", "eol", "space"):
forms = [string.lower()]
self.cache[cache_key] = forms
return forms
index_table = self.lookups.get_table("lemma_index", {})
exc_table = self.lookups.get_table("lemma_exc", {})
rules_table = self.lookups.get_table("lemma_rules", {})
index = index_table.get(univ_pos, {})
exceptions = exc_table.get(univ_pos, {})
rules = rules_table.get(univ_pos, {})
string = string.lower()
if univ_pos not in (
"noun",
"verb",
"aux",
"adj",
"adv",
"pron",
"det",
"adp",
"num",
):
forms = [string]
self.cache[cache_key] = forms
return forms
        # string is already a lemma
        if string in index:
            forms = [string]
            self.cache[cache_key] = forms
            return forms
        # string is an irregular token contained in the exceptions index
try:
forms = [exceptions[string][0]]
self.cache[cache_key] = forms
return forms
except KeyError:
pass
# string corresponds to key in lookup table
lookup_table = self.lookups.get_table("lemma_lookup", {})
looked_up_lemma = lookup_table.get(string)
        if looked_up_lemma and looked_up_lemma in index:
forms = [looked_up_lemma]
self.cache[cache_key] = forms
return forms
oov_forms = []
for old, new in rules:
if string.endswith(old):
form = string[: len(string) - len(old)] + new
                if not form:
                    # the rule stripped the whole string; skip this candidate
                    pass
elif form in index:
forms = [form]
self.cache[cache_key] = forms
return forms
else:
oov_forms.append(form)
forms = list(set(oov_forms))
# Back-off through remaining return value candidates.
if forms:
for form in forms:
if form in exceptions:
forms = [form]
self.cache[cache_key] = forms
return forms
if looked_up_lemma:
forms = [looked_up_lemma]
self.cache[cache_key] = forms
return forms
else:
self.cache[cache_key] = forms
return forms
elif looked_up_lemma:
forms = [looked_up_lemma]
self.cache[cache_key] = forms
return forms
else:
forms = [string]
self.cache[cache_key] = forms
return forms
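
# Usage sketch (assumed wiring: the component factory name and config key
# come from the commit description above, and the lookup tables would
# normally come from the spacy-lookups-data package):
#
#     import spacy
#     nlp = spacy.blank("nl")
#     nlp.add_pipe("lemmatizer", config={"mode": "rule"})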