# coding: utf-8
from __future__ import unicode_literals

import pytest


# Calling the Lemmatizer directly
# Imitates behavior of:
# Tagger.set_annotations()
#     -> vocab.morphology.assign_tag_id()
#         -> Token.tag.__set__
#             -> vocab.morphology.assign_tag(...)
#                 -> ... -> Morphology.assign_tag(...)
#                     -> self.lemmatize(analysis.tag.pos, token.lex.orth, ...)
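
# These tests use the ``nl_lemmatizer`` fixture defined in the shared test
# conftest. A minimal sketch of such a fixture, assuming construction via
# spaCy's ``get_lang_class`` helper (the real conftest may build it
# differently):
#
#     @pytest.fixture(scope="session")
#     def nl_lemmatizer():
#         return get_lang_class("nl").Defaults.create_lemmatizer()
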
noun_irreg_lemmatization_cases = [
    ("volkeren", "volk"),
    ("vaatje", "vat"),
    ("verboden", "verbod"),
    ("ijsje", "ijsje"),
    ("slagen", "slag"),
    ("verdragen", "verdrag"),
    ("verloven", "verlof"),
    ("gebeden", "gebed"),
    ("gaten", "gat"),
    ("staven", "staf"),
    ("aquariums", "aquarium"),
    ("podia", "podium"),
    ("holen", "hol"),
    ("lammeren", "lam"),
    ("bevelen", "bevel"),
    ("wegen", "weg"),
    ("moeilijkheden", "moeilijkheid"),
    ("aanwezigheden", "aanwezigheid"),
    ("goden", "god"),
    ("loten", "lot"),
    ("kaarsen", "kaars"),
    ("leden", "lid"),
    ("glaasje", "glas"),
    ("eieren", "ei"),
    ("vatten", "vat"),
    ("kalveren", "kalf"),
    ("padden", "pad"),
    ("smeden", "smid"),
    ("genen", "gen"),
    ("beenderen", "been"),
]


verb_irreg_lemmatization_cases = [
    ("liep", "lopen"),
    ("hief", "heffen"),
    ("begon", "beginnen"),
    ("sla", "slaan"),
    ("aangekomen", "aankomen"),
    ("sproot", "spruiten"),
    ("waart", "zijn"),
    ("snoof", "snuiven"),
    ("spoot", "spuiten"),
    ("ontbeet", "ontbijten"),
    ("gehouwen", "houwen"),
    ("afgewassen", "afwassen"),
    ("deed", "doen"),
    ("schoven", "schuiven"),
    ("gelogen", "liegen"),
    ("woog", "wegen"),
    ("gebraden", "braden"),
    ("smolten", "smelten"),
    ("riep", "roepen"),
    ("aangedaan", "aandoen"),
    ("vermeden", "vermijden"),
    ("stootten", "stoten"),
    ("ging", "gaan"),
    ("geschoren", "scheren"),
    ("gesponnen", "spinnen"),
    ("reden", "rijden"),
    ("zochten", "zoeken"),
    ("leed", "lijden"),
    ("verzonnen", "verzinnen"),
]


@pytest.mark.parametrize("text,lemma", noun_irreg_lemmatization_cases)
def test_nl_lemmatizer_noun_lemmas_irreg(nl_lemmatizer, text, lemma):
    pos = "noun"
    # Calling the lemmatizer directly yields a list of candidate lemmas;
    # sorting makes the asserted pick deterministic.
    lemmas_pred = nl_lemmatizer(text, pos)
    assert lemma == sorted(lemmas_pred)[0]


@pytest.mark.parametrize("text,lemma", verb_irreg_lemmatization_cases)
def test_nl_lemmatizer_verb_lemmas_irreg(nl_lemmatizer, text, lemma):
    pos = "verb"
    lemmas_pred = nl_lemmatizer(text, pos)
    assert lemma == sorted(lemmas_pred)[0]
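
# Illustrative only, not an exhaustive contract: a direct call such as
# ``nl_lemmatizer("gaten", "noun")`` is expected to return an iterable of
# candidate lemmas, e.g. ``["gat"]``, which is why the tests above sort
# the result before comparing.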


@pytest.mark.skip
@pytest.mark.parametrize("text,lemma", [])
def test_nl_lemmatizer_verb_lemmas_reg(nl_lemmatizer, text, lemma):
    # TODO: add test
    pass


@pytest.mark.skip
@pytest.mark.parametrize("text,lemma", [])
def test_nl_lemmatizer_adjective_lemmas(nl_lemmatizer, text, lemma):
    # TODO: add test
    pass


@pytest.mark.skip
@pytest.mark.parametrize("text,lemma", [])
def test_nl_lemmatizer_determiner_lemmas(nl_lemmatizer, text, lemma):
    # TODO: add test
    pass


@pytest.mark.skip
@pytest.mark.parametrize("text,lemma", [])
def test_nl_lemmatizer_adverb_lemmas(nl_lemmatizer, text, lemma):
    # TODO: add test
    pass


@pytest.mark.skip
@pytest.mark.parametrize("text,lemma", [])
def test_nl_lemmatizer_pronoun_lemmas(nl_lemmatizer, text, lemma):
    # TODO: add test
    pass


# Using the lemma lookup table only
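# Since the lookup-table refactor (#4268), ``lookup`` takes ``(orth, string)``;
# these tests pass ``None`` for the orth id so the lookup is keyed on the raw
# string form.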
@pytest.mark.parametrize("text,lemma", noun_irreg_lemmatization_cases)
def test_nl_lemmatizer_lookup_noun(nl_lemmatizer, text, lemma):
    lemma_pred = nl_lemmatizer.lookup(None, text)
    # Accept either the table lemma or the unchanged input text, since the
    # lookup table does not necessarily cover every form.
    assert lemma_pred in (lemma, text)


@pytest.mark.parametrize("text,lemma", verb_irreg_lemmatization_cases)
def test_nl_lemmatizer_lookup_verb(nl_lemmatizer, text, lemma):
    lemma_pred = nl_lemmatizer.lookup(None, text)
    assert lemma_pred in (lemma, text)