import pickle import pytest import spacy from spacy import registry, util from spacy.about import __lookups_url__ from spacy.lang.en import English from spacy.lookups import Lookups, load_lookups_data_from_url from ..util import make_tempdir @pytest.fixture def nlp(): @registry.misc("cope_lookups") def cope_lookups(): lookups = Lookups() lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"}) lookups.add_table("lemma_index", {"verb": ("cope", "cop")}) lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}}) lookups.add_table("lemma_rules", {"verb": [["ing", ""]]}) return lookups nlp = English() nlp.config["initialize"]["components"]["lemmatizer"] = { "lookups": {"@misc": "cope_lookups"} } return nlp def test_lemmatizer_init(nlp): lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"}) assert isinstance(lemmatizer.lookups, Lookups) assert not lemmatizer.lookups.tables assert lemmatizer.mode == "lookup" with pytest.raises(ValueError): nlp("test") nlp.initialize() assert lemmatizer.lookups.tables assert nlp("cope")[0].lemma_ == "cope" assert nlp("coped")[0].lemma_ == "cope" # replace any tables from spacy-lookups-data lemmatizer.lookups = Lookups() # lookup with no tables sets text as lemma assert nlp("cope")[0].lemma_ == "cope" assert nlp("coped")[0].lemma_ == "coped" nlp.remove_pipe("lemmatizer") lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "lookup"}) with pytest.raises(ValueError): # Can't initialize without required tables lemmatizer.initialize(lookups=Lookups()) lookups = Lookups() lookups.add_table("lemma_lookup", {}) lemmatizer.initialize(lookups=lookups) def test_lemmatizer_config(nlp): lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"}) nlp.initialize() # warning if no POS assigned doc = nlp.make_doc("coping") with pytest.warns(UserWarning): doc = lemmatizer(doc) # warns once by default doc = lemmatizer(doc) # works with POS doc = nlp.make_doc("coping") assert doc[0].lemma_ == "" doc[0].pos_ = "VERB" doc = lemmatizer(doc) doc = lemmatizer(doc) assert doc[0].text == "coping" assert doc[0].lemma_ == "cope" doc = nlp.make_doc("coping") doc[0].pos_ = "VERB" assert doc[0].lemma_ == "" doc = lemmatizer(doc) assert doc[0].text == "coping" assert doc[0].lemma_ == "cope" def test_lemmatizer_serialize(nlp): lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"}) nlp.initialize() def cope_lookups(): lookups = Lookups() lookups.add_table("lemma_lookup", {"cope": "cope", "coped": "cope"}) lookups.add_table("lemma_index", {"verb": ("cope", "cop")}) lookups.add_table("lemma_exc", {"verb": {"coping": ("cope",)}}) lookups.add_table("lemma_rules", {"verb": [["ing", ""]]}) return lookups nlp2 = English() lemmatizer2 = nlp2.add_pipe("lemmatizer", config={"mode": "rule"}) lemmatizer2.initialize(lookups=cope_lookups()) lemmatizer2.from_bytes(lemmatizer.to_bytes()) assert lemmatizer.to_bytes() == lemmatizer2.to_bytes() assert lemmatizer.lookups.tables == lemmatizer2.lookups.tables # Also test the results are still the same after IO with make_tempdir() as tmp_dir: nlp.to_disk(tmp_dir) nlp2 = util.load_model_from_path(tmp_dir) doc2 = nlp2.make_doc("coping") doc2[0].pos_ = "VERB" assert doc2[0].lemma_ == "" doc2 = lemmatizer2(doc2) assert doc2[0].text == "coping" assert doc2[0].lemma_ == "cope" # Make sure that lemmatizer cache can be pickled pickle.dumps(lemmatizer2) @pytest.mark.parametrize("lang", ("ca", "en")) def test_lemmatizer_load_lookups_from_url(lang): nlp = spacy.blank(lang) lemmatizer = nlp.add_pipe("lemmatizer") req_tables, opt_tables = lemmatizer.get_lookups_config(mode=lemmatizer.mode) lookups = load_lookups_data_from_url( nlp.lang, req_tables + opt_tables, __lookups_url__ ) lemmatizer.initialize(lookups=lookups) assert set(lemmatizer.lookups.tables) == set(req_tables + opt_tables)