spaCy/spacy/tests/regression/test_issue3501-4000.py
Adriane Boyd e962784531
Add Lemmatizer and simplify related components (#5848)
* Add Lemmatizer and simplify related components

* Add `Lemmatizer` pipe with `lookup` and `rule` modes using the
`Lookups` tables.
* Reduce `Tagger` to a simple tagger that sets `Token.tag` (no pos or lemma)
* Reduce `Morphology` to only keep track of morph tags (no tag map, lemmatizer,
or morph rules)
* Remove lemmatizer from `Vocab`
* Adjust many many tests

Differences:

* No default lookup lemmas
* No special treatment of TAG in `from_array` and similar required
* Easier to modify labels in a `Tagger`
* No extra strings added from morphology / tag map

* Fix test

* Initial fix for Lemmatizer config/serialization

* Adjust init test to be more generic

* Adjust init test to force empty Lookups

* Add simple cache to rule-based lemmatizer

* Convert language-specific lemmatizers

Convert language-specific lemmatizers to component lemmatizers. Remove
previous lemmatizer class.

* Fix French and Polish lemmatizers

* Remove outdated UPOS conversions

* Update Russian lemmatizer init in tests

* Add minimal init/run tests for custom lemmatizers

* Add option to overwrite existing lemmas

* Update mode setting, lookup loading, and caching

* Make `mode` an immutable property
* Only enforce strict `load_lookups` for known supported modes
* Move caching into individual `_lemmatize` methods

* Implement strict when lang is not found in lookups

* Fix tables/lookups in make_lemmatizer

* Reallow provided lookups and allow for stricter checks

* Add lookups asset to all Lemmatizer pipe tests

* Rename lookups in lemmatizer init test

* Clean up merge

* Refactor lookup table loading

* Add helper from `load_lemmatizer_lookups` that loads required and
optional lookups tables based on settings provided by a config.

Additional slight refactor of lookups:

* Add `Lookups.set_table` to set a table from a provided `Table`
* Reorder class definitions to be able to specify type as `Table`

* Move registry assets into test methods

* Refactor lookups tables config

Use class methods within `Lemmatizer` to provide the config for
particular modes and to load the lookups from a config.

* Add pipe and score to lemmatizer

* Simplify Tagger.score

* Add missing import

* Clean up imports and auto-format

* Remove unused kwarg

* Tidy up and auto-format

* Update docstrings for Lemmatizer

Update docstrings for Lemmatizer.

Additionally modify `is_base_form` API to take `Token` instead of
individual features.

* Update docstrings

* Remove tag map values from Tagger.add_label

* Update API docs

* Fix relative link in Lemmatizer API docs
2020-08-07 15:27:13 +02:00

480 lines
18 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pytest
from spacy.language import Language
from spacy.vocab import Vocab
from spacy.pipeline import EntityRuler, DependencyParser
from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
from spacy import displacy, load
from spacy.displacy import parse_deps
from spacy.tokens import Doc, Token
from spacy.matcher import Matcher, PhraseMatcher
from spacy.errors import MatchPatternError
from spacy.util import minibatch
from spacy.gold import Example
from spacy.lang.hi import Hindi
from spacy.lang.es import Spanish
from spacy.lang.en import English
from spacy.attrs import IS_ALPHA
from spacy import registry
from thinc.api import compounding
import spacy
import srsly
import numpy
from ..util import make_tempdir, get_doc
@pytest.mark.parametrize("word", ["don't", "dont", "I'd", "Id"])
def test_issue3521(en_tokenizer, word):
tok = en_tokenizer(word)[1]
# 'not' and 'would' should be stopwords, also in their abbreviated forms
assert tok.is_stop
def test_issue_3526_1(en_vocab):
patterns = [
{"label": "HELLO", "pattern": "hello world"},
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
]
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
ruler_bytes = ruler.to_bytes()
assert len(ruler) == len(patterns)
assert len(ruler.labels) == 4
assert ruler.overwrite
new_ruler = EntityRuler(nlp)
new_ruler = new_ruler.from_bytes(ruler_bytes)
assert len(new_ruler) == len(ruler)
assert len(new_ruler.labels) == 4
assert new_ruler.overwrite == ruler.overwrite
assert new_ruler.ent_id_sep == ruler.ent_id_sep
def test_issue_3526_2(en_vocab):
patterns = [
{"label": "HELLO", "pattern": "hello world"},
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
]
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
new_ruler = EntityRuler(nlp)
new_ruler = new_ruler.from_bytes(bytes_old_style)
assert len(new_ruler) == len(ruler)
for pattern in ruler.patterns:
assert pattern in new_ruler.patterns
assert new_ruler.overwrite is not ruler.overwrite
def test_issue_3526_3(en_vocab):
patterns = [
{"label": "HELLO", "pattern": "hello world"},
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
]
nlp = Language(vocab=en_vocab)
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
with make_tempdir() as tmpdir:
out_file = tmpdir / "entity_ruler"
srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
new_ruler = EntityRuler(nlp).from_disk(out_file)
for pattern in ruler.patterns:
assert pattern in new_ruler.patterns
assert len(new_ruler) == len(ruler)
assert new_ruler.overwrite is not ruler.overwrite
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue_3526_4(en_vocab):
nlp = Language(vocab=en_vocab)
patterns = [{"label": "ORG", "pattern": "Apple"}]
config = {"overwrite_ents": True}
ruler = nlp.add_pipe("entity_ruler", config=config)
ruler.add_patterns(patterns)
with make_tempdir() as tmpdir:
nlp.to_disk(tmpdir)
ruler = nlp.get_pipe("entity_ruler")
assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
assert ruler.overwrite is True
nlp2 = load(tmpdir)
new_ruler = nlp2.get_pipe("entity_ruler")
assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
assert new_ruler.overwrite is True
def test_issue3531():
"""Test that displaCy renderer doesn't require "settings" key."""
example_dep = {
"words": [
{"text": "But", "tag": "CCONJ"},
{"text": "Google", "tag": "PROPN"},
{"text": "is", "tag": "VERB"},
{"text": "starting", "tag": "VERB"},
{"text": "from", "tag": "ADP"},
{"text": "behind.", "tag": "ADV"},
],
"arcs": [
{"start": 0, "end": 3, "label": "cc", "dir": "left"},
{"start": 1, "end": 3, "label": "nsubj", "dir": "left"},
{"start": 2, "end": 3, "label": "aux", "dir": "left"},
{"start": 3, "end": 4, "label": "prep", "dir": "right"},
{"start": 4, "end": 5, "label": "pcomp", "dir": "right"},
],
}
example_ent = {
"text": "But Google is starting from behind.",
"ents": [{"start": 4, "end": 10, "label": "ORG"}],
}
dep_html = displacy.render(example_dep, style="dep", manual=True)
assert dep_html
ent_html = displacy.render(example_ent, style="ent", manual=True)
assert ent_html
def test_issue3540(en_vocab):
words = ["I", "live", "in", "NewYork", "right", "now"]
tensor = numpy.asarray(
[[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
dtype="f",
)
doc = Doc(en_vocab, words=words)
doc.tensor = tensor
gold_text = ["I", "live", "in", "NewYork", "right", "now"]
assert [token.text for token in doc] == gold_text
gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
for i, lemma in enumerate(gold_lemma):
doc[i].lemma_ = lemma
assert [token.lemma_ for token in doc] == gold_lemma
vectors_1 = [token.vector for token in doc]
assert len(vectors_1) == len(doc)
with doc.retokenize() as retokenizer:
heads = [(doc[3], 1), doc[2]]
attrs = {"POS": ["PROPN", "PROPN"], "LEMMA": ["New", "York"], "DEP": ["pobj", "compound"]}
retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
gold_text = ["I", "live", "in", "New", "York", "right", "now"]
assert [token.text for token in doc] == gold_text
gold_lemma = ["I", "live", "in", "New", "York", "right", "now"]
assert [token.lemma_ for token in doc] == gold_lemma
vectors_2 = [token.vector for token in doc]
assert len(vectors_2) == len(doc)
assert vectors_1[0].tolist() == vectors_2[0].tolist()
assert vectors_1[1].tolist() == vectors_2[1].tolist()
assert vectors_1[2].tolist() == vectors_2[2].tolist()
assert vectors_1[4].tolist() == vectors_2[5].tolist()
assert vectors_1[5].tolist() == vectors_2[6].tolist()
def test_issue3549(en_vocab):
"""Test that match pattern validation doesn't raise on empty errors."""
matcher = Matcher(en_vocab, validate=True)
pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
matcher.add("GOOD", [pattern])
with pytest.raises(MatchPatternError):
matcher.add("BAD", [[{"X": "Y"}]])
@pytest.mark.skip("Matching currently only works on strings and integers")
def test_issue3555(en_vocab):
"""Test that custom extensions with default None don't break matcher."""
Token.set_extension("issue3555", default=None)
matcher = Matcher(en_vocab)
pattern = [{"ORTH": "have"}, {"_": {"issue3555": True}}]
matcher.add("TEST", [pattern])
doc = Doc(en_vocab, words=["have", "apple"])
matcher(doc)
def test_issue3611():
""" Test whether adding n-grams in the textcat works even when n > token length of some docs """
unique_classes = ["offensive", "inoffensive"]
x_train = [
"This is an offensive text",
"This is the second offensive text",
"inoff",
]
y_train = ["offensive", "offensive", "inoffensive"]
nlp = spacy.blank("en")
# preparing the data
train_data = []
for text, train_instance in zip(x_train, y_train):
cat_dict = {label: label == train_instance for label in unique_classes}
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
# add a text categorizer component
model = {
"@architectures": "spacy.TextCatBOW.v1",
"exclusive_classes": True,
"ngram_size": 2,
"no_output_layer": False,
}
textcat = nlp.add_pipe("textcat", config={"model": model}, last=True)
for label in unique_classes:
textcat.add_label(label)
# training the network
with nlp.select_pipes(enable="textcat"):
optimizer = nlp.begin_training()
for i in range(3):
losses = {}
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
nlp.update(
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
)
def test_issue3625():
"""Test that default punctuation rules applies to hindi unicode characters"""
nlp = Hindi()
doc = nlp("hi. how हुए. होटल, होटल")
expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
assert [token.text for token in doc] == expected
def test_issue3803():
"""Test that spanish num-like tokens have True for like_num attribute."""
nlp = Spanish()
text = "2 dos 1000 mil 12 doce"
doc = nlp(text)
assert [t.like_num for t in doc] == [True, True, True, True, True, True]
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3830_no_subtok():
"""Test that the parser doesn't have subtok label if not learn_tokens"""
config = {
"learn_tokens": False,
"min_action_freq": 30,
"update_with_oracle_cut_size": 100,
}
model = registry.make_from_config({"model": DEFAULT_PARSER_MODEL}, validate=True)[
"model"
]
parser = DependencyParser(Vocab(), model, **config)
parser.add_label("nsubj")
assert "subtok" not in parser.labels
parser.begin_training(lambda: [])
assert "subtok" not in parser.labels
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3830_with_subtok():
"""Test that the parser does have subtok label if learn_tokens=True."""
config = {
"learn_tokens": True,
"min_action_freq": 30,
"update_with_oracle_cut_size": 100,
}
model = registry.make_from_config({"model": DEFAULT_PARSER_MODEL}, validate=True)[
"model"
]
parser = DependencyParser(Vocab(), model, **config)
parser.add_label("nsubj")
assert "subtok" not in parser.labels
parser.begin_training(lambda: [])
assert "subtok" in parser.labels
def test_issue3839(en_vocab):
"""Test that match IDs returned by the matcher are correct, are in the string """
doc = Doc(en_vocab, words=["terrific", "group", "of", "people"])
matcher = Matcher(en_vocab)
match_id = "PATTERN"
pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}]
pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}]
matcher.add(match_id, [pattern1])
matches = matcher(doc)
assert matches[0][0] == en_vocab.strings[match_id]
matcher = Matcher(en_vocab)
matcher.add(match_id, [pattern2])
matches = matcher(doc)
assert matches[0][0] == en_vocab.strings[match_id]
@pytest.mark.parametrize(
"sentence",
[
"The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.",
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.",
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one",
"Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.",
"It was a missed assignment, but it shouldn't have resulted in a turnover ...",
],
)
def test_issue3869(sentence):
"""Test that the Doc's count_by function works consistently"""
nlp = English()
doc = nlp(sentence)
count = 0
for token in doc:
count += token.is_alpha
assert count == doc.count_by(IS_ALPHA).get(1, 0)
def test_issue3879(en_vocab):
doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
assert len(doc) == 5
pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}]
matcher = Matcher(en_vocab)
matcher.add("TEST", [pattern])
assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test'
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3880():
"""Test that `nlp.pipe()` works when an empty string ends the batch.
Fixed in v7.0.5 of Thinc.
"""
texts = ["hello", "world", "", ""]
nlp = English()
nlp.add_pipe("parser").add_label("dep")
nlp.add_pipe("ner").add_label("PERSON")
nlp.add_pipe("tagger").add_label("NN")
nlp.begin_training()
for doc in nlp.pipe(texts):
pass
def test_issue3882(en_vocab):
"""Test that displaCy doesn't serialize the doc.user_data when making a
copy of the Doc.
"""
doc = Doc(en_vocab, words=["Hello", "world"])
doc.is_parsed = True
doc.user_data["test"] = set()
parse_deps(doc)
def test_issue3951(en_vocab):
"""Test that combinations of optional rules are matched correctly."""
matcher = Matcher(en_vocab)
pattern = [
{"LOWER": "hello"},
{"LOWER": "this", "OP": "?"},
{"OP": "?"},
{"LOWER": "world"},
]
matcher.add("TEST", [pattern])
doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
matches = matcher(doc)
assert len(matches) == 0
def test_issue3959():
""" Ensure that a modified pos attribute is serialized correctly."""
nlp = English()
doc = nlp(
"displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
)
assert doc[0].pos_ == ""
doc[0].pos_ = "NOUN"
assert doc[0].pos_ == "NOUN"
# usually this is already True when starting from proper models instead of blank English
doc.is_tagged = True
with make_tempdir() as tmp_dir:
file_path = tmp_dir / "my_doc"
doc.to_disk(file_path)
doc2 = nlp("")
doc2.from_disk(file_path)
assert doc2[0].pos_ == "NOUN"
def test_issue3962(en_vocab):
""" Ensure that as_doc does not result in out-of-bound access of tokens.
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
# fmt: off
words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."]
heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
deps = ["nsubj", "ccomp", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
# fmt: on
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
span2 = doc[1:5] # "jests at scars ,"
doc2 = span2.as_doc()
doc2_json = doc2.to_json()
assert doc2_json
# head set to itself, being the new artificial root
assert doc2[0].head.text == "jests"
assert doc2[0].dep_ == "dep"
assert doc2[1].head.text == "jests"
assert doc2[1].dep_ == "prep"
assert doc2[2].head.text == "at"
assert doc2[2].dep_ == "pobj"
assert doc2[3].head.text == "jests" # head set to the new artificial root
assert doc2[3].dep_ == "dep"
# We should still have 1 sentence
assert len(list(doc2.sents)) == 1
span3 = doc[6:9] # "never felt a"
doc3 = span3.as_doc()
doc3_json = doc3.to_json()
assert doc3_json
assert doc3[0].head.text == "felt"
assert doc3[0].dep_ == "neg"
assert doc3[1].head.text == "felt"
assert doc3[1].dep_ == "ROOT"
assert doc3[2].head.text == "felt" # head set to ancestor
assert doc3[2].dep_ == "dep"
# We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
assert len(list(doc3.sents)) == 1
def test_issue3962_long(en_vocab):
""" Ensure that as_doc does not result in out-of-bound access of tokens.
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
# fmt: off
words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."]
heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
deps = ["nsubj", "ROOT", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
# fmt: on
two_sent_doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
span2 = two_sent_doc[1:7] # "jests at scars. They never"
doc2 = span2.as_doc()
doc2_json = doc2.to_json()
assert doc2_json
# head set to itself, being the new artificial root (in sentence 1)
assert doc2[0].head.text == "jests"
assert doc2[0].dep_ == "ROOT"
assert doc2[1].head.text == "jests"
assert doc2[1].dep_ == "prep"
assert doc2[2].head.text == "at"
assert doc2[2].dep_ == "pobj"
assert doc2[3].head.text == "jests"
assert doc2[3].dep_ == "punct"
# head set to itself, being the new artificial root (in sentence 2)
assert doc2[4].head.text == "They"
assert doc2[4].dep_ == "dep"
# head set to the new artificial head (in sentence 2)
assert doc2[4].head.text == "They"
assert doc2[4].dep_ == "dep"
# We should still have 2 sentences
sents = list(doc2.sents)
assert len(sents) == 2
assert sents[0].text == "jests at scars ."
assert sents[1].text == "They never"
def test_issue3972(en_vocab):
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs.
"""
matcher = PhraseMatcher(en_vocab)
matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
matches = matcher(doc)
assert len(matches) == 2
# We should have a match for each of the two rules
found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches]
assert "A" in found_ids
assert "B" in found_ids