diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index 67966f70e..8b998d216 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -23,6 +23,7 @@ def test_issue2070(): assert len(doc) == 11 +@pytest.mark.filterwarnings("ignore::UserWarning") def test_issue2179(): """Test that spurious 'extra_labels' aren't created when initializing NER.""" nlp = Italian() @@ -134,6 +135,7 @@ def test_issue2464(en_vocab): assert len(matches) == 3 +@pytest.mark.filterwarnings("ignore::UserWarning") def test_issue2482(): """Test we can serialize and deserialize a blank NER or parser model.""" nlp = Italian() diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index 5d504a9c6..768ae33fe 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -138,13 +138,16 @@ def test_issue2782(text, lang_cls): assert doc[0].like_num +@pytest.mark.filterwarnings("ignore::UserWarning") def test_issue2800(): """Test issue that arises when too many labels are added to NER model. Used to cause segfault. """ nlp = English() train_data = [] - train_data.extend([Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})]) + train_data.extend( + [Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})] + ) entity_types = [str(i) for i in range(1000)] ner = nlp.create_pipe("ner") nlp.add_pipe(ner) diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index 1aceba68f..1d5bfcb92 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -88,6 +88,7 @@ def test_issue3199(): assert list(doc[0:3].noun_chunks) == [] +@pytest.mark.filterwarnings("ignore::UserWarning") def test_issue3209(): """Test issue that occurred in spaCy nightly where NER labels were being mapped to classes incorrectly after loading the model, when the labels diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py new file mode 100644 index 000000000..5e2ee902c --- /dev/null +++ b/spacy/tests/regression/test_issue3501-4000.py @@ -0,0 +1,472 @@ +import pytest +from spacy.language import Language +from spacy.vocab import Vocab +from spacy.pipeline import EntityRuler, DependencyParser +from spacy.pipeline.defaults import default_parser +from spacy import displacy, load +from spacy.displacy import parse_deps +from spacy.tokens import Doc, Token +from spacy.matcher import Matcher, PhraseMatcher +from spacy.errors import MatchPatternError +from spacy.util import minibatch +from spacy.gold import Example +from spacy.lang.hi import Hindi +from spacy.lang.es import Spanish +from spacy.lang.en import English +from spacy.attrs import IS_ALPHA +from thinc.api import compounding +import spacy +import srsly +import numpy + +from ..util import make_tempdir, get_doc + + +@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"]) +def test_issue3521(en_tokenizer, word): + tok = en_tokenizer(word)[1] + # 'not' and 'would' should be stopwords, also in their abbreviated forms + assert tok.is_stop + + +def test_issue_3526_1(en_vocab): + patterns = [ + {"label": "HELLO", "pattern": "hello world"}, + {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, + {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, + {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, + {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, + ] + nlp = Language(vocab=en_vocab) + ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) + ruler_bytes = ruler.to_bytes() + assert len(ruler) == len(patterns) + assert len(ruler.labels) == 4 + assert ruler.overwrite + new_ruler = EntityRuler(nlp) + new_ruler = new_ruler.from_bytes(ruler_bytes) + assert len(new_ruler) == len(ruler) + assert len(new_ruler.labels) == 4 + assert new_ruler.overwrite == ruler.overwrite + assert new_ruler.ent_id_sep == ruler.ent_id_sep + + +def test_issue_3526_2(en_vocab): + patterns = [ + {"label": "HELLO", "pattern": "hello world"}, + {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, + {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, + {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, + {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, + ] + nlp = Language(vocab=en_vocab) + ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) + bytes_old_style = srsly.msgpack_dumps(ruler.patterns) + new_ruler = EntityRuler(nlp) + new_ruler = new_ruler.from_bytes(bytes_old_style) + assert len(new_ruler) == len(ruler) + for pattern in ruler.patterns: + assert pattern in new_ruler.patterns + assert new_ruler.overwrite is not ruler.overwrite + + +def test_issue_3526_3(en_vocab): + patterns = [ + {"label": "HELLO", "pattern": "hello world"}, + {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, + {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, + {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, + {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, + ] + nlp = Language(vocab=en_vocab) + ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) + with make_tempdir() as tmpdir: + out_file = tmpdir / "entity_ruler" + srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns) + new_ruler = EntityRuler(nlp).from_disk(out_file) + for pattern in ruler.patterns: + assert pattern in new_ruler.patterns + assert len(new_ruler) == len(ruler) + assert new_ruler.overwrite is not ruler.overwrite + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue_3526_4(en_vocab): + nlp = Language(vocab=en_vocab) + ruler = EntityRuler(nlp, overwrite_ents=True) + ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}]) + nlp.add_pipe(ruler) + with make_tempdir() as tmpdir: + nlp.to_disk(tmpdir) + ruler = nlp.get_pipe("entity_ruler") + assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] + assert ruler.overwrite is True + nlp2 = load(tmpdir) + new_ruler = nlp2.get_pipe("entity_ruler") + assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] + assert new_ruler.overwrite is True + + +def test_issue3531(): + """Test that displaCy renderer doesn't require "settings" key.""" + example_dep = { + "words": [ + {"text": "But", "tag": "CCONJ"}, + {"text": "Google", "tag": "PROPN"}, + {"text": "is", "tag": "VERB"}, + {"text": "starting", "tag": "VERB"}, + {"text": "from", "tag": "ADP"}, + {"text": "behind.", "tag": "ADV"}, + ], + "arcs": [ + {"start": 0, "end": 3, "label": "cc", "dir": "left"}, + {"start": 1, "end": 3, "label": "nsubj", "dir": "left"}, + {"start": 2, "end": 3, "label": "aux", "dir": "left"}, + {"start": 3, "end": 4, "label": "prep", "dir": "right"}, + {"start": 4, "end": 5, "label": "pcomp", "dir": "right"}, + ], + } + example_ent = { + "text": "But Google is starting from behind.", + "ents": [{"start": 4, "end": 10, "label": "ORG"}], + } + dep_html = displacy.render(example_dep, style="dep", manual=True) + assert dep_html + ent_html = displacy.render(example_ent, style="ent", manual=True) + assert ent_html + + +def test_issue3540(en_vocab): + words = ["I", "live", "in", "NewYork", "right", "now"] + tensor = numpy.asarray( + [[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]], + dtype="f", + ) + doc = Doc(en_vocab, words=words) + doc.tensor = tensor + gold_text = ["I", "live", "in", "NewYork", "right", "now"] + assert [token.text for token in doc] == gold_text + gold_lemma = ["I", "live", "in", "NewYork", "right", "now"] + assert [token.lemma_ for token in doc] == gold_lemma + vectors_1 = [token.vector for token in doc] + assert len(vectors_1) == len(doc) + + with doc.retokenize() as retokenizer: + heads = [(doc[3], 1), doc[2]] + attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]} + retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs) + + gold_text = ["I", "live", "in", "New", "York", "right", "now"] + assert [token.text for token in doc] == gold_text + gold_lemma = ["I", "live", "in", "New", "York", "right", "now"] + assert [token.lemma_ for token in doc] == gold_lemma + vectors_2 = [token.vector for token in doc] + assert len(vectors_2) == len(doc) + assert vectors_1[0].tolist() == vectors_2[0].tolist() + assert vectors_1[1].tolist() == vectors_2[1].tolist() + assert vectors_1[2].tolist() == vectors_2[2].tolist() + assert vectors_1[4].tolist() == vectors_2[5].tolist() + assert vectors_1[5].tolist() == vectors_2[6].tolist() + + +def test_issue3549(en_vocab): + """Test that match pattern validation doesn't raise on empty errors.""" + matcher = Matcher(en_vocab, validate=True) + pattern = [{"LOWER": "hello"}, {"LOWER": "world"}] + matcher.add("GOOD", [pattern]) + with pytest.raises(MatchPatternError): + matcher.add("BAD", [[{"X": "Y"}]]) + + +@pytest.mark.xfail +def test_issue3555(en_vocab): + """Test that custom extensions with default None don't break matcher.""" + Token.set_extension("issue3555", default=None) + matcher = Matcher(en_vocab) + pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}] + matcher.add("TEST", [pattern]) + doc = Doc(en_vocab, words=["have", "apple"]) + matcher(doc) + + +def test_issue3611(): + """ Test whether adding n-grams in the textcat works even when n > token length of some docs """ + unique_classes = ["offensive", "inoffensive"] + x_train = [ + "This is an offensive text", + "This is the second offensive text", + "inoff", + ] + y_train = ["offensive", "offensive", "inoffensive"] + nlp = spacy.blank("en") + # preparing the data + train_data = [] + for text, train_instance in zip(x_train, y_train): + cat_dict = {label: label == train_instance for label in unique_classes} + train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict})) + # add a text categorizer component + textcat = nlp.create_pipe( + "textcat", + config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2}, + ) + for label in unique_classes: + textcat.add_label(label) + nlp.add_pipe(textcat, last=True) + # training the network + with nlp.select_pipes(enable="textcat"): + optimizer = nlp.begin_training(X=x_train, Y=y_train) + for i in range(3): + losses = {} + batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) + + for batch in batches: + nlp.update( + examples=batch, sgd=optimizer, drop=0.1, losses=losses, + ) + + +def test_issue3625(): + """Test that default punctuation rules applies to hindi unicode characters""" + nlp = Hindi() + doc = nlp("hi. how हुए. होटल, होटल") + expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"] + assert [token.text for token in doc] == expected + + +def test_issue3803(): + """Test that spanish num-like tokens have True for like_num attribute.""" + nlp = Spanish() + text = "2 dos 1000 mil 12 doce" + doc = nlp(text) + + assert [t.like_num for t in doc] == [True, True, True, True, True, True] + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue3830_no_subtok(): + """Test that the parser doesn't have subtok label if not learn_tokens""" + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } + parser = DependencyParser(Vocab(), default_parser(), **config) + parser.add_label("nsubj") + assert "subtok" not in parser.labels + parser.begin_training(lambda: []) + assert "subtok" not in parser.labels + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue3830_with_subtok(): + """Test that the parser does have subtok label if learn_tokens=True.""" + config = { + "learn_tokens": True, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } + parser = DependencyParser(Vocab(), default_parser(), **config) + parser.add_label("nsubj") + assert "subtok" not in parser.labels + parser.begin_training(lambda: []) + assert "subtok" in parser.labels + + +def test_issue3839(en_vocab): + """Test that match IDs returned by the matcher are correct, are in the string """ + doc = Doc(en_vocab, words=["terrific", "group", "of", "people"]) + matcher = Matcher(en_vocab) + match_id = "PATTERN" + pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}] + pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}] + matcher.add(match_id, [pattern1]) + matches = matcher(doc) + assert matches[0][0] == en_vocab.strings[match_id] + matcher = Matcher(en_vocab) + matcher.add(match_id, [pattern2]) + matches = matcher(doc) + assert matches[0][0] == en_vocab.strings[match_id] + + +@pytest.mark.parametrize( + "sentence", + [ + "The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.", + "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.", + "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one", + "Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.", + "It was a missed assignment, but it shouldn't have resulted in a turnover ...", + ], +) +def test_issue3869(sentence): + """Test that the Doc's count_by function works consistently""" + nlp = English() + doc = nlp(sentence) + count = 0 + for token in doc: + count += token.is_alpha + assert count == doc.count_by(IS_ALPHA).get(1, 0) + + +def test_issue3879(en_vocab): + doc = Doc(en_vocab, words=["This", "is", "a", "test", "."]) + assert len(doc) == 5 + pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}] + matcher = Matcher(en_vocab) + matcher.add("TEST", [pattern]) + assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test' + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue3880(): + """Test that `nlp.pipe()` works when an empty string ends the batch. + + Fixed in v7.0.5 of Thinc. + """ + texts = ["hello", "world", "", ""] + nlp = English() + nlp.add_pipe(nlp.create_pipe("parser")) + nlp.add_pipe(nlp.create_pipe("ner")) + nlp.add_pipe(nlp.create_pipe("tagger")) + nlp.get_pipe("parser").add_label("dep") + nlp.get_pipe("ner").add_label("PERSON") + nlp.get_pipe("tagger").add_label("NN") + nlp.begin_training() + for doc in nlp.pipe(texts): + pass + + +def test_issue3882(en_vocab): + """Test that displaCy doesn't serialize the doc.user_data when making a + copy of the Doc. + """ + doc = Doc(en_vocab, words=["Hello", "world"]) + doc.is_parsed = True + doc.user_data["test"] = set() + parse_deps(doc) + + +def test_issue3951(en_vocab): + """Test that combinations of optional rules are matched correctly.""" + matcher = Matcher(en_vocab) + pattern = [ + {"LOWER": "hello"}, + {"LOWER": "this", "OP": "?"}, + {"OP": "?"}, + {"LOWER": "world"}, + ] + matcher.add("TEST", [pattern]) + doc = Doc(en_vocab, words=["Hello", "my", "new", "world"]) + matches = matcher(doc) + assert len(matches) == 0 + + +def test_issue3959(): + """ Ensure that a modified pos attribute is serialized correctly.""" + nlp = English() + doc = nlp( + "displaCy uses JavaScript, SVG and CSS to show you how computers understand language" + ) + assert doc[0].pos_ == "" + doc[0].pos_ = "NOUN" + assert doc[0].pos_ == "NOUN" + # usually this is already True when starting from proper models instead of blank English + doc.is_tagged = True + with make_tempdir() as tmp_dir: + file_path = tmp_dir / "my_doc" + doc.to_disk(file_path) + doc2 = nlp("") + doc2.from_disk(file_path) + assert doc2[0].pos_ == "NOUN" + + +def test_issue3962(en_vocab): + """ Ensure that as_doc does not result in out-of-bound access of tokens. + This is achieved by setting the head to itself if it would lie out of the span otherwise.""" + # fmt: off + words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."] + heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3] + deps = ["nsubj", "ccomp", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"] + # fmt: on + doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + span2 = doc[1:5] # "jests at scars ," + doc2 = span2.as_doc() + doc2_json = doc2.to_json() + assert doc2_json + # head set to itself, being the new artificial root + assert doc2[0].head.text == "jests" + assert doc2[0].dep_ == "dep" + assert doc2[1].head.text == "jests" + assert doc2[1].dep_ == "prep" + assert doc2[2].head.text == "at" + assert doc2[2].dep_ == "pobj" + assert doc2[3].head.text == "jests" # head set to the new artificial root + assert doc2[3].dep_ == "dep" + # We should still have 1 sentence + assert len(list(doc2.sents)) == 1 + span3 = doc[6:9] # "never felt a" + doc3 = span3.as_doc() + doc3_json = doc3.to_json() + assert doc3_json + assert doc3[0].head.text == "felt" + assert doc3[0].dep_ == "neg" + assert doc3[1].head.text == "felt" + assert doc3[1].dep_ == "ROOT" + assert doc3[2].head.text == "felt" # head set to ancestor + assert doc3[2].dep_ == "dep" + # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound" + assert len(list(doc3.sents)) == 1 + + +def test_issue3962_long(en_vocab): + """ Ensure that as_doc does not result in out-of-bound access of tokens. + This is achieved by setting the head to itself if it would lie out of the span otherwise.""" + # fmt: off + words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."] + heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3] + deps = ["nsubj", "ROOT", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"] + # fmt: on + two_sent_doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + span2 = two_sent_doc[1:7] # "jests at scars. They never" + doc2 = span2.as_doc() + doc2_json = doc2.to_json() + assert doc2_json + # head set to itself, being the new artificial root (in sentence 1) + assert doc2[0].head.text == "jests" + assert doc2[0].dep_ == "ROOT" + assert doc2[1].head.text == "jests" + assert doc2[1].dep_ == "prep" + assert doc2[2].head.text == "at" + assert doc2[2].dep_ == "pobj" + assert doc2[3].head.text == "jests" + assert doc2[3].dep_ == "punct" + # head set to itself, being the new artificial root (in sentence 2) + assert doc2[4].head.text == "They" + assert doc2[4].dep_ == "dep" + # head set to the new artificial head (in sentence 2) + assert doc2[4].head.text == "They" + assert doc2[4].dep_ == "dep" + # We should still have 2 sentences + sents = list(doc2.sents) + assert len(sents) == 2 + assert sents[0].text == "jests at scars ." + assert sents[1].text == "They never" + + +def test_issue3972(en_vocab): + """Test that the PhraseMatcher returns duplicates for duplicate match IDs. + """ + matcher = PhraseMatcher(en_vocab) + matcher.add("A", [Doc(en_vocab, words=["New", "York"])]) + matcher.add("B", [Doc(en_vocab, words=["New", "York"])]) + doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"]) + matches = matcher(doc) + + assert len(matches) == 2 + + # We should have a match for each of the two rules + found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches] + assert "A" in found_ids + assert "B" in found_ids diff --git a/spacy/tests/regression/test_issue3521.py b/spacy/tests/regression/test_issue3521.py deleted file mode 100644 index 3d8ee9922..000000000 --- a/spacy/tests/regression/test_issue3521.py +++ /dev/null @@ -1,8 +0,0 @@ -import pytest - - -@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"]) -def test_issue3521(en_tokenizer, word): - tok = en_tokenizer(word)[1] - # 'not' and 'would' should be stopwords, also in their abbreviated forms - assert tok.is_stop diff --git a/spacy/tests/regression/test_issue3526.py b/spacy/tests/regression/test_issue3526.py deleted file mode 100644 index aa77028fb..000000000 --- a/spacy/tests/regression/test_issue3526.py +++ /dev/null @@ -1,85 +0,0 @@ -import pytest -from spacy.tokens import Span -from spacy.language import Language -from spacy.pipeline import EntityRuler -from spacy import load -import srsly - -from ..util import make_tempdir - - -@pytest.fixture -def patterns(): - return [ - {"label": "HELLO", "pattern": "hello world"}, - {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, - {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, - {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, - {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, - ] - - -@pytest.fixture -def add_ent(): - def add_ent_component(doc): - doc.ents = [Span(doc, 0, 3, label=doc.vocab.strings["ORG"])] - return doc - - return add_ent_component - - -def test_entity_ruler_existing_overwrite_serialize_bytes(patterns, en_vocab): - nlp = Language(vocab=en_vocab) - ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) - ruler_bytes = ruler.to_bytes() - assert len(ruler) == len(patterns) - assert len(ruler.labels) == 4 - assert ruler.overwrite - new_ruler = EntityRuler(nlp) - new_ruler = new_ruler.from_bytes(ruler_bytes) - assert len(new_ruler) == len(ruler) - assert len(new_ruler.labels) == 4 - assert new_ruler.overwrite == ruler.overwrite - assert new_ruler.ent_id_sep == ruler.ent_id_sep - - -def test_entity_ruler_existing_bytes_old_format_safe(patterns, en_vocab): - nlp = Language(vocab=en_vocab) - ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) - bytes_old_style = srsly.msgpack_dumps(ruler.patterns) - new_ruler = EntityRuler(nlp) - new_ruler = new_ruler.from_bytes(bytes_old_style) - assert len(new_ruler) == len(ruler) - for pattern in ruler.patterns: - assert pattern in new_ruler.patterns - assert new_ruler.overwrite is not ruler.overwrite - - -def test_entity_ruler_from_disk_old_format_safe(patterns, en_vocab): - nlp = Language(vocab=en_vocab) - ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) - with make_tempdir() as tmpdir: - out_file = tmpdir / "entity_ruler" - srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns) - new_ruler = EntityRuler(nlp).from_disk(out_file) - for pattern in ruler.patterns: - assert pattern in new_ruler.patterns - assert len(new_ruler) == len(ruler) - assert new_ruler.overwrite is not ruler.overwrite - - -def test_entity_ruler_in_pipeline_from_issue(patterns, en_vocab): - nlp = Language(vocab=en_vocab) - ruler = EntityRuler(nlp, overwrite_ents=True) - - ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}]) - nlp.add_pipe(ruler) - with make_tempdir() as tmpdir: - nlp.to_disk(tmpdir) - ruler = nlp.get_pipe("entity_ruler") - assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] - assert ruler.overwrite is True - nlp2 = load(tmpdir) - new_ruler = nlp2.get_pipe("entity_ruler") - assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] - assert new_ruler.overwrite is True diff --git a/spacy/tests/regression/test_issue3531.py b/spacy/tests/regression/test_issue3531.py deleted file mode 100644 index 4c65a5bfe..000000000 --- a/spacy/tests/regression/test_issue3531.py +++ /dev/null @@ -1,30 +0,0 @@ -from spacy import displacy - - -def test_issue3531(): - """Test that displaCy renderer doesn't require "settings" key.""" - example_dep = { - "words": [ - {"text": "But", "tag": "CCONJ"}, - {"text": "Google", "tag": "PROPN"}, - {"text": "is", "tag": "VERB"}, - {"text": "starting", "tag": "VERB"}, - {"text": "from", "tag": "ADP"}, - {"text": "behind.", "tag": "ADV"}, - ], - "arcs": [ - {"start": 0, "end": 3, "label": "cc", "dir": "left"}, - {"start": 1, "end": 3, "label": "nsubj", "dir": "left"}, - {"start": 2, "end": 3, "label": "aux", "dir": "left"}, - {"start": 3, "end": 4, "label": "prep", "dir": "right"}, - {"start": 4, "end": 5, "label": "pcomp", "dir": "right"}, - ], - } - example_ent = { - "text": "But Google is starting from behind.", - "ents": [{"start": 4, "end": 10, "label": "ORG"}], - } - dep_html = displacy.render(example_dep, style="dep", manual=True) - assert dep_html - ent_html = displacy.render(example_ent, style="ent", manual=True) - assert ent_html diff --git a/spacy/tests/regression/test_issue3540.py b/spacy/tests/regression/test_issue3540.py deleted file mode 100644 index be9e04b0b..000000000 --- a/spacy/tests/regression/test_issue3540.py +++ /dev/null @@ -1,44 +0,0 @@ -from spacy.tokens import Doc - -import numpy as np - - -def test_issue3540(en_vocab): - - words = ["I", "live", "in", "NewYork", "right", "now"] - tensor = np.asarray( - [[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]], - dtype="f", - ) - doc = Doc(en_vocab, words=words) - doc.tensor = tensor - - gold_text = ["I", "live", "in", "NewYork", "right", "now"] - assert [token.text for token in doc] == gold_text - - gold_lemma = ["I", "live", "in", "NewYork", "right", "now"] - assert [token.lemma_ for token in doc] == gold_lemma - - vectors_1 = [token.vector for token in doc] - assert len(vectors_1) == len(doc) - - with doc.retokenize() as retokenizer: - heads = [(doc[3], 1), doc[2]] - attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]} - retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs) - - gold_text = ["I", "live", "in", "New", "York", "right", "now"] - assert [token.text for token in doc] == gold_text - - gold_lemma = ["I", "live", "in", "New", "York", "right", "now"] - assert [token.lemma_ for token in doc] == gold_lemma - - vectors_2 = [token.vector for token in doc] - assert len(vectors_2) == len(doc) - - assert vectors_1[0].tolist() == vectors_2[0].tolist() - assert vectors_1[1].tolist() == vectors_2[1].tolist() - assert vectors_1[2].tolist() == vectors_2[2].tolist() - - assert vectors_1[4].tolist() == vectors_2[5].tolist() - assert vectors_1[5].tolist() == vectors_2[6].tolist() diff --git a/spacy/tests/regression/test_issue3549.py b/spacy/tests/regression/test_issue3549.py deleted file mode 100644 index b3af59c2e..000000000 --- a/spacy/tests/regression/test_issue3549.py +++ /dev/null @@ -1,12 +0,0 @@ -import pytest -from spacy.matcher import Matcher -from spacy.errors import MatchPatternError - - -def test_issue3549(en_vocab): - """Test that match pattern validation doesn't raise on empty errors.""" - matcher = Matcher(en_vocab, validate=True) - pattern = [{"LOWER": "hello"}, {"LOWER": "world"}] - matcher.add("GOOD", [pattern]) - with pytest.raises(MatchPatternError): - matcher.add("BAD", [[{"X": "Y"}]]) diff --git a/spacy/tests/regression/test_issue3555.py b/spacy/tests/regression/test_issue3555.py deleted file mode 100644 index de047bcbc..000000000 --- a/spacy/tests/regression/test_issue3555.py +++ /dev/null @@ -1,14 +0,0 @@ -import pytest -from spacy.tokens import Doc, Token -from spacy.matcher import Matcher - - -@pytest.mark.xfail -def test_issue3555(en_vocab): - """Test that custom extensions with default None don't break matcher.""" - Token.set_extension("issue3555", default=None) - matcher = Matcher(en_vocab) - pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}] - matcher.add("TEST", [pattern]) - doc = Doc(en_vocab, words=["have", "apple"]) - matcher(doc) diff --git a/spacy/tests/regression/test_issue3611.py b/spacy/tests/regression/test_issue3611.py deleted file mode 100644 index ef189c446..000000000 --- a/spacy/tests/regression/test_issue3611.py +++ /dev/null @@ -1,45 +0,0 @@ -import spacy -from spacy.util import minibatch -from thinc.api import compounding -from spacy.gold import Example - - -def test_issue3611(): - """ Test whether adding n-grams in the textcat works even when n > token length of some docs """ - unique_classes = ["offensive", "inoffensive"] - x_train = [ - "This is an offensive text", - "This is the second offensive text", - "inoff", - ] - y_train = ["offensive", "offensive", "inoffensive"] - - nlp = spacy.blank("en") - - # preparing the data - train_data = [] - for text, train_instance in zip(x_train, y_train): - cat_dict = {label: label == train_instance for label in unique_classes} - train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict})) - - # add a text categorizer component - textcat = nlp.create_pipe( - "textcat", - config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2}, - ) - - for label in unique_classes: - textcat.add_label(label) - nlp.add_pipe(textcat, last=True) - - # training the network - with nlp.select_pipes(enable="textcat"): - optimizer = nlp.begin_training(X=x_train, Y=y_train) - for i in range(3): - losses = {} - batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) - - for batch in batches: - nlp.update( - examples=batch, sgd=optimizer, drop=0.1, losses=losses, - ) diff --git a/spacy/tests/regression/test_issue3625.py b/spacy/tests/regression/test_issue3625.py deleted file mode 100644 index 51561b3ac..000000000 --- a/spacy/tests/regression/test_issue3625.py +++ /dev/null @@ -1,9 +0,0 @@ -from spacy.lang.hi import Hindi - - -def test_issue3625(): - """Test that default punctuation rules applies to hindi unicode characters""" - nlp = Hindi() - doc = nlp("hi. how हुए. होटल, होटल") - expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"] - assert [token.text for token in doc] == expected diff --git a/spacy/tests/regression/test_issue3803.py b/spacy/tests/regression/test_issue3803.py deleted file mode 100644 index ab5250edf..000000000 --- a/spacy/tests/regression/test_issue3803.py +++ /dev/null @@ -1,10 +0,0 @@ -from spacy.lang.es import Spanish - - -def test_issue3803(): - """Test that spanish num-like tokens have True for like_num attribute.""" - nlp = Spanish() - text = "2 dos 1000 mil 12 doce" - doc = nlp(text) - - assert [t.like_num for t in doc] == [True, True, True, True, True, True] diff --git a/spacy/tests/regression/test_issue3830.py b/spacy/tests/regression/test_issue3830.py deleted file mode 100644 index 06b7893a7..000000000 --- a/spacy/tests/regression/test_issue3830.py +++ /dev/null @@ -1,34 +0,0 @@ -from spacy.pipeline.pipes import DependencyParser -from spacy.vocab import Vocab - -from spacy.pipeline.defaults import default_parser - - -def test_issue3830_no_subtok(): - """Test that the parser doesn't have subtok label if not learn_tokens""" - config = { - "learn_tokens": False, - "min_action_freq": 30, - "beam_width": 1, - "beam_update_prob": 1.0, - } - parser = DependencyParser(Vocab(), default_parser(), **config) - parser.add_label("nsubj") - assert "subtok" not in parser.labels - parser.begin_training(lambda: []) - assert "subtok" not in parser.labels - - -def test_issue3830_with_subtok(): - """Test that the parser does have subtok label if learn_tokens=True.""" - config = { - "learn_tokens": True, - "min_action_freq": 30, - "beam_width": 1, - "beam_update_prob": 1.0, - } - parser = DependencyParser(Vocab(), default_parser(), **config) - parser.add_label("nsubj") - assert "subtok" not in parser.labels - parser.begin_training(lambda: []) - assert "subtok" in parser.labels diff --git a/spacy/tests/regression/test_issue3839.py b/spacy/tests/regression/test_issue3839.py deleted file mode 100644 index 27b1f5f29..000000000 --- a/spacy/tests/regression/test_issue3839.py +++ /dev/null @@ -1,18 +0,0 @@ -from spacy.matcher import Matcher -from spacy.tokens import Doc - - -def test_issue3839(en_vocab): - """Test that match IDs returned by the matcher are correct, are in the string """ - doc = Doc(en_vocab, words=["terrific", "group", "of", "people"]) - matcher = Matcher(en_vocab) - match_id = "PATTERN" - pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}] - pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}] - matcher.add(match_id, [pattern1]) - matches = matcher(doc) - assert matches[0][0] == en_vocab.strings[match_id] - matcher = Matcher(en_vocab) - matcher.add(match_id, [pattern2]) - matches = matcher(doc) - assert matches[0][0] == en_vocab.strings[match_id] diff --git a/spacy/tests/regression/test_issue3869.py b/spacy/tests/regression/test_issue3869.py deleted file mode 100644 index 0a851e869..000000000 --- a/spacy/tests/regression/test_issue3869.py +++ /dev/null @@ -1,25 +0,0 @@ -import pytest -from spacy.attrs import IS_ALPHA -from spacy.lang.en import English - - -@pytest.mark.parametrize( - "sentence", - [ - "The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.", - "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.", - "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one", - "Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.", - "It was a missed assignment, but it shouldn't have resulted in a turnover ...", - ], -) -def test_issue3869(sentence): - """Test that the Doc's count_by function works consistently""" - nlp = English() - doc = nlp(sentence) - - count = 0 - for token in doc: - count += token.is_alpha - - assert count == doc.count_by(IS_ALPHA).get(1, 0) diff --git a/spacy/tests/regression/test_issue3879.py b/spacy/tests/regression/test_issue3879.py deleted file mode 100644 index 8500c09aa..000000000 --- a/spacy/tests/regression/test_issue3879.py +++ /dev/null @@ -1,11 +0,0 @@ -from spacy.matcher import Matcher -from spacy.tokens import Doc - - -def test_issue3879(en_vocab): - doc = Doc(en_vocab, words=["This", "is", "a", "test", "."]) - assert len(doc) == 5 - pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}] - matcher = Matcher(en_vocab) - matcher.add("TEST", [pattern]) - assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test' diff --git a/spacy/tests/regression/test_issue3880.py b/spacy/tests/regression/test_issue3880.py deleted file mode 100644 index 6e8ab6f43..000000000 --- a/spacy/tests/regression/test_issue3880.py +++ /dev/null @@ -1,21 +0,0 @@ -from spacy.lang.en import English -import pytest - - -@pytest.mark.filterwarnings("ignore::UserWarning") -def test_issue3880(): - """Test that `nlp.pipe()` works when an empty string ends the batch. - - Fixed in v7.0.5 of Thinc. - """ - texts = ["hello", "world", "", ""] - nlp = English() - nlp.add_pipe(nlp.create_pipe("parser")) - nlp.add_pipe(nlp.create_pipe("ner")) - nlp.add_pipe(nlp.create_pipe("tagger")) - nlp.get_pipe("parser").add_label("dep") - nlp.get_pipe("ner").add_label("PERSON") - nlp.get_pipe("tagger").add_label("NN") - nlp.begin_training() - for doc in nlp.pipe(texts): - pass diff --git a/spacy/tests/regression/test_issue3882.py b/spacy/tests/regression/test_issue3882.py deleted file mode 100644 index fa616db1d..000000000 --- a/spacy/tests/regression/test_issue3882.py +++ /dev/null @@ -1,12 +0,0 @@ -from spacy.displacy import parse_deps -from spacy.tokens import Doc - - -def test_issue3882(en_vocab): - """Test that displaCy doesn't serialize the doc.user_data when making a - copy of the Doc. - """ - doc = Doc(en_vocab, words=["Hello", "world"]) - doc.is_parsed = True - doc.user_data["test"] = set() - parse_deps(doc) diff --git a/spacy/tests/regression/test_issue3951.py b/spacy/tests/regression/test_issue3951.py deleted file mode 100644 index 6e4c9eeaa..000000000 --- a/spacy/tests/regression/test_issue3951.py +++ /dev/null @@ -1,17 +0,0 @@ -from spacy.matcher import Matcher -from spacy.tokens import Doc - - -def test_issue3951(en_vocab): - """Test that combinations of optional rules are matched correctly.""" - matcher = Matcher(en_vocab) - pattern = [ - {"LOWER": "hello"}, - {"LOWER": "this", "OP": "?"}, - {"OP": "?"}, - {"LOWER": "world"}, - ] - matcher.add("TEST", [pattern]) - doc = Doc(en_vocab, words=["Hello", "my", "new", "world"]) - matches = matcher(doc) - assert len(matches) == 0 diff --git a/spacy/tests/regression/test_issue3959.py b/spacy/tests/regression/test_issue3959.py deleted file mode 100644 index 7db28a31f..000000000 --- a/spacy/tests/regression/test_issue3959.py +++ /dev/null @@ -1,26 +0,0 @@ -from spacy.lang.en import English -from ..util import make_tempdir - - -def test_issue3959(): - """ Ensure that a modified pos attribute is serialized correctly.""" - nlp = English() - doc = nlp( - "displaCy uses JavaScript, SVG and CSS to show you how computers understand language" - ) - assert doc[0].pos_ == "" - - doc[0].pos_ = "NOUN" - assert doc[0].pos_ == "NOUN" - - # usually this is already True when starting from proper models instead of blank English - doc.is_tagged = True - - with make_tempdir() as tmp_dir: - file_path = tmp_dir / "my_doc" - doc.to_disk(file_path) - - doc2 = nlp("") - doc2.from_disk(file_path) - - assert doc2[0].pos_ == "NOUN" diff --git a/spacy/tests/regression/test_issue3962.py b/spacy/tests/regression/test_issue3962.py deleted file mode 100644 index 971c9b08e..000000000 --- a/spacy/tests/regression/test_issue3962.py +++ /dev/null @@ -1,117 +0,0 @@ -import pytest - -from ..util import get_doc - - -@pytest.fixture -def doc(en_tokenizer): - text = "He jests at scars, that never felt a wound." - heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3] - deps = [ - "nsubj", - "ccomp", - "prep", - "pobj", - "punct", - "nsubj", - "neg", - "ROOT", - "det", - "dobj", - "punct", - ] - tokens = en_tokenizer(text) - return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) - - -def test_issue3962(doc): - """ Ensure that as_doc does not result in out-of-bound access of tokens. - This is achieved by setting the head to itself if it would lie out of the span otherwise.""" - span2 = doc[1:5] # "jests at scars ," - doc2 = span2.as_doc() - doc2_json = doc2.to_json() - assert doc2_json - - assert ( - doc2[0].head.text == "jests" - ) # head set to itself, being the new artificial root - assert doc2[0].dep_ == "dep" - assert doc2[1].head.text == "jests" - assert doc2[1].dep_ == "prep" - assert doc2[2].head.text == "at" - assert doc2[2].dep_ == "pobj" - assert doc2[3].head.text == "jests" # head set to the new artificial root - assert doc2[3].dep_ == "dep" - - # We should still have 1 sentence - assert len(list(doc2.sents)) == 1 - - span3 = doc[6:9] # "never felt a" - doc3 = span3.as_doc() - doc3_json = doc3.to_json() - assert doc3_json - - assert doc3[0].head.text == "felt" - assert doc3[0].dep_ == "neg" - assert doc3[1].head.text == "felt" - assert doc3[1].dep_ == "ROOT" - assert doc3[2].head.text == "felt" # head set to ancestor - assert doc3[2].dep_ == "dep" - - # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound" - assert len(list(doc3.sents)) == 1 - - -@pytest.fixture -def two_sent_doc(en_tokenizer): - text = "He jests at scars. They never felt a wound." - heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3] - deps = [ - "nsubj", - "ROOT", - "prep", - "pobj", - "punct", - "nsubj", - "neg", - "ROOT", - "det", - "dobj", - "punct", - ] - tokens = en_tokenizer(text) - return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) - - -def test_issue3962_long(two_sent_doc): - """ Ensure that as_doc does not result in out-of-bound access of tokens. - This is achieved by setting the head to itself if it would lie out of the span otherwise.""" - span2 = two_sent_doc[1:7] # "jests at scars. They never" - doc2 = span2.as_doc() - doc2_json = doc2.to_json() - assert doc2_json - - assert ( - doc2[0].head.text == "jests" - ) # head set to itself, being the new artificial root (in sentence 1) - assert doc2[0].dep_ == "ROOT" - assert doc2[1].head.text == "jests" - assert doc2[1].dep_ == "prep" - assert doc2[2].head.text == "at" - assert doc2[2].dep_ == "pobj" - assert doc2[3].head.text == "jests" - assert doc2[3].dep_ == "punct" - assert ( - doc2[4].head.text == "They" - ) # head set to itself, being the new artificial root (in sentence 2) - assert doc2[4].dep_ == "dep" - assert ( - doc2[4].head.text == "They" - ) # head set to the new artificial head (in sentence 2) - assert doc2[4].dep_ == "dep" - - # We should still have 2 sentences - sents = list(doc2.sents) - assert len(sents) == 2 - assert sents[0].text == "jests at scars ." - assert sents[1].text == "They never" diff --git a/spacy/tests/regression/test_issue3972.py b/spacy/tests/regression/test_issue3972.py deleted file mode 100644 index fe5388950..000000000 --- a/spacy/tests/regression/test_issue3972.py +++ /dev/null @@ -1,19 +0,0 @@ -from spacy.matcher import PhraseMatcher -from spacy.tokens import Doc - - -def test_issue3972(en_vocab): - """Test that the PhraseMatcher returns duplicates for duplicate match IDs. - """ - matcher = PhraseMatcher(en_vocab) - matcher.add("A", [Doc(en_vocab, words=["New", "York"])]) - matcher.add("B", [Doc(en_vocab, words=["New", "York"])]) - doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"]) - matches = matcher(doc) - - assert len(matches) == 2 - - # We should have a match for each of the two rules - found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches] - assert "A" in found_ids - assert "B" in found_ids diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py new file mode 100644 index 000000000..2981c6428 --- /dev/null +++ b/spacy/tests/regression/test_issue4001-4500.py @@ -0,0 +1,469 @@ +import pytest +from spacy.pipeline import EntityRuler, EntityRecognizer, Pipe +from spacy.pipeline.defaults import default_ner +from spacy.matcher import PhraseMatcher, Matcher +from spacy.tokens import Doc, Span, DocBin +from spacy.gold import Example, Corpus +from spacy.gold.converters import json2docs +from spacy.vocab import Vocab +from spacy.lang.en import English +from spacy.util import minibatch, ensure_path, load_model +from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex +from spacy.tokenizer import Tokenizer +from spacy.lang.el import Greek +from spacy.language import Language +import spacy +from thinc.api import compounding +from collections import defaultdict + +from ..util import make_tempdir + + +def test_issue4002(en_vocab): + """Test that the PhraseMatcher can match on overwritten NORM attributes. + """ + matcher = PhraseMatcher(en_vocab, attr="NORM") + pattern1 = Doc(en_vocab, words=["c", "d"]) + assert [t.norm_ for t in pattern1] == ["c", "d"] + matcher.add("TEST", [pattern1]) + doc = Doc(en_vocab, words=["a", "b", "c", "d"]) + assert [t.norm_ for t in doc] == ["a", "b", "c", "d"] + matches = matcher(doc) + assert len(matches) == 1 + matcher = PhraseMatcher(en_vocab, attr="NORM") + pattern2 = Doc(en_vocab, words=["1", "2"]) + pattern2[0].norm_ = "c" + pattern2[1].norm_ = "d" + assert [t.norm_ for t in pattern2] == ["c", "d"] + matcher.add("TEST", [pattern2]) + matches = matcher(doc) + assert len(matches) == 1 + + +def test_issue4030(): + """ Test whether textcat works fine with empty doc """ + unique_classes = ["offensive", "inoffensive"] + x_train = [ + "This is an offensive text", + "This is the second offensive text", + "inoff", + ] + y_train = ["offensive", "offensive", "inoffensive"] + nlp = spacy.blank("en") + # preparing the data + train_data = [] + for text, train_instance in zip(x_train, y_train): + cat_dict = {label: label == train_instance for label in unique_classes} + train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict})) + # add a text categorizer component + textcat = nlp.create_pipe( + "textcat", + config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2}, + ) + for label in unique_classes: + textcat.add_label(label) + nlp.add_pipe(textcat, last=True) + # training the network + with nlp.select_pipes(enable="textcat"): + optimizer = nlp.begin_training() + for i in range(3): + losses = {} + batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) + + for batch in batches: + nlp.update( + examples=batch, sgd=optimizer, drop=0.1, losses=losses, + ) + # processing of an empty doc should result in 0.0 for all categories + doc = nlp("") + assert doc.cats["offensive"] == 0.0 + assert doc.cats["inoffensive"] == 0.0 + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue4042(): + """Test that serialization of an EntityRuler before NER works fine.""" + nlp = English() + + # add ner pipe + ner = nlp.create_pipe("ner") + ner.add_label("SOME_LABEL") + nlp.add_pipe(ner) + nlp.begin_training() + + # Add entity ruler + ruler = EntityRuler(nlp) + patterns = [ + {"label": "MY_ORG", "pattern": "Apple"}, + {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]}, + ] + ruler.add_patterns(patterns) + nlp.add_pipe(ruler, before="ner") # works fine with "after" + doc1 = nlp("What do you think about Apple ?") + assert doc1.ents[0].label_ == "MY_ORG" + + with make_tempdir() as d: + output_dir = ensure_path(d) + if not output_dir.exists(): + output_dir.mkdir() + nlp.to_disk(output_dir) + + nlp2 = load_model(output_dir) + doc2 = nlp2("What do you think about Apple ?") + assert doc2.ents[0].label_ == "MY_ORG" + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue4042_bug2(): + """ + Test that serialization of an NER works fine when new labels were added. + This is the second bug of two bugs underlying the issue 4042. + """ + nlp1 = English() + vocab = nlp1.vocab + + # add ner pipe + ner1 = nlp1.create_pipe("ner") + ner1.add_label("SOME_LABEL") + nlp1.add_pipe(ner1) + nlp1.begin_training() + + # add a new label to the doc + doc1 = nlp1("What do you think about Apple ?") + assert len(ner1.labels) == 1 + assert "SOME_LABEL" in ner1.labels + apple_ent = Span(doc1, 5, 6, label="MY_ORG") + doc1.ents = list(doc1.ents) + [apple_ent] + + # reapply the NER - at this point it should resize itself + ner1(doc1) + assert len(ner1.labels) == 2 + assert "SOME_LABEL" in ner1.labels + assert "MY_ORG" in ner1.labels + + with make_tempdir() as d: + # assert IO goes fine + output_dir = ensure_path(d) + if not output_dir.exists(): + output_dir.mkdir() + ner1.to_disk(output_dir) + + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } + ner2 = EntityRecognizer(vocab, default_ner(), **config) + ner2.from_disk(output_dir) + assert len(ner2.labels) == 2 + + +def test_issue4054(en_vocab): + """Test that a new blank model can be made with a vocab from file, + and that serialization does not drop the language at any point.""" + nlp1 = English() + vocab1 = nlp1.vocab + with make_tempdir() as d: + vocab_dir = ensure_path(d / "vocab") + if not vocab_dir.exists(): + vocab_dir.mkdir() + vocab1.to_disk(vocab_dir) + vocab2 = Vocab().from_disk(vocab_dir) + print("lang", vocab2.lang) + nlp2 = spacy.blank("en", vocab=vocab2) + nlp_dir = ensure_path(d / "nlp") + if not nlp_dir.exists(): + nlp_dir.mkdir() + nlp2.to_disk(nlp_dir) + nlp3 = load_model(nlp_dir) + assert nlp3.lang == "en" + + +def test_issue4120(en_vocab): + """Test that matches without a final {OP: ?} token are returned.""" + matcher = Matcher(en_vocab) + matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]]) + doc1 = Doc(en_vocab, words=["a"]) + assert len(matcher(doc1)) == 1 # works + doc2 = Doc(en_vocab, words=["a", "b", "c"]) + assert len(matcher(doc2)) == 2 # fixed + matcher = Matcher(en_vocab) + matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]]) + doc3 = Doc(en_vocab, words=["a", "b", "b", "c"]) + assert len(matcher(doc3)) == 2 # works + matcher = Matcher(en_vocab) + matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]]) + doc4 = Doc(en_vocab, words=["a", "b", "b", "c"]) + assert len(matcher(doc4)) == 3 # fixed + + +def test_issue4133(en_vocab): + nlp = English() + vocab_bytes = nlp.vocab.to_bytes() + words = ["Apple", "is", "looking", "at", "buying", "a", "startup"] + pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"] + doc = Doc(en_vocab, words=words) + for i, token in enumerate(doc): + token.pos_ = pos[i] + # usually this is already True when starting from proper models instead of blank English + doc.is_tagged = True + doc_bytes = doc.to_bytes() + vocab = Vocab() + vocab = vocab.from_bytes(vocab_bytes) + doc = Doc(vocab).from_bytes(doc_bytes) + actual = [] + for token in doc: + actual.append(token.pos_) + assert actual == pos + + +def test_issue4190(): + def customize_tokenizer(nlp): + prefix_re = compile_prefix_regex(nlp.Defaults.prefixes) + suffix_re = compile_suffix_regex(nlp.Defaults.suffixes) + infix_re = compile_infix_regex(nlp.Defaults.infixes) + # Remove all exceptions where a single letter is followed by a period (e.g. 'h.') + exceptions = { + k: v + for k, v in dict(nlp.Defaults.tokenizer_exceptions).items() + if not (len(k) == 2 and k[1] == ".") + } + new_tokenizer = Tokenizer( + nlp.vocab, + exceptions, + prefix_search=prefix_re.search, + suffix_search=suffix_re.search, + infix_finditer=infix_re.finditer, + token_match=nlp.tokenizer.token_match, + ) + nlp.tokenizer = new_tokenizer + + test_string = "Test c." + # Load default language + nlp_1 = English() + doc_1a = nlp_1(test_string) + result_1a = [token.text for token in doc_1a] # noqa: F841 + # Modify tokenizer + customize_tokenizer(nlp_1) + doc_1b = nlp_1(test_string) + result_1b = [token.text for token in doc_1b] + # Save and Reload + with make_tempdir() as model_dir: + nlp_1.to_disk(model_dir) + nlp_2 = load_model(model_dir) + # This should be the modified tokenizer + doc_2 = nlp_2(test_string) + result_2 = [token.text for token in doc_2] + assert result_1b == result_2 + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue4267(): + """ Test that running an entity_ruler after ner gives consistent results""" + nlp = English() + ner = nlp.create_pipe("ner") + ner.add_label("PEOPLE") + nlp.add_pipe(ner) + nlp.begin_training() + assert "ner" in nlp.pipe_names + # assert that we have correct IOB annotations + doc1 = nlp("hi") + assert doc1.is_nered + for token in doc1: + assert token.ent_iob == 2 + # add entity ruler and run again + ruler = EntityRuler(nlp) + patterns = [{"label": "SOFTWARE", "pattern": "spacy"}] + ruler.add_patterns(patterns) + nlp.add_pipe(ruler) + assert "entity_ruler" in nlp.pipe_names + assert "ner" in nlp.pipe_names + # assert that we still have correct IOB annotations + doc2 = nlp("hi") + assert doc2.is_nered + for token in doc2: + assert token.ent_iob == 2 + + +def test_issue4272(): + """Test that lookup table can be accessed from Token.lemma if no POS tags + are available.""" + nlp = Greek() + doc = nlp("Χθες") + assert doc[0].lemma_ + + +def test_multiple_predictions(): + class DummyPipe(Pipe): + def __init__(self): + self.model = "dummy_model" + + def predict(self, docs): + return ([1, 2, 3], [4, 5, 6]) + + def set_annotations(self, docs, scores, tensors=None): + return docs + + nlp = Language() + doc = nlp.make_doc("foo") + dummy_pipe = DummyPipe() + dummy_pipe(doc) + + +@pytest.mark.skip(reason="removed Beam stuff during the Example/GoldParse refactor") +def test_issue4313(): + """ This should not crash or exit with some strange error code """ + beam_width = 16 + beam_density = 0.0001 + nlp = English() + config = { + "learn_tokens": False, + "min_action_freq": 30, + "beam_width": 1, + "beam_update_prob": 1.0, + } + ner = EntityRecognizer(nlp.vocab, default_ner(), **config) + ner.add_label("SOME_LABEL") + ner.begin_training([]) + nlp.add_pipe(ner) + + # add a new label to the doc + doc = nlp("What do you think about Apple ?") + assert len(ner.labels) == 1 + assert "SOME_LABEL" in ner.labels + apple_ent = Span(doc, 5, 6, label="MY_ORG") + doc.ents = list(doc.ents) + [apple_ent] + + # ensure the beam_parse still works with the new label + docs = [doc] + beams = nlp.entity.beam_parse( + docs, beam_width=beam_width, beam_density=beam_density + ) + + for doc, beam in zip(docs, beams): + entity_scores = defaultdict(float) + for score, ents in nlp.entity.moves.get_beam_parses(beam): + for start, end, label in ents: + entity_scores[(start, end, label)] += score + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue4348(): + """Test that training the tagger with empty data, doesn't throw errors""" + nlp = English() + example = Example.from_dict(nlp.make_doc(""), {"tags": []}) + TRAIN_DATA = [example, example] + tagger = nlp.create_pipe("tagger") + nlp.add_pipe(tagger) + optimizer = nlp.begin_training() + for i in range(5): + losses = {} + batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) + for batch in batches: + nlp.update(batch, sgd=optimizer, losses=losses) + + +def test_issue4367(): + """Test that docbin init goes well""" + DocBin() + DocBin(attrs=["LEMMA"]) + DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"]) + + +def test_issue4373(): + """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab).""" + matcher = Matcher(Vocab()) + assert isinstance(matcher.vocab, Vocab) + matcher = PhraseMatcher(Vocab()) + assert isinstance(matcher.vocab, Vocab) + + +def test_issue4402(): + json_data = { + "id": 0, + "paragraphs": [ + { + "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.", + "sentences": [ + { + "tokens": [ + {"id": 0, "orth": "How", "ner": "O"}, + {"id": 1, "orth": "should", "ner": "O"}, + {"id": 2, "orth": "I", "ner": "O"}, + {"id": 3, "orth": "cook", "ner": "O"}, + {"id": 4, "orth": "bacon", "ner": "O"}, + {"id": 5, "orth": "in", "ner": "O"}, + {"id": 6, "orth": "an", "ner": "O"}, + {"id": 7, "orth": "oven", "ner": "O"}, + {"id": 8, "orth": "?", "ner": "O"}, + ], + "brackets": [], + }, + { + "tokens": [ + {"id": 9, "orth": "\n", "ner": "O"}, + {"id": 10, "orth": "I", "ner": "O"}, + {"id": 11, "orth": "'ve", "ner": "O"}, + {"id": 12, "orth": "heard", "ner": "O"}, + {"id": 13, "orth": "of", "ner": "O"}, + {"id": 14, "orth": "people", "ner": "O"}, + {"id": 15, "orth": "cooking", "ner": "O"}, + {"id": 16, "orth": "bacon", "ner": "O"}, + {"id": 17, "orth": "in", "ner": "O"}, + {"id": 18, "orth": "an", "ner": "O"}, + {"id": 19, "orth": "oven", "ner": "O"}, + {"id": 20, "orth": ".", "ner": "O"}, + ], + "brackets": [], + }, + ], + "cats": [ + {"label": "baking", "value": 1.0}, + {"label": "not_baking", "value": 0.0}, + ], + }, + { + "raw": "What is the difference between white and brown eggs?\n", + "sentences": [ + { + "tokens": [ + {"id": 0, "orth": "What", "ner": "O"}, + {"id": 1, "orth": "is", "ner": "O"}, + {"id": 2, "orth": "the", "ner": "O"}, + {"id": 3, "orth": "difference", "ner": "O"}, + {"id": 4, "orth": "between", "ner": "O"}, + {"id": 5, "orth": "white", "ner": "O"}, + {"id": 6, "orth": "and", "ner": "O"}, + {"id": 7, "orth": "brown", "ner": "O"}, + {"id": 8, "orth": "eggs", "ner": "O"}, + {"id": 9, "orth": "?", "ner": "O"}, + ], + "brackets": [], + }, + {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []}, + ], + "cats": [ + {"label": "baking", "value": 0.0}, + {"label": "not_baking", "value": 1.0}, + ], + }, + ], + } + nlp = English() + attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"] + with make_tempdir() as tmpdir: + output_file = tmpdir / "test4402.spacy" + docs = json2docs([json_data]) + data = DocBin(docs=docs, attrs=attrs).to_bytes() + with output_file.open("wb") as file_: + file_.write(data) + corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file)) + + train_data = list(corpus.train_dataset(nlp)) + assert len(train_data) == 2 + + split_train_data = [] + for eg in train_data: + split_train_data.extend(eg.split_sents()) + assert len(split_train_data) == 4 diff --git a/spacy/tests/regression/test_issue4002.py b/spacy/tests/regression/test_issue4002.py deleted file mode 100644 index 3ac26d3ab..000000000 --- a/spacy/tests/regression/test_issue4002.py +++ /dev/null @@ -1,23 +0,0 @@ -from spacy.matcher import PhraseMatcher -from spacy.tokens import Doc - - -def test_issue4002(en_vocab): - """Test that the PhraseMatcher can match on overwritten NORM attributes. - """ - matcher = PhraseMatcher(en_vocab, attr="NORM") - pattern1 = Doc(en_vocab, words=["c", "d"]) - assert [t.norm_ for t in pattern1] == ["c", "d"] - matcher.add("TEST", [pattern1]) - doc = Doc(en_vocab, words=["a", "b", "c", "d"]) - assert [t.norm_ for t in doc] == ["a", "b", "c", "d"] - matches = matcher(doc) - assert len(matches) == 1 - matcher = PhraseMatcher(en_vocab, attr="NORM") - pattern2 = Doc(en_vocab, words=["1", "2"]) - pattern2[0].norm_ = "c" - pattern2[1].norm_ = "d" - assert [t.norm_ for t in pattern2] == ["c", "d"] - matcher.add("TEST", [pattern2]) - matches = matcher(doc) - assert len(matches) == 1 diff --git a/spacy/tests/regression/test_issue4030.py b/spacy/tests/regression/test_issue4030.py deleted file mode 100644 index e40565501..000000000 --- a/spacy/tests/regression/test_issue4030.py +++ /dev/null @@ -1,50 +0,0 @@ -import spacy -from spacy.util import minibatch -from thinc.api import compounding -from spacy.gold import Example - - -def test_issue4030(): - """ Test whether textcat works fine with empty doc """ - unique_classes = ["offensive", "inoffensive"] - x_train = [ - "This is an offensive text", - "This is the second offensive text", - "inoff", - ] - y_train = ["offensive", "offensive", "inoffensive"] - - nlp = spacy.blank("en") - - # preparing the data - train_data = [] - for text, train_instance in zip(x_train, y_train): - cat_dict = {label: label == train_instance for label in unique_classes} - train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict})) - - # add a text categorizer component - textcat = nlp.create_pipe( - "textcat", - config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2}, - ) - - for label in unique_classes: - textcat.add_label(label) - nlp.add_pipe(textcat, last=True) - - # training the network - with nlp.select_pipes(enable="textcat"): - optimizer = nlp.begin_training() - for i in range(3): - losses = {} - batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) - - for batch in batches: - nlp.update( - examples=batch, sgd=optimizer, drop=0.1, losses=losses, - ) - - # processing of an empty doc should result in 0.0 for all categories - doc = nlp("") - assert doc.cats["offensive"] == 0.0 - assert doc.cats["inoffensive"] == 0.0 diff --git a/spacy/tests/regression/test_issue4042.py b/spacy/tests/regression/test_issue4042.py deleted file mode 100644 index f47290b92..000000000 --- a/spacy/tests/regression/test_issue4042.py +++ /dev/null @@ -1,85 +0,0 @@ -import spacy -from spacy.pipeline import EntityRecognizer, EntityRuler -from spacy.lang.en import English -from spacy.tokens import Span -from spacy.util import ensure_path -from spacy.pipeline.defaults import default_ner - -from ..util import make_tempdir - - -def test_issue4042(): - """Test that serialization of an EntityRuler before NER works fine.""" - nlp = English() - - # add ner pipe - ner = nlp.create_pipe("ner") - ner.add_label("SOME_LABEL") - nlp.add_pipe(ner) - nlp.begin_training() - - # Add entity ruler - ruler = EntityRuler(nlp) - patterns = [ - {"label": "MY_ORG", "pattern": "Apple"}, - {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]}, - ] - ruler.add_patterns(patterns) - nlp.add_pipe(ruler, before="ner") # works fine with "after" - doc1 = nlp("What do you think about Apple ?") - assert doc1.ents[0].label_ == "MY_ORG" - - with make_tempdir() as d: - output_dir = ensure_path(d) - if not output_dir.exists(): - output_dir.mkdir() - nlp.to_disk(output_dir) - - nlp2 = spacy.load(output_dir) - doc2 = nlp2("What do you think about Apple ?") - assert doc2.ents[0].label_ == "MY_ORG" - - -def test_issue4042_bug2(): - """ - Test that serialization of an NER works fine when new labels were added. - This is the second bug of two bugs underlying the issue 4042. - """ - nlp1 = English() - vocab = nlp1.vocab - - # add ner pipe - ner1 = nlp1.create_pipe("ner") - ner1.add_label("SOME_LABEL") - nlp1.add_pipe(ner1) - nlp1.begin_training() - - # add a new label to the doc - doc1 = nlp1("What do you think about Apple ?") - assert len(ner1.labels) == 1 - assert "SOME_LABEL" in ner1.labels - apple_ent = Span(doc1, 5, 6, label="MY_ORG") - doc1.ents = list(doc1.ents) + [apple_ent] - - # reapply the NER - at this point it should resize itself - ner1(doc1) - assert len(ner1.labels) == 2 - assert "SOME_LABEL" in ner1.labels - assert "MY_ORG" in ner1.labels - - with make_tempdir() as d: - # assert IO goes fine - output_dir = ensure_path(d) - if not output_dir.exists(): - output_dir.mkdir() - ner1.to_disk(output_dir) - - config = { - "learn_tokens": False, - "min_action_freq": 30, - "beam_width": 1, - "beam_update_prob": 1.0, - } - ner2 = EntityRecognizer(vocab, default_ner(), **config) - ner2.from_disk(output_dir) - assert len(ner2.labels) == 2 diff --git a/spacy/tests/regression/test_issue4054.py b/spacy/tests/regression/test_issue4054.py deleted file mode 100644 index c52ded395..000000000 --- a/spacy/tests/regression/test_issue4054.py +++ /dev/null @@ -1,30 +0,0 @@ -from spacy.vocab import Vocab -import spacy -from spacy.lang.en import English -from spacy.util import ensure_path - -from ..util import make_tempdir - - -def test_issue4054(en_vocab): - """Test that a new blank model can be made with a vocab from file, - and that serialization does not drop the language at any point.""" - nlp1 = English() - vocab1 = nlp1.vocab - - with make_tempdir() as d: - vocab_dir = ensure_path(d / "vocab") - if not vocab_dir.exists(): - vocab_dir.mkdir() - vocab1.to_disk(vocab_dir) - - vocab2 = Vocab().from_disk(vocab_dir) - print("lang", vocab2.lang) - nlp2 = spacy.blank("en", vocab=vocab2) - - nlp_dir = ensure_path(d / "nlp") - if not nlp_dir.exists(): - nlp_dir.mkdir() - nlp2.to_disk(nlp_dir) - nlp3 = spacy.load(nlp_dir) - assert nlp3.lang == "en" diff --git a/spacy/tests/regression/test_issue4120.py b/spacy/tests/regression/test_issue4120.py deleted file mode 100644 index 4849aa238..000000000 --- a/spacy/tests/regression/test_issue4120.py +++ /dev/null @@ -1,23 +0,0 @@ -from spacy.matcher import Matcher -from spacy.tokens import Doc - - -def test_issue4120(en_vocab): - """Test that matches without a final {OP: ?} token are returned.""" - matcher = Matcher(en_vocab) - matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]]) - doc1 = Doc(en_vocab, words=["a"]) - assert len(matcher(doc1)) == 1 # works - - doc2 = Doc(en_vocab, words=["a", "b", "c"]) - assert len(matcher(doc2)) == 2 # fixed - - matcher = Matcher(en_vocab) - matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]]) - doc3 = Doc(en_vocab, words=["a", "b", "b", "c"]) - assert len(matcher(doc3)) == 2 # works - - matcher = Matcher(en_vocab) - matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]]) - doc4 = Doc(en_vocab, words=["a", "b", "b", "c"]) - assert len(matcher(doc4)) == 3 # fixed diff --git a/spacy/tests/regression/test_issue4133.py b/spacy/tests/regression/test_issue4133.py deleted file mode 100644 index a726806d7..000000000 --- a/spacy/tests/regression/test_issue4133.py +++ /dev/null @@ -1,28 +0,0 @@ -from spacy.lang.en import English -from spacy.tokens import Doc -from spacy.vocab import Vocab - - -def test_issue4133(en_vocab): - nlp = English() - vocab_bytes = nlp.vocab.to_bytes() - words = ["Apple", "is", "looking", "at", "buying", "a", "startup"] - pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"] - doc = Doc(en_vocab, words=words) - for i, token in enumerate(doc): - token.pos_ = pos[i] - - # usually this is already True when starting from proper models instead of blank English - doc.is_tagged = True - - doc_bytes = doc.to_bytes() - - vocab = Vocab() - vocab = vocab.from_bytes(vocab_bytes) - doc = Doc(vocab).from_bytes(doc_bytes) - - actual = [] - for token in doc: - actual.append(token.pos_) - - assert actual == pos diff --git a/spacy/tests/regression/test_issue4190.py b/spacy/tests/regression/test_issue4190.py deleted file mode 100644 index 97d532d2a..000000000 --- a/spacy/tests/regression/test_issue4190.py +++ /dev/null @@ -1,46 +0,0 @@ -from spacy.lang.en import English -from spacy.tokenizer import Tokenizer -from spacy import util - -from ..util import make_tempdir - - -def test_issue4190(): - test_string = "Test c." - # Load default language - nlp_1 = English() - doc_1a = nlp_1(test_string) - result_1a = [token.text for token in doc_1a] # noqa: F841 - # Modify tokenizer - customize_tokenizer(nlp_1) - doc_1b = nlp_1(test_string) - result_1b = [token.text for token in doc_1b] - # Save and Reload - with make_tempdir() as model_dir: - nlp_1.to_disk(model_dir) - nlp_2 = util.load_model(model_dir) - # This should be the modified tokenizer - doc_2 = nlp_2(test_string) - result_2 = [token.text for token in doc_2] - assert result_1b == result_2 - - -def customize_tokenizer(nlp): - prefix_re = util.compile_prefix_regex(nlp.Defaults.prefixes) - suffix_re = util.compile_suffix_regex(nlp.Defaults.suffixes) - infix_re = util.compile_infix_regex(nlp.Defaults.infixes) - # Remove all exceptions where a single letter is followed by a period (e.g. 'h.') - exceptions = { - k: v - for k, v in dict(nlp.Defaults.tokenizer_exceptions).items() - if not (len(k) == 2 and k[1] == ".") - } - new_tokenizer = Tokenizer( - nlp.vocab, - exceptions, - prefix_search=prefix_re.search, - suffix_search=suffix_re.search, - infix_finditer=infix_re.finditer, - token_match=nlp.tokenizer.token_match, - ) - nlp.tokenizer = new_tokenizer diff --git a/spacy/tests/regression/test_issue4267.py b/spacy/tests/regression/test_issue4267.py deleted file mode 100644 index 891f03b30..000000000 --- a/spacy/tests/regression/test_issue4267.py +++ /dev/null @@ -1,34 +0,0 @@ -from spacy.lang.en import English -from spacy.pipeline import EntityRuler - - -def test_issue4267(): - """ Test that running an entity_ruler after ner gives consistent results""" - nlp = English() - ner = nlp.create_pipe("ner") - ner.add_label("PEOPLE") - nlp.add_pipe(ner) - nlp.begin_training() - - assert "ner" in nlp.pipe_names - - # assert that we have correct IOB annotations - doc1 = nlp("hi") - assert doc1.is_nered - for token in doc1: - assert token.ent_iob == 2 - - # add entity ruler and run again - ruler = EntityRuler(nlp) - patterns = [{"label": "SOFTWARE", "pattern": "spacy"}] - - ruler.add_patterns(patterns) - nlp.add_pipe(ruler) - assert "entity_ruler" in nlp.pipe_names - assert "ner" in nlp.pipe_names - - # assert that we still have correct IOB annotations - doc2 = nlp("hi") - assert doc2.is_nered - for token in doc2: - assert token.ent_iob == 2 diff --git a/spacy/tests/regression/test_issue4272.py b/spacy/tests/regression/test_issue4272.py deleted file mode 100644 index 4bac97a44..000000000 --- a/spacy/tests/regression/test_issue4272.py +++ /dev/null @@ -1,9 +0,0 @@ -from spacy.lang.el import Greek - - -def test_issue4272(): - """Test that lookup table can be accessed from Token.lemma if no POS tags - are available.""" - nlp = Greek() - doc = nlp("Χθες") - assert doc[0].lemma_ diff --git a/spacy/tests/regression/test_issue4278.py b/spacy/tests/regression/test_issue4278.py deleted file mode 100644 index ffbc41226..000000000 --- a/spacy/tests/regression/test_issue4278.py +++ /dev/null @@ -1,25 +0,0 @@ -import pytest -from spacy.language import Language -from spacy.pipeline import Pipe - - -class DummyPipe(Pipe): - def __init__(self): - self.model = "dummy_model" - - def predict(self, docs): - return ([1, 2, 3], [4, 5, 6]) - - def set_annotations(self, docs, scores, tensors=None): - return docs - - -@pytest.fixture -def nlp(): - return Language() - - -def test_multiple_predictions(nlp): - doc = nlp.make_doc("foo") - dummy_pipe = DummyPipe() - dummy_pipe(doc) diff --git a/spacy/tests/regression/test_issue4313.py b/spacy/tests/regression/test_issue4313.py deleted file mode 100644 index 3bddc26ca..000000000 --- a/spacy/tests/regression/test_issue4313.py +++ /dev/null @@ -1,47 +0,0 @@ -from collections import defaultdict - -import pytest - -from spacy.pipeline.defaults import default_ner -from spacy.pipeline import EntityRecognizer - -from spacy.lang.en import English -from spacy.tokens import Span - - -# skipped after removing Beam stuff during the Example/GoldParse refactor -@pytest.mark.skip -def test_issue4313(): - """ This should not crash or exit with some strange error code """ - beam_width = 16 - beam_density = 0.0001 - nlp = English() - config = { - "learn_tokens": False, - "min_action_freq": 30, - "beam_width": 1, - "beam_update_prob": 1.0, - } - ner = EntityRecognizer(nlp.vocab, default_ner(), **config) - ner.add_label("SOME_LABEL") - ner.begin_training([]) - nlp.add_pipe(ner) - - # add a new label to the doc - doc = nlp("What do you think about Apple ?") - assert len(ner.labels) == 1 - assert "SOME_LABEL" in ner.labels - apple_ent = Span(doc, 5, 6, label="MY_ORG") - doc.ents = list(doc.ents) + [apple_ent] - - # ensure the beam_parse still works with the new label - docs = [doc] - beams = nlp.entity.beam_parse( - docs, beam_width=beam_width, beam_density=beam_density - ) - - for doc, beam in zip(docs, beams): - entity_scores = defaultdict(float) - for score, ents in nlp.entity.moves.get_beam_parses(beam): - for start, end, label in ents: - entity_scores[(start, end, label)] += score diff --git a/spacy/tests/regression/test_issue4348.py b/spacy/tests/regression/test_issue4348.py deleted file mode 100644 index 06b03df24..000000000 --- a/spacy/tests/regression/test_issue4348.py +++ /dev/null @@ -1,24 +0,0 @@ -from spacy.gold import Example -from spacy.lang.en import English -from spacy.util import minibatch -from thinc.api import compounding -import pytest - - -@pytest.mark.filterwarnings("ignore::UserWarning") -def test_issue4348(): - """Test that training the tagger with empty data, doesn't throw errors""" - - nlp = English() - example = Example.from_dict(nlp.make_doc(""), {"tags": []}) - TRAIN_DATA = [example, example] - - tagger = nlp.create_pipe("tagger") - nlp.add_pipe(tagger) - - optimizer = nlp.begin_training() - for i in range(5): - losses = {} - batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) - for batch in batches: - nlp.update(batch, sgd=optimizer, losses=losses) diff --git a/spacy/tests/regression/test_issue4367.py b/spacy/tests/regression/test_issue4367.py deleted file mode 100644 index 917847a05..000000000 --- a/spacy/tests/regression/test_issue4367.py +++ /dev/null @@ -1,8 +0,0 @@ -from spacy.tokens import DocBin - - -def test_issue4367(): - """Test that docbin init goes well""" - DocBin() - DocBin(attrs=["LEMMA"]) - DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"]) diff --git a/spacy/tests/regression/test_issue4373.py b/spacy/tests/regression/test_issue4373.py deleted file mode 100644 index dbde1624e..000000000 --- a/spacy/tests/regression/test_issue4373.py +++ /dev/null @@ -1,10 +0,0 @@ -from spacy.matcher import Matcher, PhraseMatcher -from spacy.vocab import Vocab - - -def test_issue4373(): - """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab).""" - matcher = Matcher(Vocab()) - assert isinstance(matcher.vocab, Vocab) - matcher = PhraseMatcher(Vocab()) - assert isinstance(matcher.vocab, Vocab) diff --git a/spacy/tests/regression/test_issue4402.py b/spacy/tests/regression/test_issue4402.py deleted file mode 100644 index 9c596aaf6..000000000 --- a/spacy/tests/regression/test_issue4402.py +++ /dev/null @@ -1,98 +0,0 @@ -from spacy.gold import Corpus -from spacy.lang.en import English - -from ..util import make_tempdir -from ...gold.converters import json2docs -from ...tokens import DocBin - - -def test_issue4402(): - nlp = English() - attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"] - with make_tempdir() as tmpdir: - output_file = tmpdir / "test4402.spacy" - docs = json2docs([json_data]) - data = DocBin(docs=docs, attrs=attrs).to_bytes() - with output_file.open("wb") as file_: - file_.write(data) - corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file)) - - train_data = list(corpus.train_dataset(nlp)) - assert len(train_data) == 2 - - split_train_data = [] - for eg in train_data: - split_train_data.extend(eg.split_sents()) - assert len(split_train_data) == 4 - - -json_data = { - "id": 0, - "paragraphs": [ - { - "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.", - "sentences": [ - { - "tokens": [ - {"id": 0, "orth": "How", "ner": "O"}, - {"id": 1, "orth": "should", "ner": "O"}, - {"id": 2, "orth": "I", "ner": "O"}, - {"id": 3, "orth": "cook", "ner": "O"}, - {"id": 4, "orth": "bacon", "ner": "O"}, - {"id": 5, "orth": "in", "ner": "O"}, - {"id": 6, "orth": "an", "ner": "O"}, - {"id": 7, "orth": "oven", "ner": "O"}, - {"id": 8, "orth": "?", "ner": "O"}, - ], - "brackets": [], - }, - { - "tokens": [ - {"id": 9, "orth": "\n", "ner": "O"}, - {"id": 10, "orth": "I", "ner": "O"}, - {"id": 11, "orth": "'ve", "ner": "O"}, - {"id": 12, "orth": "heard", "ner": "O"}, - {"id": 13, "orth": "of", "ner": "O"}, - {"id": 14, "orth": "people", "ner": "O"}, - {"id": 15, "orth": "cooking", "ner": "O"}, - {"id": 16, "orth": "bacon", "ner": "O"}, - {"id": 17, "orth": "in", "ner": "O"}, - {"id": 18, "orth": "an", "ner": "O"}, - {"id": 19, "orth": "oven", "ner": "O"}, - {"id": 20, "orth": ".", "ner": "O"}, - ], - "brackets": [], - }, - ], - "cats": [ - {"label": "baking", "value": 1.0}, - {"label": "not_baking", "value": 0.0}, - ], - }, - { - "raw": "What is the difference between white and brown eggs?\n", - "sentences": [ - { - "tokens": [ - {"id": 0, "orth": "What", "ner": "O"}, - {"id": 1, "orth": "is", "ner": "O"}, - {"id": 2, "orth": "the", "ner": "O"}, - {"id": 3, "orth": "difference", "ner": "O"}, - {"id": 4, "orth": "between", "ner": "O"}, - {"id": 5, "orth": "white", "ner": "O"}, - {"id": 6, "orth": "and", "ner": "O"}, - {"id": 7, "orth": "brown", "ner": "O"}, - {"id": 8, "orth": "eggs", "ner": "O"}, - {"id": 9, "orth": "?", "ner": "O"}, - ], - "brackets": [], - }, - {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []}, - ], - "cats": [ - {"label": "baking", "value": 0.0}, - {"label": "not_baking", "value": 1.0}, - ], - }, - ], -} diff --git a/spacy/tests/regression/test_issue4501-5000.py b/spacy/tests/regression/test_issue4501-5000.py new file mode 100644 index 000000000..01d7a1dbb --- /dev/null +++ b/spacy/tests/regression/test_issue4501-5000.py @@ -0,0 +1,288 @@ +import pytest +from mock import Mock +from spacy.pipeline import EntityRuler +from spacy.matcher import DependencyMatcher +from spacy.tokens import Doc, Span, DocBin +from spacy.gold import Example +from spacy.gold.converters.conllu2docs import conllu2docs +from spacy.lang.en import English +from spacy.kb import KnowledgeBase +from spacy.vocab import Vocab +from spacy.language import Language +from spacy.util import ensure_path, load_model_from_path +import numpy +import pickle + +from ..util import get_doc, make_tempdir + + +def test_issue4528(en_vocab): + """Test that user_data is correctly serialized in DocBin.""" + doc = Doc(en_vocab, words=["hello", "world"]) + doc.user_data["foo"] = "bar" + # This is how extension attribute values are stored in the user data + doc.user_data[("._.", "foo", None, None)] = "bar" + doc_bin = DocBin(store_user_data=True) + doc_bin.add(doc) + doc_bin_bytes = doc_bin.to_bytes() + new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes) + new_doc = list(new_doc_bin.get_docs(en_vocab))[0] + assert new_doc.user_data["foo"] == "bar" + assert new_doc.user_data[("._.", "foo", None, None)] == "bar" + + +@pytest.mark.parametrize( + "text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])] +) +def test_gold_misaligned(en_tokenizer, text, words): + doc = en_tokenizer(text) + Example.from_dict(doc, {"words": words}) + + +def test_issue4590(en_vocab): + """Test that matches param in on_match method are the same as matches run with no on_match method""" + pattern = [ + {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}}, + { + "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, + "PATTERN": {"ORTH": "fox"}, + }, + { + "SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"}, + "PATTERN": {"ORTH": "fox"}, + }, + ] + + on_match = Mock() + matcher = DependencyMatcher(en_vocab) + matcher.add("pattern", on_match, pattern) + text = "The quick brown fox jumped over the lazy fox" + heads = [3, 2, 1, 1, 0, -1, 2, 1, -3] + deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"] + doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps) + matches = matcher(doc) + on_match_args = on_match.call_args + assert on_match_args[0][3] == matches + + +def test_issue4651_with_phrase_matcher_attr(): + """Test that the EntityRuler PhraseMatcher is deserialize correctly using + the method from_disk when the EntityRuler argument phrase_matcher_attr is + specified. + """ + text = "Spacy is a python library for nlp" + nlp = English() + ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER") + patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}] + ruler.add_patterns(patterns) + nlp.add_pipe(ruler) + doc = nlp(text) + res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents] + nlp_reloaded = English() + with make_tempdir() as d: + file_path = d / "entityruler" + ruler.to_disk(file_path) + ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path) + nlp_reloaded.add_pipe(ruler_reloaded) + doc_reloaded = nlp_reloaded(text) + res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents] + assert res == res_reloaded + + +def test_issue4651_without_phrase_matcher_attr(): + """Test that the EntityRuler PhraseMatcher is deserialize correctly using + the method from_disk when the EntityRuler argument phrase_matcher_attr is + not specified. + """ + text = "Spacy is a python library for nlp" + nlp = English() + ruler = EntityRuler(nlp) + patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}] + ruler.add_patterns(patterns) + nlp.add_pipe(ruler) + doc = nlp(text) + res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents] + nlp_reloaded = English() + with make_tempdir() as d: + file_path = d / "entityruler" + ruler.to_disk(file_path) + ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path) + nlp_reloaded.add_pipe(ruler_reloaded) + doc_reloaded = nlp_reloaded(text) + res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents] + assert res == res_reloaded + + +def test_issue4665(): + """ + conllu2json should not raise an exception if the HEAD column contains an + underscore + """ + input_data = """ +1 [ _ PUNCT -LRB- _ _ punct _ _ +2 This _ DET DT _ _ det _ _ +3 killing _ NOUN NN _ _ nsubj _ _ +4 of _ ADP IN _ _ case _ _ +5 a _ DET DT _ _ det _ _ +6 respected _ ADJ JJ _ _ amod _ _ +7 cleric _ NOUN NN _ _ nmod _ _ +8 will _ AUX MD _ _ aux _ _ +9 be _ AUX VB _ _ aux _ _ +10 causing _ VERB VBG _ _ root _ _ +11 us _ PRON PRP _ _ iobj _ _ +12 trouble _ NOUN NN _ _ dobj _ _ +13 for _ ADP IN _ _ case _ _ +14 years _ NOUN NNS _ _ nmod _ _ +15 to _ PART TO _ _ mark _ _ +16 come _ VERB VB _ _ acl _ _ +17 . _ PUNCT . _ _ punct _ _ +18 ] _ PUNCT -RRB- _ _ punct _ _ +""" + conllu2docs(input_data) + + +def test_issue4674(): + """Test that setting entities with overlapping identifiers does not mess up IO""" + nlp = English() + kb = KnowledgeBase(nlp.vocab, entity_vector_length=3) + vector1 = [0.9, 1.1, 1.01] + vector2 = [1.8, 2.25, 2.01] + with pytest.warns(UserWarning): + kb.set_entities( + entity_list=["Q1", "Q1"], + freq_list=[32, 111], + vector_list=[vector1, vector2], + ) + assert kb.get_size_entities() == 1 + # dumping to file & loading back in + with make_tempdir() as d: + dir_path = ensure_path(d) + if not dir_path.exists(): + dir_path.mkdir() + file_path = dir_path / "kb" + kb.dump(str(file_path)) + kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3) + kb2.load_bulk(str(file_path)) + assert kb2.get_size_entities() == 1 + + +def test_issue4707(): + """Tests that disabled component names are also excluded from nlp.from_disk + by default when loading a model. + """ + nlp = English() + nlp.add_pipe(nlp.create_pipe("sentencizer")) + nlp.add_pipe(nlp.create_pipe("entity_ruler")) + assert nlp.pipe_names == ["sentencizer", "entity_ruler"] + exclude = ["tokenizer", "sentencizer"] + with make_tempdir() as tmpdir: + nlp.to_disk(tmpdir, exclude=exclude) + new_nlp = load_model_from_path(tmpdir, disable=exclude) + assert "sentencizer" not in new_nlp.pipe_names + assert "entity_ruler" in new_nlp.pipe_names + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue4725_1(): + """ Ensure the pickling of the NER goes well""" + vocab = Vocab(vectors_name="test_vocab_add_vector") + nlp = English(vocab=vocab) + ner = nlp.create_pipe("ner", config={"min_action_freq": 342}) + with make_tempdir() as tmp_path: + with (tmp_path / "ner.pkl").open("wb") as file_: + pickle.dump(ner, file_) + assert ner.cfg["min_action_freq"] == 342 + + with (tmp_path / "ner.pkl").open("rb") as file_: + ner2 = pickle.load(file_) + assert ner2.cfg["min_action_freq"] == 342 + + +@pytest.mark.filterwarnings("ignore::UserWarning") +def test_issue4725_2(): + # ensures that this runs correctly and doesn't hang or crash because of the global vectors + # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows) + vocab = Vocab(vectors_name="test_vocab_add_vector") + data = numpy.ndarray((5, 3), dtype="f") + data[0] = 1.0 + data[1] = 2.0 + vocab.set_vector("cat", data[0]) + vocab.set_vector("dog", data[1]) + nlp = English(vocab=vocab) + ner = nlp.create_pipe("ner") + nlp.add_pipe(ner) + nlp.begin_training() + docs = ["Kurt is in London."] * 10 + for _ in nlp.pipe(docs, batch_size=2, n_process=2): + pass + + +def test_issue4849(): + nlp = English() + ruler = EntityRuler( + nlp, + patterns=[ + {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"}, + {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"}, + ], + phrase_matcher_attr="LOWER", + ) + nlp.add_pipe(ruler) + text = """ + The left is starting to take aim at Democratic front-runner Joe Biden. + Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy." + """ + # USING 1 PROCESS + count_ents = 0 + for doc in nlp.pipe([text], n_process=1): + count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) + assert count_ents == 2 + # USING 2 PROCESSES + count_ents = 0 + for doc in nlp.pipe([text], n_process=2): + count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) + assert count_ents == 2 + + +class CustomPipe: + name = "my_pipe" + + def __init__(self): + Span.set_extension("my_ext", getter=self._get_my_ext) + Doc.set_extension("my_ext", default=None) + + def __call__(self, doc): + gathered_ext = [] + for sent in doc.sents: + sent_ext = self._get_my_ext(sent) + sent._.set("my_ext", sent_ext) + gathered_ext.append(sent_ext) + + doc._.set("my_ext", "\n".join(gathered_ext)) + + return doc + + @staticmethod + def _get_my_ext(span): + return str(span.end) + + +def test_issue4903(): + """Ensure that this runs correctly and doesn't hang or crash on Windows / + macOS.""" + nlp = English() + custom_component = CustomPipe() + nlp.add_pipe(nlp.create_pipe("sentencizer")) + nlp.add_pipe(custom_component, after="sentencizer") + + text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."] + docs = list(nlp.pipe(text, n_process=2)) + assert docs[0].text == "I like bananas." + assert docs[1].text == "Do you like them?" + assert docs[2].text == "No, I prefer wasabi." + + +def test_issue4924(): + nlp = Language() + example = Example.from_dict(nlp.make_doc(""), {}) + nlp.evaluate([example]) diff --git a/spacy/tests/regression/test_issue4528.py b/spacy/tests/regression/test_issue4528.py deleted file mode 100644 index 6f96c9f2d..000000000 --- a/spacy/tests/regression/test_issue4528.py +++ /dev/null @@ -1,16 +0,0 @@ -from spacy.tokens import Doc, DocBin - - -def test_issue4528(en_vocab): - """Test that user_data is correctly serialized in DocBin.""" - doc = Doc(en_vocab, words=["hello", "world"]) - doc.user_data["foo"] = "bar" - # This is how extension attribute values are stored in the user data - doc.user_data[("._.", "foo", None, None)] = "bar" - doc_bin = DocBin(store_user_data=True) - doc_bin.add(doc) - doc_bin_bytes = doc_bin.to_bytes() - new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes) - new_doc = list(new_doc_bin.get_docs(en_vocab))[0] - assert new_doc.user_data["foo"] == "bar" - assert new_doc.user_data[("._.", "foo", None, None)] == "bar" diff --git a/spacy/tests/regression/test_issue4529.py b/spacy/tests/regression/test_issue4529.py deleted file mode 100644 index 0708499de..000000000 --- a/spacy/tests/regression/test_issue4529.py +++ /dev/null @@ -1,11 +0,0 @@ -import pytest - -from spacy.gold import Example - - -@pytest.mark.parametrize( - "text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])] -) -def test_gold_misaligned(en_tokenizer, text, words): - doc = en_tokenizer(text) - Example.from_dict(doc, {"words": words}) diff --git a/spacy/tests/regression/test_issue4590.py b/spacy/tests/regression/test_issue4590.py deleted file mode 100644 index fc49c5117..000000000 --- a/spacy/tests/regression/test_issue4590.py +++ /dev/null @@ -1,35 +0,0 @@ -from mock import Mock -from spacy.matcher import DependencyMatcher -from ..util import get_doc - - -def test_issue4590(en_vocab): - """Test that matches param in on_match method are the same as matches run with no on_match method""" - pattern = [ - {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}}, - { - "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, - "PATTERN": {"ORTH": "fox"}, - }, - { - "SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"}, - "PATTERN": {"ORTH": "fox"}, - }, - ] - - on_match = Mock() - - matcher = DependencyMatcher(en_vocab) - matcher.add("pattern", on_match, pattern) - - text = "The quick brown fox jumped over the lazy fox" - heads = [3, 2, 1, 1, 0, -1, 2, 1, -3] - deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"] - - doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps) - - matches = matcher(doc) - - on_match_args = on_match.call_args - - assert on_match_args[0][3] == matches diff --git a/spacy/tests/regression/test_issue4651.py b/spacy/tests/regression/test_issue4651.py deleted file mode 100644 index 3f6c1a57c..000000000 --- a/spacy/tests/regression/test_issue4651.py +++ /dev/null @@ -1,62 +0,0 @@ -from spacy.lang.en import English -from spacy.pipeline import EntityRuler - -from ..util import make_tempdir - - -def test_issue4651_with_phrase_matcher_attr(): - """Test that the EntityRuler PhraseMatcher is deserialize correctly using - the method from_disk when the EntityRuler argument phrase_matcher_attr is - specified. - """ - text = "Spacy is a python library for nlp" - - nlp = English() - ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER") - patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}] - ruler.add_patterns(patterns) - nlp.add_pipe(ruler) - - doc = nlp(text) - res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents] - - nlp_reloaded = English() - with make_tempdir() as d: - file_path = d / "entityruler" - ruler.to_disk(file_path) - ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path) - - nlp_reloaded.add_pipe(ruler_reloaded) - doc_reloaded = nlp_reloaded(text) - res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents] - - assert res == res_reloaded - - -def test_issue4651_without_phrase_matcher_attr(): - """Test that the EntityRuler PhraseMatcher is deserialize correctly using - the method from_disk when the EntityRuler argument phrase_matcher_attr is - not specified. - """ - text = "Spacy is a python library for nlp" - - nlp = English() - ruler = EntityRuler(nlp) - patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}] - ruler.add_patterns(patterns) - nlp.add_pipe(ruler) - - doc = nlp(text) - res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents] - - nlp_reloaded = English() - with make_tempdir() as d: - file_path = d / "entityruler" - ruler.to_disk(file_path) - ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path) - - nlp_reloaded.add_pipe(ruler_reloaded) - doc_reloaded = nlp_reloaded(text) - res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents] - - assert res == res_reloaded diff --git a/spacy/tests/regression/test_issue4665.py b/spacy/tests/regression/test_issue4665.py deleted file mode 100644 index e28d0f44a..000000000 --- a/spacy/tests/regression/test_issue4665.py +++ /dev/null @@ -1,35 +0,0 @@ -import pytest - -# TODO -# from spacy.gold.converters.conllu2docs import conllu2docs - -input_data = """ -1 [ _ PUNCT -LRB- _ _ punct _ _ -2 This _ DET DT _ _ det _ _ -3 killing _ NOUN NN _ _ nsubj _ _ -4 of _ ADP IN _ _ case _ _ -5 a _ DET DT _ _ det _ _ -6 respected _ ADJ JJ _ _ amod _ _ -7 cleric _ NOUN NN _ _ nmod _ _ -8 will _ AUX MD _ _ aux _ _ -9 be _ AUX VB _ _ aux _ _ -10 causing _ VERB VBG _ _ root _ _ -11 us _ PRON PRP _ _ iobj _ _ -12 trouble _ NOUN NN _ _ dobj _ _ -13 for _ ADP IN _ _ case _ _ -14 years _ NOUN NNS _ _ nmod _ _ -15 to _ PART TO _ _ mark _ _ -16 come _ VERB VB _ _ acl _ _ -17 . _ PUNCT . _ _ punct _ _ -18 ] _ PUNCT -RRB- _ _ punct _ _ -""" - - -@pytest.mark.xfail -def test_issue4665(): - """ - conllu2json should not raise an exception if the HEAD column contains an - underscore - """ - pass - # conllu2json(input_data) diff --git a/spacy/tests/regression/test_issue4674.py b/spacy/tests/regression/test_issue4674.py deleted file mode 100644 index 149e1431b..000000000 --- a/spacy/tests/regression/test_issue4674.py +++ /dev/null @@ -1,36 +0,0 @@ -import pytest -from spacy.kb import KnowledgeBase -from spacy.util import ensure_path -from spacy.lang.en import English - -from ..util import make_tempdir - - -def test_issue4674(): - """Test that setting entities with overlapping identifiers does not mess up IO""" - nlp = English() - kb = KnowledgeBase(nlp.vocab, entity_vector_length=3) - - vector1 = [0.9, 1.1, 1.01] - vector2 = [1.8, 2.25, 2.01] - with pytest.warns(UserWarning): - kb.set_entities( - entity_list=["Q1", "Q1"], - freq_list=[32, 111], - vector_list=[vector1, vector2], - ) - - assert kb.get_size_entities() == 1 - - # dumping to file & loading back in - with make_tempdir() as d: - dir_path = ensure_path(d) - if not dir_path.exists(): - dir_path.mkdir() - file_path = dir_path / "kb" - kb.dump(str(file_path)) - - kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3) - kb2.load_bulk(str(file_path)) - - assert kb2.get_size_entities() == 1 diff --git a/spacy/tests/regression/test_issue4707.py b/spacy/tests/regression/test_issue4707.py deleted file mode 100644 index d9798ef84..000000000 --- a/spacy/tests/regression/test_issue4707.py +++ /dev/null @@ -1,20 +0,0 @@ -from spacy.util import load_model_from_path -from spacy.lang.en import English - -from ..util import make_tempdir - - -def test_issue4707(): - """Tests that disabled component names are also excluded from nlp.from_disk - by default when loading a model. - """ - nlp = English() - nlp.add_pipe(nlp.create_pipe("sentencizer")) - nlp.add_pipe(nlp.create_pipe("entity_ruler")) - assert nlp.pipe_names == ["sentencizer", "entity_ruler"] - exclude = ["tokenizer", "sentencizer"] - with make_tempdir() as tmpdir: - nlp.to_disk(tmpdir, exclude=exclude) - new_nlp = load_model_from_path(tmpdir, disable=exclude) - assert "sentencizer" not in new_nlp.pipe_names - assert "entity_ruler" in new_nlp.pipe_names diff --git a/spacy/tests/regression/test_issue4725.py b/spacy/tests/regression/test_issue4725.py deleted file mode 100644 index cdc3c09ca..000000000 --- a/spacy/tests/regression/test_issue4725.py +++ /dev/null @@ -1,41 +0,0 @@ -import pickle -import numpy - -from spacy.lang.en import English -from spacy.vocab import Vocab - -from spacy.tests.util import make_tempdir - - -def test_pickle_ner(): - """ Ensure the pickling of the NER goes well""" - vocab = Vocab(vectors_name="test_vocab_add_vector") - nlp = English(vocab=vocab) - ner = nlp.create_pipe("ner", config={"min_action_freq": 342}) - with make_tempdir() as tmp_path: - with (tmp_path / "ner.pkl").open("wb") as file_: - pickle.dump(ner, file_) - assert ner.cfg["min_action_freq"] == 342 - - with (tmp_path / "ner.pkl").open("rb") as file_: - ner2 = pickle.load(file_) - assert ner2.cfg["min_action_freq"] == 342 - - -def test_issue4725(): - # ensures that this runs correctly and doesn't hang or crash because of the global vectors - # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows) - vocab = Vocab(vectors_name="test_vocab_add_vector") - data = numpy.ndarray((5, 3), dtype="f") - data[0] = 1.0 - data[1] = 2.0 - vocab.set_vector("cat", data[0]) - vocab.set_vector("dog", data[1]) - - nlp = English(vocab=vocab) - ner = nlp.create_pipe("ner") - nlp.add_pipe(ner) - nlp.begin_training() - docs = ["Kurt is in London."] * 10 - for _ in nlp.pipe(docs, batch_size=2, n_process=2): - pass diff --git a/spacy/tests/regression/test_issue4849.py b/spacy/tests/regression/test_issue4849.py deleted file mode 100644 index ddbf6f7a0..000000000 --- a/spacy/tests/regression/test_issue4849.py +++ /dev/null @@ -1,34 +0,0 @@ -from spacy.lang.en import English -from spacy.pipeline import EntityRuler - - -def test_issue4849(): - nlp = English() - - ruler = EntityRuler( - nlp, - patterns=[ - {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"}, - {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"}, - ], - phrase_matcher_attr="LOWER", - ) - - nlp.add_pipe(ruler) - - text = """ - The left is starting to take aim at Democratic front-runner Joe Biden. - Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy." - """ - - # USING 1 PROCESS - count_ents = 0 - for doc in nlp.pipe([text], n_process=1): - count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) - assert count_ents == 2 - - # USING 2 PROCESSES - count_ents = 0 - for doc in nlp.pipe([text], n_process=2): - count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) - assert count_ents == 2 diff --git a/spacy/tests/regression/test_issue4903.py b/spacy/tests/regression/test_issue4903.py deleted file mode 100644 index a3dff16aa..000000000 --- a/spacy/tests/regression/test_issue4903.py +++ /dev/null @@ -1,40 +0,0 @@ -from spacy.lang.en import English -from spacy.tokens import Span, Doc - - -class CustomPipe: - name = "my_pipe" - - def __init__(self): - Span.set_extension("my_ext", getter=self._get_my_ext) - Doc.set_extension("my_ext", default=None) - - def __call__(self, doc): - gathered_ext = [] - for sent in doc.sents: - sent_ext = self._get_my_ext(sent) - sent._.set("my_ext", sent_ext) - gathered_ext.append(sent_ext) - - doc._.set("my_ext", "\n".join(gathered_ext)) - - return doc - - @staticmethod - def _get_my_ext(span): - return str(span.end) - - -def test_issue4903(): - # ensures that this runs correctly and doesn't hang or crash on Windows / macOS - - nlp = English() - custom_component = CustomPipe() - nlp.add_pipe(nlp.create_pipe("sentencizer")) - nlp.add_pipe(custom_component, after="sentencizer") - - text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."] - docs = list(nlp.pipe(text, n_process=2)) - assert docs[0].text == "I like bananas." - assert docs[1].text == "Do you like them?" - assert docs[2].text == "No, I prefer wasabi." diff --git a/spacy/tests/regression/test_issue4924.py b/spacy/tests/regression/test_issue4924.py deleted file mode 100644 index c3d3c4326..000000000 --- a/spacy/tests/regression/test_issue4924.py +++ /dev/null @@ -1,8 +0,0 @@ -from spacy.gold import Example -from spacy.language import Language - - -def test_issue4924(): - nlp = Language() - example = Example.from_dict(nlp.make_doc(""), {}) - nlp.evaluate([example]) diff --git a/spacy/tests/regression/test_issue5152.py b/spacy/tests/regression/test_issue5152.py index a9a57746d..3c1cee5c3 100644 --- a/spacy/tests/regression/test_issue5152.py +++ b/spacy/tests/regression/test_issue5152.py @@ -1,6 +1,8 @@ +import pytest from spacy.lang.en import English +@pytest.mark.filterwarnings("ignore::UserWarning") def test_issue5152(): # Test that the comparison between a Span and a Token, goes well # There was a bug when the number of tokens in the span equaled the number of characters in the token (!) @@ -8,7 +10,6 @@ def test_issue5152(): text = nlp("Talk about being boring!") text_var = nlp("Talk of being boring!") y = nlp("Let") - span = text[0:3] # Talk about being span_2 = text[0:3] # Talk about being span_3 = text_var[0:3] # Talk of being diff --git a/spacy/tests/regression/test_issue5230.py b/spacy/tests/regression/test_issue5230.py index 9ffa3862c..86020bf17 100644 --- a/spacy/tests/regression/test_issue5230.py +++ b/spacy/tests/regression/test_issue5230.py @@ -63,7 +63,8 @@ def tagger(): # need to add model for two reasons: # 1. no model leads to error in serialization, # 2. the affected line is the one for model serialization - tagger.begin_training(pipeline=nlp.pipeline) + with pytest.warns(UserWarning): + tagger.begin_training(pipeline=nlp.pipeline) return tagger