diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a4d321aa3..9a7d0744a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -143,15 +143,25 @@ Changes to `.py` files will be effective immediately. ### Fixing bugs When fixing a bug, first create an -[issue](https://github.com/explosion/spaCy/issues) if one does not already exist. -The description text can be very short – we don't want to make this too +[issue](https://github.com/explosion/spaCy/issues) if one does not already +exist. The description text can be very short – we don't want to make this too bureaucratic. -Next, create a test file named `test_issue[ISSUE NUMBER].py` in the -[`spacy/tests/regression`](spacy/tests/regression) folder. Test for the bug -you're fixing, and make sure the test fails. Next, add and commit your test file -referencing the issue number in the commit message. Finally, fix the bug, make -sure your test passes and reference the issue in your commit message. +Next, add a test to the relevant file in the +[`spacy/tests`](spacy/tests) folder. Then add a [pytest +mark](https://docs.pytest.org/en/6.2.x/example/markers.html#working-with-custom-markers), +`@pytest.mark.issue(NUMBER)`, to reference the issue number. + +```python +# Assume you're fixing Issue #1234 +@pytest.mark.issue(1234) +def test_issue1234(): + ... +``` + +Test for the bug you're fixing, and make sure the test fails. Next, add and +commit your test file. Finally, fix the bug, make sure your test passes and +reference the issue number in your pull request description. 📖 **For more information on how to add tests, check out the [tests README](spacy/tests/README.md).** diff --git a/extra/DEVELOPER_DOCS/Code Conventions.md b/extra/DEVELOPER_DOCS/Code Conventions.md index 7a3f6996f..eba466c46 100644 --- a/extra/DEVELOPER_DOCS/Code Conventions.md +++ b/extra/DEVELOPER_DOCS/Code Conventions.md @@ -444,7 +444,7 @@ spaCy uses the [`pytest`](http://doc.pytest.org/) framework for testing. 
Tests f When adding tests, make sure to use descriptive names and only test for one behavior at a time. Tests should be grouped into modules dedicated to the same type of functionality and some test modules are organized as directories of test files related to the same larger area of the library, e.g. `matcher` or `tokenizer`. -Regression tests are tests that refer to bugs reported in specific issues. They should live in the `regression` module and are named according to the issue number (e.g. `test_issue1234.py`). This system allows us to relate tests for specific bugs back to the original reported issue, which is especially useful if we introduce a regression and a previously passing regression tests suddenly fails again. When fixing a bug, it's often useful to create a regression test for it first. Every once in a while, we go through the `regression` module and group tests together into larger files by issue number, in groups of 500 to 1000 numbers. This prevents us from ending up with too many individual files over time. +Regression tests are tests that refer to bugs reported in specific issues. They should live in the relevant module of the test suite, named according to the issue number (e.g. `test_issue1234`), and [marked](https://docs.pytest.org/en/6.2.x/example/markers.html#working-with-custom-markers) appropriately (e.g. `@pytest.mark.issue(1234)`). This system allows us to relate tests for specific bugs back to the original reported issue, which is especially useful if we introduce a regression and a previously passing regression test suddenly fails again. When fixing a bug, it's often useful to create a regression test for it first. The test suite also provides [fixtures](https://github.com/explosion/spaCy/blob/master/spacy/tests/conftest.py) for different language tokenizers that can be used as function arguments of the same name and will be passed in automatically. Those should only be used for tests related to those specific languages. 
We also have [test utility functions](https://github.com/explosion/spaCy/blob/master/spacy/tests/util.py) for common operations, like creating a temporary file. diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py index ef54c581c..c334cc6eb 100644 --- a/spacy/tests/doc/test_array.py +++ b/spacy/tests/doc/test_array.py @@ -1,8 +1,31 @@ +import numpy import pytest + from spacy.tokens import Doc from spacy.attrs import ORTH, SHAPE, POS, DEP, MORPH +@pytest.mark.issue(2203) +def test_issue2203(en_vocab): + """Test that lemmas are set correctly in doc.from_array.""" + words = ["I", "'ll", "survive"] + tags = ["PRP", "MD", "VB"] + lemmas = ["-PRON-", "will", "survive"] + tag_ids = [en_vocab.strings.add(tag) for tag in tags] + lemma_ids = [en_vocab.strings.add(lemma) for lemma in lemmas] + doc = Doc(en_vocab, words=words) + # Work around lemma corruption problem and set lemmas after tags + doc.from_array("TAG", numpy.array(tag_ids, dtype="uint64")) + doc.from_array("LEMMA", numpy.array(lemma_ids, dtype="uint64")) + assert [t.tag_ for t in doc] == tags + assert [t.lemma_ for t in doc] == lemmas + # We need to serialize both tag and lemma, since this is what causes the bug + doc_array = doc.to_array(["TAG", "LEMMA"]) + new_doc = Doc(doc.vocab, words=words).from_array(["TAG", "LEMMA"], doc_array) + assert [t.tag_ for t in new_doc] == tags + assert [t.lemma_ for t in new_doc] == lemmas + + def test_doc_array_attr_of_token(en_vocab): doc = Doc(en_vocab, words=["An", "example", "sentence"]) example = doc.vocab["example"] diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 57df87642..c6195d7e2 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -1,14 +1,17 @@ import weakref -import pytest import numpy +import pytest +from thinc.api import NumpyOps, get_current_ops +from spacy.attrs import DEP, ENT_IOB, ENT_TYPE, HEAD, IS_ALPHA, MORPH, POS +from spacy.attrs import SENT_START, TAG +from 
spacy.lang.en import English from spacy.lang.xx import MultiLanguage +from spacy.language import Language +from spacy.lexeme import Lexeme from spacy.tokens import Doc, Span, Token from spacy.vocab import Vocab -from spacy.lexeme import Lexeme -from spacy.lang.en import English -from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP, MORPH from .test_underscore import clean_underscore # noqa: F401 @@ -30,6 +33,220 @@ def test_doc_api_init(en_vocab): assert [t.is_sent_start for t in doc] == [True, False, True, False] +@pytest.mark.issue(1547) +def test_issue1547(): + """Test that entity labels still match after merging tokens.""" + words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"] + doc = Doc(Vocab(), words=words) + doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings["PRODUCT"])] + with doc.retokenize() as retokenizer: + retokenizer.merge(doc[5:7]) + assert [ent.text for ent in doc.ents] + + +@pytest.mark.issue(1757) +def test_issue1757(): + """Test comparison against None doesn't cause segfault.""" + doc = Doc(Vocab(), words=["a", "b", "c"]) + assert not doc[0] < None + assert not doc[0] is None + assert doc[0] >= None + assert not doc[:2] < None + assert not doc[:2] is None + assert doc[:2] >= None + assert not doc.vocab["a"] is None + assert not doc.vocab["a"] < None + + +@pytest.mark.issue(2396) +def test_issue2396(en_vocab): + words = ["She", "created", "a", "test", "for", "spacy"] + heads = [1, 1, 3, 1, 3, 4] + deps = ["dep"] * len(heads) + matrix = numpy.array( + [ + [0, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1], + [1, 1, 2, 3, 3, 3], + [1, 1, 3, 3, 3, 3], + [1, 1, 3, 3, 4, 4], + [1, 1, 3, 3, 4, 5], + ], + dtype=numpy.int32, + ) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) + span = doc[:] + assert (doc.get_lca_matrix() == matrix).all() + assert (span.get_lca_matrix() == matrix).all() + + +@pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"]) +@pytest.mark.parametrize("lang_cls", [English, MultiLanguage]) 
+@pytest.mark.issue(2782) +def test_issue2782(text, lang_cls): + """Check that like_num handles + and - before number.""" + nlp = lang_cls() + doc = nlp(text) + assert len(doc) == 1 + assert doc[0].like_num + + +@pytest.mark.parametrize( + "sentence", + [ + "The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.", + "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.", + "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one", + "Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.", + "It was a missed assignment, but it shouldn't have resulted in a turnover ...", + ], +) +@pytest.mark.issue(3869) +def test_issue3869(sentence): + """Test that the Doc's count_by function works consistently""" + nlp = English() + doc = nlp(sentence) + count = 0 + for token in doc: + count += token.is_alpha + assert count == doc.count_by(IS_ALPHA).get(1, 0) + + +@pytest.mark.issue(3962) +def test_issue3962(en_vocab): + """Ensure that as_doc does not result in out-of-bound access of tokens. 
+ This is achieved by setting the head to itself if it would lie out of the span otherwise.""" + # fmt: off + words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."] + heads = [1, 7, 1, 2, 7, 7, 7, 7, 9, 7, 7] + deps = ["nsubj", "ccomp", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"] + # fmt: on + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) + span2 = doc[1:5] # "jests at scars ," + doc2 = span2.as_doc() + doc2_json = doc2.to_json() + assert doc2_json + # head set to itself, being the new artificial root + assert doc2[0].head.text == "jests" + assert doc2[0].dep_ == "dep" + assert doc2[1].head.text == "jests" + assert doc2[1].dep_ == "prep" + assert doc2[2].head.text == "at" + assert doc2[2].dep_ == "pobj" + assert doc2[3].head.text == "jests" # head set to the new artificial root + assert doc2[3].dep_ == "dep" + # We should still have 1 sentence + assert len(list(doc2.sents)) == 1 + span3 = doc[6:9] # "never felt a" + doc3 = span3.as_doc() + doc3_json = doc3.to_json() + assert doc3_json + assert doc3[0].head.text == "felt" + assert doc3[0].dep_ == "neg" + assert doc3[1].head.text == "felt" + assert doc3[1].dep_ == "ROOT" + assert doc3[2].head.text == "felt" # head set to ancestor + assert doc3[2].dep_ == "dep" + # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound" + assert len(list(doc3.sents)) == 1 + + +@pytest.mark.issue(3962) +def test_issue3962_long(en_vocab): + """Ensure that as_doc does not result in out-of-bound access of tokens. 
+ This is achieved by setting the head to itself if it would lie out of the span otherwise.""" + # fmt: off + words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."] + heads = [1, 1, 1, 2, 1, 7, 7, 7, 9, 7, 7] + deps = ["nsubj", "ROOT", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"] + # fmt: on + two_sent_doc = Doc(en_vocab, words=words, heads=heads, deps=deps) + span2 = two_sent_doc[1:7] # "jests at scars. They never" + doc2 = span2.as_doc() + doc2_json = doc2.to_json() + assert doc2_json + # head set to itself, being the new artificial root (in sentence 1) + assert doc2[0].head.text == "jests" + assert doc2[0].dep_ == "ROOT" + assert doc2[1].head.text == "jests" + assert doc2[1].dep_ == "prep" + assert doc2[2].head.text == "at" + assert doc2[2].dep_ == "pobj" + assert doc2[3].head.text == "jests" + assert doc2[3].dep_ == "punct" + # head set to itself, being the new artificial root (in sentence 2) + assert doc2[4].head.text == "They" + assert doc2[4].dep_ == "dep" + # head set to the new artificial head (in sentence 2) + assert doc2[4].head.text == "They" + assert doc2[4].dep_ == "dep" + # We should still have 2 sentences + sents = list(doc2.sents) + assert len(sents) == 2 + assert sents[0].text == "jests at scars ." 
+ assert sents[1].text == "They never" + + +@Language.factory("my_pipe") +class CustomPipe: + def __init__(self, nlp, name="my_pipe"): + self.name = name + Span.set_extension("my_ext", getter=self._get_my_ext) + Doc.set_extension("my_ext", default=None) + + def __call__(self, doc): + gathered_ext = [] + for sent in doc.sents: + sent_ext = self._get_my_ext(sent) + sent._.set("my_ext", sent_ext) + gathered_ext.append(sent_ext) + + doc._.set("my_ext", "\n".join(gathered_ext)) + return doc + + @staticmethod + def _get_my_ext(span): + return str(span.end) + + +@pytest.mark.issue(4903) +def test_issue4903(): + """Ensure that this runs correctly and doesn't hang or crash on Windows / + macOS.""" + nlp = English() + nlp.add_pipe("sentencizer") + nlp.add_pipe("my_pipe", after="sentencizer") + text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."] + if isinstance(get_current_ops(), NumpyOps): + docs = list(nlp.pipe(text, n_process=2)) + assert docs[0].text == "I like bananas." + assert docs[1].text == "Do you like them?" + assert docs[2].text == "No, I prefer wasabi." 
+ + +@pytest.mark.issue(5048) +def test_issue5048(en_vocab): + words = ["This", "is", "a", "sentence"] + pos_s = ["DET", "VERB", "DET", "NOUN"] + spaces = [" ", " ", " ", ""] + deps_s = ["dep", "adj", "nn", "atm"] + tags_s = ["DT", "VBZ", "DT", "NN"] + strings = en_vocab.strings + for w in words: + strings.add(w) + deps = [strings.add(d) for d in deps_s] + pos = [strings.add(p) for p in pos_s] + tags = [strings.add(t) for t in tags_s] + attrs = [POS, DEP, TAG] + array = numpy.array(list(zip(pos, deps, tags)), dtype="uint64") + doc = Doc(en_vocab, words=words, spaces=spaces) + doc.from_array(attrs, array) + v1 = [(token.text, token.pos_, token.tag_) for token in doc] + doc2 = Doc(en_vocab, words=words, pos=pos_s, deps=deps_s, tags=tags_s) + v2 = [(token.text, token.pos_, token.tag_) for token in doc2] + assert v1 == v2 + + @pytest.mark.parametrize("text", [["one", "two", "three"]]) def test_doc_api_compare_by_string_position(en_vocab, text): doc = Doc(en_vocab, words=text) diff --git a/spacy/tests/doc/test_retokenize_split.py b/spacy/tests/doc/test_retokenize_split.py index 16df1713d..ec4deb033 100644 --- a/spacy/tests/doc/test_retokenize_split.py +++ b/spacy/tests/doc/test_retokenize_split.py @@ -1,8 +1,50 @@ +import numpy import pytest + from spacy.vocab import Vocab from spacy.tokens import Doc, Token +@pytest.mark.issue(3540) +def test_issue3540(en_vocab): + words = ["I", "live", "in", "NewYork", "right", "now"] + tensor = numpy.asarray( + [[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]], + dtype="f", + ) + doc = Doc(en_vocab, words=words) + doc.tensor = tensor + gold_text = ["I", "live", "in", "NewYork", "right", "now"] + assert [token.text for token in doc] == gold_text + gold_lemma = ["I", "live", "in", "NewYork", "right", "now"] + for i, lemma in enumerate(gold_lemma): + doc[i].lemma_ = lemma + assert [token.lemma_ for token in doc] == gold_lemma + vectors_1 = [token.vector for token in doc] + assert len(vectors_1) == len(doc) + + 
with doc.retokenize() as retokenizer: + heads = [(doc[3], 1), doc[2]] + attrs = { + "POS": ["PROPN", "PROPN"], + "LEMMA": ["New", "York"], + "DEP": ["pobj", "compound"], + } + retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs) + + gold_text = ["I", "live", "in", "New", "York", "right", "now"] + assert [token.text for token in doc] == gold_text + gold_lemma = ["I", "live", "in", "New", "York", "right", "now"] + assert [token.lemma_ for token in doc] == gold_lemma + vectors_2 = [token.vector for token in doc] + assert len(vectors_2) == len(doc) + assert vectors_1[0].tolist() == vectors_2[0].tolist() + assert vectors_1[1].tolist() == vectors_2[1].tolist() + assert vectors_1[2].tolist() == vectors_2[2].tolist() + assert vectors_1[4].tolist() == vectors_2[5].tolist() + assert vectors_1[5].tolist() == vectors_2[6].tolist() + + def test_doc_retokenize_split(en_vocab): words = ["LosAngeles", "start", "."] heads = [1, 2, 2] diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index 2503ad94c..d18293d3f 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -1,7 +1,9 @@ import pytest import numpy from numpy.testing import assert_array_equal + from spacy.attrs import ORTH, LENGTH +from spacy.lang.en import English from spacy.tokens import Doc, Span, Token from spacy.vocab import Vocab from spacy.util import filter_spans @@ -43,6 +45,106 @@ def doc_not_parsed(en_tokenizer): return doc +@pytest.mark.issue(1537) +def test_issue1537(): + """Test that Span.as_doc() doesn't segfault.""" + string = "The sky is blue . The man is pink . The dog is purple ." 
+ doc = Doc(Vocab(), words=string.split()) + doc[0].sent_start = True + for word in doc[1:]: + if word.nbor(-1).text == ".": + word.sent_start = True + else: + word.sent_start = False + sents = list(doc.sents) + sent0 = sents[0].as_doc() + sent1 = sents[1].as_doc() + assert isinstance(sent0, Doc) + assert isinstance(sent1, Doc) + + +@pytest.mark.issue(1612) +def test_issue1612(en_tokenizer): + """Test that span.orth_ is identical to span.text""" + doc = en_tokenizer("The black cat purrs.") + span = doc[1:3] + assert span.orth_ == span.text + + +@pytest.mark.issue(3199) +def test_issue3199(): + """Test that Span.noun_chunks works correctly if no noun chunks iterator + is available. To make this test future-proof, we're constructing a Doc + with a new Vocab here and a parse tree to make sure the noun chunks run. + """ + words = ["This", "is", "a", "sentence"] + doc = Doc(Vocab(), words=words, heads=[0] * len(words), deps=["dep"] * len(words)) + with pytest.raises(NotImplementedError): + list(doc[0:3].noun_chunks) + + +@pytest.mark.issue(5152) +def test_issue5152(): + # Test that the comparison between a Span and a Token, goes well + # There was a bug when the number of tokens in the span equaled the number of characters in the token (!) 
+ nlp = English() + text = nlp("Talk about being boring!") + text_var = nlp("Talk of being boring!") + y = nlp("Let") + span = text[0:3] # Talk about being + span_2 = text[0:3] # Talk about being + span_3 = text_var[0:3] # Talk of being + token = y[0] # Let + with pytest.warns(UserWarning): + assert span.similarity(token) == 0.0 + assert span.similarity(span_2) == 1.0 + with pytest.warns(UserWarning): + assert span_2.similarity(span_3) < 1.0 + + +@pytest.mark.issue(6755) +def test_issue6755(en_tokenizer): + doc = en_tokenizer("This is a magnificent sentence.") + span = doc[:0] + assert span.text_with_ws == "" + assert span.text == "" + + +@pytest.mark.parametrize( + "sentence, start_idx,end_idx,label", + [("Welcome to Mumbai, my friend", 11, 17, "GPE")], +) +@pytest.mark.issue(6815) +def test_issue6815_1(sentence, start_idx, end_idx, label): + nlp = English() + doc = nlp(sentence) + span = doc[:].char_span(start_idx, end_idx, label=label) + assert span.label_ == label + + +@pytest.mark.parametrize( + "sentence, start_idx,end_idx,kb_id", [("Welcome to Mumbai, my friend", 11, 17, 5)] +) +@pytest.mark.issue(6815) +def test_issue6815_2(sentence, start_idx, end_idx, kb_id): + nlp = English() + doc = nlp(sentence) + span = doc[:].char_span(start_idx, end_idx, kb_id=kb_id) + assert span.kb_id == kb_id + + +@pytest.mark.parametrize( + "sentence, start_idx,end_idx,vector", + [("Welcome to Mumbai, my friend", 11, 17, numpy.array([0.1, 0.2, 0.3]))], +) +@pytest.mark.issue(6815) +def test_issue6815_3(sentence, start_idx, end_idx, vector): + nlp = English() + doc = nlp(sentence) + span = doc[:].char_span(start_idx, end_idx, vector=vector) + assert (span.vector == vector).all() + + @pytest.mark.parametrize( "i_sent,i,j,text", [ diff --git a/spacy/tests/lang/en/test_sbd.py b/spacy/tests/lang/en/test_sbd.py index 39d8d3b59..d30c72750 100644 --- a/spacy/tests/lang/en/test_sbd.py +++ b/spacy/tests/lang/en/test_sbd.py @@ -4,6 +4,15 @@ from spacy.tokens import Doc from ...util import 
apply_transition_sequence +@pytest.mark.issue(309) +def test_issue309(en_vocab): + """Test Issue #309: SBD fails on empty string""" + doc = Doc(en_vocab, words=[" "], heads=[0], deps=["ROOT"]) + assert len(doc) == 1 + sents = list(doc.sents) + assert len(sents) == 1 + + @pytest.mark.parametrize("words", [["A", "test", "sentence"]]) @pytest.mark.parametrize("punct", [".", "!", "?", ""]) def test_en_sbd_single_punct(en_vocab, words, punct): diff --git a/spacy/tests/lang/en/test_tokenizer.py b/spacy/tests/lang/en/test_tokenizer.py new file mode 100644 index 000000000..e6d1d7d85 --- /dev/null +++ b/spacy/tests/lang/en/test_tokenizer.py @@ -0,0 +1,169 @@ +import pytest + + +@pytest.mark.issue(351) +def test_issue351(en_tokenizer): + doc = en_tokenizer(" This is a cat.") + assert doc[0].idx == 0 + assert len(doc[0]) == 3 + assert doc[1].idx == 3 + + +@pytest.mark.issue(360) +def test_issue360(en_tokenizer): + """Test tokenization of big ellipsis""" + tokens = en_tokenizer("$45...............Asking") + assert len(tokens) > 2 + + +@pytest.mark.issue(736) +@pytest.mark.parametrize("text,number", [("7am", "7"), ("11p.m.", "11")]) +def test_issue736(en_tokenizer, text, number): + """Test that times like "7am" are tokenized correctly and that numbers are + converted to string.""" + tokens = en_tokenizer(text) + assert len(tokens) == 2 + assert tokens[0].text == number + + +@pytest.mark.issue(740) +@pytest.mark.parametrize("text", ["3/4/2012", "01/12/1900"]) +def test_issue740(en_tokenizer, text): + """Test that dates are not split and kept as one token. This behaviour is + currently inconsistent, since dates separated by hyphens are still split. 
+ This will be hard to prevent without causing clashes with numeric ranges.""" + tokens = en_tokenizer(text) + assert len(tokens) == 1 + + +@pytest.mark.issue(744) +@pytest.mark.parametrize("text", ["We were scared", "We Were Scared"]) +def test_issue744(en_tokenizer, text): + """Test that 'were' and 'Were' are excluded from the contractions + generated by the English tokenizer exceptions.""" + tokens = en_tokenizer(text) + assert len(tokens) == 3 + assert tokens[1].text.lower() == "were" + + +@pytest.mark.issue(759) +@pytest.mark.parametrize( + "text,is_num", [("one", True), ("ten", True), ("teneleven", False)] +) +def test_issue759(en_tokenizer, text, is_num): + tokens = en_tokenizer(text) + assert tokens[0].like_num == is_num + + +@pytest.mark.issue(775) +@pytest.mark.parametrize("text", ["Shell", "shell", "Shed", "shed"]) +def test_issue775(en_tokenizer, text): + """Test that 'Shell' and 'shell' are excluded from the contractions + generated by the English tokenizer exceptions.""" + tokens = en_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].text == text + + +@pytest.mark.issue(792) +@pytest.mark.parametrize("text", ["This is a string ", "This is a string\u0020"]) +def test_issue792(en_tokenizer, text): + """Test for Issue #792: Trailing whitespace is removed after tokenization.""" + doc = en_tokenizer(text) + assert "".join([token.text_with_ws for token in doc]) == text + + +@pytest.mark.issue(792) +@pytest.mark.parametrize("text", ["This is a string", "This is a string\n"]) +def test_control_issue792(en_tokenizer, text): + """Test base case for Issue #792: Non-trailing whitespace""" + doc = en_tokenizer(text) + assert "".join([token.text_with_ws for token in doc]) == text + + +@pytest.mark.issue(859) +@pytest.mark.parametrize( + "text", ["aaabbb@ccc.com\nThank you!", "aaabbb@ccc.com \nThank you!"] +) +def test_issue859(en_tokenizer, text): + """Test that no extra space is added in doc.text method.""" + doc = en_tokenizer(text) + assert doc.text 
== text + + +@pytest.mark.issue(886) +@pytest.mark.parametrize("text", ["Datum:2014-06-02\nDokument:76467"]) +def test_issue886(en_tokenizer, text): + """Test that token.idx matches the original text index for texts with newlines.""" + doc = en_tokenizer(text) + for token in doc: + assert len(token.text) == len(token.text_with_ws) + assert text[token.idx] == token.text[0] + + +@pytest.mark.issue(891) +@pytest.mark.parametrize("text", ["want/need"]) +def test_issue891(en_tokenizer, text): + """Test that / infixes are split correctly.""" + tokens = en_tokenizer(text) + assert len(tokens) == 3 + assert tokens[1].text == "/" + + +@pytest.mark.issue(957) +@pytest.mark.slow +def test_issue957(en_tokenizer): + """Test that spaCy doesn't hang on many punctuation characters. + If this test hangs, check (new) regular expressions for conflicting greedy operators + """ + # Skip test if pytest-timeout is not installed + pytest.importorskip("pytest_timeout") + for punct in [".", ",", "'", '"', ":", "?", "!", ";", "-"]: + string = "0" + for i in range(1, 100): + string += punct + str(i) + doc = en_tokenizer(string) + assert doc + + +@pytest.mark.parametrize("text", ["test@example.com", "john.doe@example.co.uk"]) +@pytest.mark.issue(1698) +def test_issue1698(en_tokenizer, text): + """Test that doc doesn't identify email-addresses as URLs""" + doc = en_tokenizer(text) + assert len(doc) == 1 + assert not doc[0].like_url + + +@pytest.mark.issue(1758) +def test_issue1758(en_tokenizer): + """Test that "would've" is handled by the English tokenizer exceptions.""" + tokens = en_tokenizer("would've") + assert len(tokens) == 2 + + +@pytest.mark.issue(1773) +def test_issue1773(en_tokenizer): + """Test that spaces don't receive a POS but no TAG. 
This is the root cause + of the serialization issue reported in #1773.""" + doc = en_tokenizer("\n") + if doc[0].pos_ == "SPACE": + assert doc[0].tag_ != "" + + +@pytest.mark.issue(3277) +def test_issue3277(es_tokenizer): + """Test that hyphens are split correctly as prefixes.""" + doc = es_tokenizer("—Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez.") + assert len(doc) == 14 + assert doc[0].text == "\u2014" + assert doc[5].text == "\u2013" + assert doc[9].text == "\u2013" + + +@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"]) +@pytest.mark.issue(3521) +def test_issue3521(en_tokenizer, word): + tok = en_tokenizer(word)[1] + # 'not' and 'would' should be stopwords, also in their abbreviated forms + assert tok.is_stop diff --git a/spacy/tests/lang/es/test_text.py b/spacy/tests/lang/es/test_text.py index 96f6bcab5..d95f6d26b 100644 --- a/spacy/tests/lang/es/test_text.py +++ b/spacy/tests/lang/es/test_text.py @@ -1,5 +1,16 @@ import pytest from spacy.lang.es.lex_attrs import like_num +from spacy.lang.es import Spanish + + +@pytest.mark.issue(3803) +def test_issue3803(): + """Test that spanish num-like tokens have True for like_num attribute.""" + nlp = Spanish() + text = "2 dos 1000 mil 12 doce" + doc = nlp(text) + + assert [t.like_num for t in doc] == [True, True, True, True, True, True] def test_es_tokenizer_handles_long_text(es_tokenizer): diff --git a/spacy/tests/lang/hi/test_text.py b/spacy/tests/lang/hi/test_text.py new file mode 100644 index 000000000..791cc3822 --- /dev/null +++ b/spacy/tests/lang/hi/test_text.py @@ -0,0 +1,11 @@ +import pytest +from spacy.lang.hi import Hindi + + +@pytest.mark.issue(3625) +def test_issue3625(): + """Test that default punctuation rules applies to hindi unicode characters""" + nlp = Hindi() + doc = nlp("hi. how हुए. 
होटल, होटल") + expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"] + assert [token.text for token in doc] == expected diff --git a/spacy/tests/lang/it/test_text.py b/spacy/tests/lang/it/test_text.py new file mode 100644 index 000000000..6023a20b1 --- /dev/null +++ b/spacy/tests/lang/it/test_text.py @@ -0,0 +1,14 @@ +import pytest + + +@pytest.mark.issue(2822) +def test_issue2822(it_tokenizer): + """Test that the abbreviation of poco is kept as one word.""" + doc = it_tokenizer("Vuoi un po' di zucchero?") + assert len(doc) == 6 + assert doc[0].text == "Vuoi" + assert doc[1].text == "un" + assert doc[2].text == "po'" + assert doc[3].text == "di" + assert doc[4].text == "zucchero" + assert doc[5].text == "?" diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index 3437ea283..ef7bed06d 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -54,6 +54,18 @@ SUB_TOKEN_TESTS = [ # fmt: on +@pytest.mark.issue(2901) +def test_issue2901(): + """Test that `nlp` doesn't fail.""" + try: + nlp = Japanese() + except ImportError: + pytest.skip() + + doc = nlp("pythonが大好きです") + assert doc + + @pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS) def test_ja_tokenizer(ja_tokenizer, text, expected_tokens): tokens = [token.text for token in ja_tokenizer(text)] diff --git a/spacy/tests/lang/sv/test_exceptions.py b/spacy/tests/lang/sv/test_exceptions.py index e6cae4d2b..b49a0c832 100644 --- a/spacy/tests/lang/sv/test_exceptions.py +++ b/spacy/tests/lang/sv/test_exceptions.py @@ -1,6 +1,5 @@ import pytest - SV_TOKEN_EXCEPTION_TESTS = [ ( "Smörsåsen används bl.a. till fisk", @@ -17,6 +16,26 @@ SV_TOKEN_EXCEPTION_TESTS = [ ] +@pytest.mark.issue(805) +@pytest.mark.parametrize( + "text,expected_tokens", + [ + ( + "Smörsåsen används bl.a. till fisk", + ["Smörsåsen", "används", "bl.a.", "till", "fisk"], + ), + ( + "Jag kommer först kl. 13 p.g.a. 
diverse förseningar", + ["Jag", "kommer", "först", "kl.", "13", "p.g.a.", "diverse", "förseningar"], + ), + ], +) +def test_issue805(sv_tokenizer, text, expected_tokens): + tokens = sv_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list + + @pytest.mark.parametrize("text,expected_tokens", SV_TOKEN_EXCEPTION_TESTS) def test_sv_tokenizer_handles_exception_cases(sv_tokenizer, text, expected_tokens): tokens = sv_tokenizer(text) diff --git a/spacy/tests/lang/test_attrs.py b/spacy/tests/lang/test_attrs.py index 6a7a404fd..5350c1fe5 100644 --- a/spacy/tests/lang/test_attrs.py +++ b/spacy/tests/lang/test_attrs.py @@ -1,6 +1,15 @@ import pytest -from spacy.attrs import intify_attrs, ORTH, NORM, LEMMA, IS_ALPHA -from spacy.lang.lex_attrs import is_punct, is_ascii, is_currency, like_url, word_shape + +from spacy.attrs import IS_ALPHA, LEMMA, NORM, ORTH, intify_attrs +from spacy.lang.en.stop_words import STOP_WORDS +from spacy.lang.lex_attrs import is_ascii, is_currency, is_punct, is_stop +from spacy.lang.lex_attrs import like_url, word_shape + + +@pytest.mark.parametrize("word", ["the"]) +@pytest.mark.issue(1889) +def test_issue1889(word): + assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS) @pytest.mark.parametrize("text", ["dog"]) diff --git a/spacy/tests/matcher/test_matcher_logic.py b/spacy/tests/matcher/test_matcher_logic.py index b96bb2032..3649b07ed 100644 --- a/spacy/tests/matcher/test_matcher_logic.py +++ b/spacy/tests/matcher/test_matcher_logic.py @@ -1,10 +1,14 @@ -import pytest import re -from spacy.lang.en import English -from spacy.matcher import Matcher -from spacy.tokens import Doc, Span +import pytest +from spacy.attrs import IS_PUNCT, LOWER, ORTH +from spacy.errors import MatchPatternError +from spacy.lang.en import English +from spacy.lang.lex_attrs import LEX_ATTRS +from spacy.matcher import Matcher +from spacy.tokens import Doc, Span, Token +from spacy.vocab import 
Vocab pattern1 = [{"ORTH": "A"}, {"ORTH": "A", "OP": "*"}] pattern2 = [{"ORTH": "A", "OP": "*"}, {"ORTH": "A"}] @@ -36,6 +40,473 @@ def doc(en_tokenizer, text): return doc +@pytest.mark.issue(118) +@pytest.mark.parametrize( + "patterns", + [ + [[{"LOWER": "celtics"}], [{"LOWER": "boston"}, {"LOWER": "celtics"}]], + [[{"LOWER": "boston"}, {"LOWER": "celtics"}], [{"LOWER": "celtics"}]], + ], +) +def test_issue118(en_tokenizer, patterns): + """Test a bug that arose from having overlapping matches""" + text = ( + "how many points did lebron james score against the boston celtics last night" + ) + doc = en_tokenizer(text) + ORG = doc.vocab.strings["ORG"] + matcher = Matcher(doc.vocab) + matcher.add("BostonCeltics", patterns) + assert len(list(doc.ents)) == 0 + matches = [(ORG, start, end) for _, start, end in matcher(doc)] + assert matches == [(ORG, 9, 11), (ORG, 10, 11)] + doc.ents = matches[:1] + ents = list(doc.ents) + assert len(ents) == 1 + assert ents[0].label == ORG + assert ents[0].start == 9 + assert ents[0].end == 11 + + +@pytest.mark.issue(118) +@pytest.mark.parametrize( + "patterns", + [ + [[{"LOWER": "boston"}], [{"LOWER": "boston"}, {"LOWER": "celtics"}]], + [[{"LOWER": "boston"}, {"LOWER": "celtics"}], [{"LOWER": "boston"}]], + ], +) +def test_issue118_prefix_reorder(en_tokenizer, patterns): + """Test a bug that arose from having overlapping matches""" + text = ( + "how many points did lebron james score against the boston celtics last night" + ) + doc = en_tokenizer(text) + ORG = doc.vocab.strings["ORG"] + matcher = Matcher(doc.vocab) + matcher.add("BostonCeltics", patterns) + assert len(list(doc.ents)) == 0 + matches = [(ORG, start, end) for _, start, end in matcher(doc)] + doc.ents += tuple(matches)[1:] + assert matches == [(ORG, 9, 10), (ORG, 9, 11)] + ents = doc.ents + assert len(ents) == 1 + assert ents[0].label == ORG + assert ents[0].start == 9 + assert ents[0].end == 11 + + +@pytest.mark.issue(242) +def test_issue242(en_tokenizer): + """Test 
overlapping multi-word phrases.""" + text = "There are different food safety standards in different countries." + patterns = [ + [{"LOWER": "food"}, {"LOWER": "safety"}], + [{"LOWER": "safety"}, {"LOWER": "standards"}], + ] + doc = en_tokenizer(text) + matcher = Matcher(doc.vocab) + matcher.add("FOOD", patterns) + matches = [(ent_type, start, end) for ent_type, start, end in matcher(doc)] + match1, match2 = matches + assert match1[1] == 3 + assert match1[2] == 5 + assert match2[1] == 4 + assert match2[2] == 6 + with pytest.raises(ValueError): + # One token can only be part of one entity, so test that the matches + # can't be added as entities + doc.ents += tuple(matches) + + +@pytest.mark.issue(587) +def test_issue587(en_tokenizer): + """Test that Matcher doesn't segfault on particular input""" + doc = en_tokenizer("a b; c") + matcher = Matcher(doc.vocab) + matcher.add("TEST1", [[{ORTH: "a"}, {ORTH: "b"}]]) + matches = matcher(doc) + assert len(matches) == 1 + matcher.add("TEST2", [[{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "c"}]]) + matches = matcher(doc) + assert len(matches) == 2 + matcher.add("TEST3", [[{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "d"}]]) + matches = matcher(doc) + assert len(matches) == 2 + + +@pytest.mark.issue(588) +def test_issue588(en_vocab): + """Test if empty specs still cause an error when adding patterns""" + matcher = Matcher(en_vocab) + with pytest.raises(ValueError): + matcher.add("TEST", [[]]) + + +@pytest.mark.issue(590) +def test_issue590(en_vocab): + """Test overlapping matches""" + doc = Doc(en_vocab, words=["n", "=", "1", ";", "a", ":", "5", "%"]) + matcher = Matcher(en_vocab) + matcher.add( + "ab", [[{"IS_ALPHA": True}, {"ORTH": ":"}, {"LIKE_NUM": True}, {"ORTH": "%"}]] + ) + matcher.add("ab", [[{"IS_ALPHA": True}, {"ORTH": "="}, {"LIKE_NUM": True}]]) + matches = matcher(doc) + assert len(matches) == 2 + + +@pytest.mark.issue(615) +def test_issue615(en_tokenizer): + def merge_phrases(matcher, doc, i, matches): 
+ """Merge a phrase. We have to be careful here because we'll change the + token indices. To avoid problems, merge all the phrases once we're called + on the last match.""" + if i != len(matches) - 1: + return None + spans = [Span(doc, start, end, label=label) for label, start, end in matches] + with doc.retokenize() as retokenizer: + for span in spans: + tag = "NNP" if span.label_ else span.root.tag_ + attrs = {"tag": tag, "lemma": span.text} + retokenizer.merge(span, attrs=attrs) + doc.ents = doc.ents + (span,) + + text = "The golf club is broken" + pattern = [{"ORTH": "golf"}, {"ORTH": "club"}] + label = "Sport_Equipment" + doc = en_tokenizer(text) + matcher = Matcher(doc.vocab) + matcher.add(label, [pattern], on_match=merge_phrases) + matcher(doc) + entities = list(doc.ents) + assert entities != [] + assert entities[0].label != 0 + + +@pytest.mark.issue(850) +def test_issue850(): + """The variable-length pattern matches the succeeding token. Check we + handle the ambiguity correctly.""" + vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()}) + matcher = Matcher(vocab) + pattern = [{"LOWER": "bob"}, {"OP": "*"}, {"LOWER": "frank"}] + matcher.add("FarAway", [pattern]) + doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"]) + match = matcher(doc) + assert len(match) == 1 + ent_id, start, end = match[0] + assert start == 0 + assert end == 4 + + +@pytest.mark.issue(850) +def test_issue850_basic(): + """Test Matcher matches with '*' operator and Boolean flag""" + vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()}) + matcher = Matcher(vocab) + pattern = [{"LOWER": "bob"}, {"OP": "*", "LOWER": "and"}, {"LOWER": "frank"}] + matcher.add("FarAway", [pattern]) + doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"]) + match = matcher(doc) + assert len(match) == 1 + ent_id, start, end = match[0] + assert start == 0 + assert end == 4 + + +@pytest.mark.issue(1434) +def test_issue1434(): + """Test matches occur when optional 
element at end of short doc.""" + pattern = [{"ORTH": "Hello"}, {"IS_ALPHA": True, "OP": "?"}] + vocab = Vocab(lex_attr_getters=LEX_ATTRS) + hello_world = Doc(vocab, words=["Hello", "World"]) + hello = Doc(vocab, words=["Hello"]) + matcher = Matcher(vocab) + matcher.add("MyMatcher", [pattern]) + matches = matcher(hello_world) + assert matches + matches = matcher(hello) + assert matches + + +@pytest.mark.parametrize( + "string,start,end", + [ + ("a", 0, 1), + ("a b", 0, 2), + ("a c", 0, 1), + ("a b c", 0, 2), + ("a b b c", 0, 3), + ("a b b", 0, 3), + ], +) +@pytest.mark.issue(1450) +def test_issue1450(string, start, end): + """Test matcher works when patterns end with * operator.""" + pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}] + matcher = Matcher(Vocab()) + matcher.add("TSTEND", [pattern]) + doc = Doc(Vocab(), words=string.split()) + matches = matcher(doc) + if start is None or end is None: + assert matches == [] + assert matches[-1][1] == start + assert matches[-1][2] == end + + +@pytest.mark.issue(1945) +def test_issue1945(): + """Test regression in Matcher introduced in v2.0.6.""" + matcher = Matcher(Vocab()) + matcher.add("MWE", [[{"orth": "a"}, {"orth": "a"}]]) + doc = Doc(matcher.vocab, words=["a", "a", "a"]) + matches = matcher(doc) # we should see two overlapping matches here + assert len(matches) == 2 + assert matches[0][1:] == (0, 2) + assert matches[1][1:] == (1, 3) + + +@pytest.mark.issue(1971) +def test_issue1971(en_vocab): + # Possibly related to #2675 and #2671? 
+ matcher = Matcher(en_vocab) + pattern = [ + {"ORTH": "Doe"}, + {"ORTH": "!", "OP": "?"}, + {"_": {"optional": True}, "OP": "?"}, + {"ORTH": "!", "OP": "?"}, + ] + Token.set_extension("optional", default=False) + matcher.add("TEST", [pattern]) + doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"]) + # We could also assert length 1 here, but this is more conclusive, because + # the real problem here is that it returns a duplicate match for a match_id + # that's not actually in the vocab! + matches = matcher(doc) + assert all([match_id in en_vocab.strings for match_id, start, end in matches]) + + +@pytest.mark.issue(1971) +def test_issue_1971_2(en_vocab): + matcher = Matcher(en_vocab) + pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}] + pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}] # {"IN": ["EUR"]}}] + doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"]) + matcher.add("TEST1", [pattern1, pattern2]) + matches = matcher(doc) + assert len(matches) == 2 + + +@pytest.mark.issue(1971) +def test_issue_1971_3(en_vocab): + """Test that pattern matches correctly for multiple extension attributes.""" + Token.set_extension("a", default=1, force=True) + Token.set_extension("b", default=2, force=True) + doc = Doc(en_vocab, words=["hello", "world"]) + matcher = Matcher(en_vocab) + matcher.add("A", [[{"_": {"a": 1}}]]) + matcher.add("B", [[{"_": {"b": 2}}]]) + matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc)) + assert len(matches) == 4 + assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)]) + + +@pytest.mark.issue(1971) +def test_issue_1971_4(en_vocab): + """Test that pattern matches correctly with multiple extension attribute + values on a single token. 
+ """ + Token.set_extension("ext_a", default="str_a", force=True) + Token.set_extension("ext_b", default="str_b", force=True) + matcher = Matcher(en_vocab) + doc = Doc(en_vocab, words=["this", "is", "text"]) + pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3 + matcher.add("TEST", [pattern]) + matches = matcher(doc) + # Uncommenting this caused a segmentation fault + assert len(matches) == 1 + assert matches[0] == (en_vocab.strings["TEST"], 0, 3) + + +@pytest.mark.issue(2464) +def test_issue2464(en_vocab): + """Test problem with successive ?. This is the same bug, so putting it here.""" + matcher = Matcher(en_vocab) + doc = Doc(en_vocab, words=["a", "b"]) + matcher.add("4", [[{"OP": "?"}, {"OP": "?"}]]) + matches = matcher(doc) + assert len(matches) == 3 + + +@pytest.mark.issue(2569) +def test_issue2569(en_tokenizer): + """Test that operator + is greedy.""" + doc = en_tokenizer("It is May 15, 1993.") + doc.ents = [Span(doc, 2, 6, label=doc.vocab.strings["DATE"])] + matcher = Matcher(doc.vocab) + matcher.add("RULE", [[{"ENT_TYPE": "DATE", "OP": "+"}]]) + matched = [doc[start:end] for _, start, end in matcher(doc)] + matched = sorted(matched, key=len, reverse=True) + assert len(matched) == 10 + assert len(matched[0]) == 4 + assert matched[0].text == "May 15, 1993" + + +@pytest.mark.issue(2671) +def test_issue2671(): + """Ensure the correct entity ID is returned for matches with quantifiers. 
+ See also #2675 + """ + nlp = English() + matcher = Matcher(nlp.vocab) + pattern_id = "test_pattern" + pattern = [ + {"LOWER": "high"}, + {"IS_PUNCT": True, "OP": "?"}, + {"LOWER": "adrenaline"}, + ] + matcher.add(pattern_id, [pattern]) + doc1 = nlp("This is a high-adrenaline situation.") + doc2 = nlp("This is a high adrenaline situation.") + matches1 = matcher(doc1) + for match_id, start, end in matches1: + assert nlp.vocab.strings[match_id] == pattern_id + matches2 = matcher(doc2) + for match_id, start, end in matches2: + assert nlp.vocab.strings[match_id] == pattern_id + + +@pytest.mark.issue(3009) +def test_issue3009(en_vocab): + """Test problem with matcher quantifiers""" + patterns = [ + [{"ORTH": "has"}, {"LOWER": "to"}, {"LOWER": "do"}, {"TAG": "IN"}], + [ + {"ORTH": "has"}, + {"IS_ASCII": True, "IS_PUNCT": False, "OP": "*"}, + {"LOWER": "to"}, + {"LOWER": "do"}, + {"TAG": "IN"}, + ], + [ + {"ORTH": "has"}, + {"IS_ASCII": True, "IS_PUNCT": False, "OP": "?"}, + {"LOWER": "to"}, + {"LOWER": "do"}, + {"TAG": "IN"}, + ], + ] + words = ["also", "has", "to", "do", "with"] + tags = ["RB", "VBZ", "TO", "VB", "IN"] + pos = ["ADV", "VERB", "ADP", "VERB", "ADP"] + doc = Doc(en_vocab, words=words, tags=tags, pos=pos) + matcher = Matcher(en_vocab) + for i, pattern in enumerate(patterns): + matcher.add(str(i), [pattern]) + matches = matcher(doc) + assert matches + + +@pytest.mark.issue(3328) +def test_issue3328(en_vocab): + doc = Doc(en_vocab, words=["Hello", ",", "how", "are", "you", "doing", "?"]) + matcher = Matcher(en_vocab) + patterns = [ + [{"LOWER": {"IN": ["hello", "how"]}}], + [{"LOWER": {"IN": ["you", "doing"]}}], + ] + matcher.add("TEST", patterns) + matches = matcher(doc) + assert len(matches) == 4 + matched_texts = [doc[start:end].text for _, start, end in matches] + assert matched_texts == ["Hello", "how", "you", "doing"] + + +@pytest.mark.issue(3549) +def test_issue3549(en_vocab): + """Test that match pattern validation doesn't raise on empty errors.""" + 
matcher = Matcher(en_vocab, validate=True) + pattern = [{"LOWER": "hello"}, {"LOWER": "world"}] + matcher.add("GOOD", [pattern]) + with pytest.raises(MatchPatternError): + matcher.add("BAD", [[{"X": "Y"}]]) + + +@pytest.mark.skip("Matching currently only works on strings and integers") +@pytest.mark.issue(3555) +def test_issue3555(en_vocab): + """Test that custom extensions with default None don't break matcher.""" + Token.set_extension("issue3555", default=None) + matcher = Matcher(en_vocab) + pattern = [{"ORTH": "have"}, {"_": {"issue3555": True}}] + matcher.add("TEST", [pattern]) + doc = Doc(en_vocab, words=["have", "apple"]) + matcher(doc) + + +@pytest.mark.issue(3839) +def test_issue3839(en_vocab): + """Test that match IDs returned by the matcher are correct, are in the string""" + doc = Doc(en_vocab, words=["terrific", "group", "of", "people"]) + matcher = Matcher(en_vocab) + match_id = "PATTERN" + pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}] + pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}] + matcher.add(match_id, [pattern1]) + matches = matcher(doc) + assert matches[0][0] == en_vocab.strings[match_id] + matcher = Matcher(en_vocab) + matcher.add(match_id, [pattern2]) + matches = matcher(doc) + assert matches[0][0] == en_vocab.strings[match_id] + + +@pytest.mark.issue(3879) +def test_issue3879(en_vocab): + doc = Doc(en_vocab, words=["This", "is", "a", "test", "."]) + assert len(doc) == 5 + pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}] + matcher = Matcher(en_vocab) + matcher.add("TEST", [pattern]) + assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test' + + +@pytest.mark.issue(3951) +def test_issue3951(en_vocab): + """Test that combinations of optional rules are matched correctly.""" + matcher = Matcher(en_vocab) + pattern = [ + {"LOWER": "hello"}, + {"LOWER": "this", "OP": "?"}, + {"OP": "?"}, + {"LOWER": "world"}, + ] + matcher.add("TEST", [pattern]) + doc = 
Doc(en_vocab, words=["Hello", "my", "new", "world"]) + matches = matcher(doc) + assert len(matches) == 0 + + +@pytest.mark.issue(4120) +def test_issue4120(en_vocab): + """Test that matches without a final {OP: ?} token are returned.""" + matcher = Matcher(en_vocab) + matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]]) + doc1 = Doc(en_vocab, words=["a"]) + assert len(matcher(doc1)) == 1 # works + doc2 = Doc(en_vocab, words=["a", "b", "c"]) + assert len(matcher(doc2)) == 2 # fixed + matcher = Matcher(en_vocab) + matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]]) + doc3 = Doc(en_vocab, words=["a", "b", "b", "c"]) + assert len(matcher(doc3)) == 2 # works + matcher = Matcher(en_vocab) + matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]]) + doc4 = Doc(en_vocab, words=["a", "b", "b", "c"]) + assert len(matcher(doc4)) == 3 # fixed + + @pytest.mark.parametrize( "pattern,re_pattern", [ diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 478949601..f893d81f8 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -1,8 +1,125 @@ import pytest import srsly from mock import Mock -from spacy.matcher import PhraseMatcher + +from spacy.lang.en import English +from spacy.matcher import PhraseMatcher, Matcher from spacy.tokens import Doc, Span +from spacy.vocab import Vocab + + +from ..util import make_tempdir + + +@pytest.mark.issue(3248) +def test_issue3248_1(): + """Test that the PhraseMatcher correctly reports its number of rules, not + total number of patterns.""" + nlp = English() + matcher = PhraseMatcher(nlp.vocab) + matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")]) + matcher.add("TEST2", [nlp("d")]) + assert len(matcher) == 2 + + +@pytest.mark.issue(3331) +def test_issue3331(en_vocab): + """Test that duplicate patterns for different rules result in multiple + matches, one per rule. 
+ """ + matcher = PhraseMatcher(en_vocab) + matcher.add("A", [Doc(en_vocab, words=["Barack", "Obama"])]) + matcher.add("B", [Doc(en_vocab, words=["Barack", "Obama"])]) + doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"]) + matches = matcher(doc) + assert len(matches) == 2 + match_ids = [en_vocab.strings[matches[0][0]], en_vocab.strings[matches[1][0]]] + assert sorted(match_ids) == ["A", "B"] + + +@pytest.mark.issue(3972) +def test_issue3972(en_vocab): + """Test that the PhraseMatcher returns duplicates for duplicate match IDs.""" + matcher = PhraseMatcher(en_vocab) + matcher.add("A", [Doc(en_vocab, words=["New", "York"])]) + matcher.add("B", [Doc(en_vocab, words=["New", "York"])]) + doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"]) + matches = matcher(doc) + + assert len(matches) == 2 + + # We should have a match for each of the two rules + found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches] + assert "A" in found_ids + assert "B" in found_ids + + +@pytest.mark.issue(4002) +def test_issue4002(en_vocab): + """Test that the PhraseMatcher can match on overwritten NORM attributes.""" + matcher = PhraseMatcher(en_vocab, attr="NORM") + pattern1 = Doc(en_vocab, words=["c", "d"]) + assert [t.norm_ for t in pattern1] == ["c", "d"] + matcher.add("TEST", [pattern1]) + doc = Doc(en_vocab, words=["a", "b", "c", "d"]) + assert [t.norm_ for t in doc] == ["a", "b", "c", "d"] + matches = matcher(doc) + assert len(matches) == 1 + matcher = PhraseMatcher(en_vocab, attr="NORM") + pattern2 = Doc(en_vocab, words=["1", "2"]) + pattern2[0].norm_ = "c" + pattern2[1].norm_ = "d" + assert [t.norm_ for t in pattern2] == ["c", "d"] + matcher.add("TEST", [pattern2]) + matches = matcher(doc) + assert len(matches) == 1 + + +@pytest.mark.issue(4373) +def test_issue4373(): + """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab).""" + matcher = Matcher(Vocab()) + assert isinstance(matcher.vocab, Vocab) + matcher = PhraseMatcher(Vocab()) + 
assert isinstance(matcher.vocab, Vocab) + + +@pytest.mark.issue(4651) +def test_issue4651_with_phrase_matcher_attr(): + """Test that the EntityRuler PhraseMatcher is deserialized correctly using + the method from_disk when the EntityRuler argument phrase_matcher_attr is + specified. + """ + text = "Spacy is a python library for nlp" + nlp = English() + patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}] + ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"}) + ruler.add_patterns(patterns) + doc = nlp(text) + res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents] + nlp_reloaded = English() + with make_tempdir() as d: + file_path = d / "entityruler" + ruler.to_disk(file_path) + nlp_reloaded.add_pipe("entity_ruler").from_disk(file_path) + doc_reloaded = nlp_reloaded(text) + res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents] + assert res == res_reloaded + + +@pytest.mark.issue(6839) +def test_issue6839(en_vocab): + """Ensure that PhraseMatcher accepts Span as input""" + # fmt: off + words = ["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."] + # fmt: on + doc = Doc(en_vocab, words=words) + span = doc[:8] + pattern = Doc(en_vocab, words=["Spans", "and", "Docs"]) + matcher = PhraseMatcher(en_vocab) + matcher.add("SPACY", [pattern]) + matches = matcher(span) + assert matches def test_matcher_phrase_matcher(en_vocab): diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py index cba6fa81e..bb226f9c5 100644 --- a/spacy/tests/parser/test_arc_eager_oracle.py +++ b/spacy/tests/parser/test_arc_eager_oracle.py @@ -40,6 +40,28 @@ def arc_eager(vocab): return moves +@pytest.mark.issue(7056) +def test_issue7056(): + """Test that the Unshift transition works properly, and doesn't cause + sentence segmentation errors.""" + vocab = Vocab() + ae = ArcEager( + vocab.strings, 
ArcEager.get_actions(left_labels=["amod"], right_labels=["pobj"]) + ) + doc = Doc(vocab, words="Severe pain , after trauma".split()) + state = ae.init_batch([doc])[0] + ae.apply_transition(state, "S") + ae.apply_transition(state, "L-amod") + ae.apply_transition(state, "S") + ae.apply_transition(state, "S") + ae.apply_transition(state, "S") + ae.apply_transition(state, "R-pobj") + ae.apply_transition(state, "D") + ae.apply_transition(state, "D") + ae.apply_transition(state, "D") + assert not state.eol() + + def test_oracle_four_words(arc_eager, vocab): words = ["a", "b", "c", "d"] heads = [1, 1, 3, 3] diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 21094bcb1..b3b29d1f9 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -1,13 +1,16 @@ +import random + import pytest from numpy.testing import assert_equal -from spacy.attrs import ENT_IOB +from spacy.attrs import ENT_IOB from spacy import util, registry from spacy.lang.en import English +from spacy.lang.it import Italian from spacy.language import Language from spacy.lookups import Lookups from spacy.pipeline._parser_internals.ner import BiluoPushDown -from spacy.training import Example +from spacy.training import Example, iob_to_biluo from spacy.tokens import Doc, Span from spacy.vocab import Vocab import logging @@ -58,6 +61,152 @@ def tsys(vocab, entity_types): return BiluoPushDown(vocab.strings, actions) +@pytest.mark.parametrize("label", ["U-JOB-NAME"]) +@pytest.mark.issue(1967) +def test_issue1967(label): + nlp = Language() + config = {} + ner = nlp.create_pipe("ner", config=config) + example = Example.from_dict( + Doc(ner.vocab, words=["word"]), + { + "ids": [0], + "words": ["word"], + "tags": ["tag"], + "heads": [0], + "deps": ["dep"], + "entities": [label], + }, + ) + assert "JOB-NAME" in ner.moves.get_actions(examples=[example])[1] + + +@pytest.mark.issue(2179) +def test_issue2179(): + """Test that spurious 'extra_labels' aren't created when 
initializing NER.""" + nlp = Italian() + ner = nlp.add_pipe("ner") + ner.add_label("CITIZENSHIP") + nlp.initialize() + nlp2 = Italian() + nlp2.add_pipe("ner") + assert len(nlp2.get_pipe("ner").labels) == 0 + model = nlp2.get_pipe("ner").model + model.attrs["resize_output"](model, nlp.get_pipe("ner").moves.n_moves) + nlp2.from_bytes(nlp.to_bytes()) + assert "extra_labels" not in nlp2.get_pipe("ner").cfg + assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP",) + + +@pytest.mark.issue(2385) +def test_issue2385(): + """Test that IOB tags are correctly converted to BILUO tags.""" + # fix bug in labels with a 'b' character + tags1 = ("B-BRAWLER", "I-BRAWLER", "I-BRAWLER") + assert iob_to_biluo(tags1) == ["B-BRAWLER", "I-BRAWLER", "L-BRAWLER"] + # maintain support for iob1 format + tags2 = ("I-ORG", "I-ORG", "B-ORG") + assert iob_to_biluo(tags2) == ["B-ORG", "L-ORG", "U-ORG"] + # maintain support for iob2 format + tags3 = ("B-PERSON", "I-PERSON", "B-PERSON") + assert iob_to_biluo(tags3) == ["B-PERSON", "L-PERSON", "U-PERSON"] + + +@pytest.mark.issue(2800) +def test_issue2800(): + """Test issue that arises when too many labels are added to NER model. + Used to cause segfault. + """ + nlp = English() + train_data = [] + train_data.extend( + [Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})] + ) + entity_types = [str(i) for i in range(1000)] + ner = nlp.add_pipe("ner") + for entity_type in list(entity_types): + ner.add_label(entity_type) + optimizer = nlp.initialize() + for i in range(20): + losses = {} + random.shuffle(train_data) + for example in train_data: + nlp.update([example], sgd=optimizer, losses=losses, drop=0.5) + + +@pytest.mark.issue(3209) +def test_issue3209(): + """Test issue that occurred in spaCy nightly where NER labels were being + mapped to classes incorrectly after loading the model, when the labels + were added using ner.add_label(). 
+ """ + nlp = English() + ner = nlp.add_pipe("ner") + ner.add_label("ANIMAL") + nlp.initialize() + move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"] + assert ner.move_names == move_names + nlp2 = English() + ner2 = nlp2.add_pipe("ner") + model = ner2.model + model.attrs["resize_output"](model, ner.moves.n_moves) + nlp2.from_bytes(nlp.to_bytes()) + assert ner2.move_names == move_names + + +@pytest.mark.issue(4267) +def test_issue4267(): + """Test that running an entity_ruler after ner gives consistent results""" + nlp = English() + ner = nlp.add_pipe("ner") + ner.add_label("PEOPLE") + nlp.initialize() + assert "ner" in nlp.pipe_names + # assert that we have correct IOB annotations + doc1 = nlp("hi") + assert doc1.has_annotation("ENT_IOB") + for token in doc1: + assert token.ent_iob == 2 + # add entity ruler and run again + patterns = [{"label": "SOFTWARE", "pattern": "spacy"}] + ruler = nlp.add_pipe("entity_ruler") + ruler.add_patterns(patterns) + assert "entity_ruler" in nlp.pipe_names + assert "ner" in nlp.pipe_names + # assert that we still have correct IOB annotations + doc2 = nlp("hi") + assert doc2.has_annotation("ENT_IOB") + for token in doc2: + assert token.ent_iob == 2 + + +@pytest.mark.issue(4313) +def test_issue4313(): + """This should not crash or exit with some strange error code""" + beam_width = 16 + beam_density = 0.0001 + nlp = English() + config = { + "beam_width": beam_width, + "beam_density": beam_density, + } + ner = nlp.add_pipe("beam_ner", config=config) + ner.add_label("SOME_LABEL") + nlp.initialize() + # add a new label to the doc + doc = nlp("What do you think about Apple ?") + assert len(ner.labels) == 1 + assert "SOME_LABEL" in ner.labels + apple_ent = Span(doc, 5, 6, label="MY_ORG") + doc.ents = list(doc.ents) + [apple_ent] + + # ensure the beam_parse still works with the new label + docs = [doc] + ner.beam_parse(docs, drop=0.0, beam_width=beam_width, beam_density=beam_density) + assert len(ner.labels) == 2 + assert 
"MY_ORG" in ner.labels + + def test_get_oracle_moves(tsys, doc, entity_annots): example = Example.from_dict(doc, {"entities": entity_annots}) act_classes = tsys.get_oracle_sequence(example, _debug=False) diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index b7575d063..7bbb30d8e 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -1,15 +1,17 @@ import pytest from numpy.testing import assert_equal -from spacy.attrs import DEP +from thinc.api import Adam +from spacy import registry, util +from spacy.attrs import DEP, NORM from spacy.lang.en import English -from spacy.training import Example from spacy.tokens import Doc -from spacy import util, registry +from spacy.training import Example +from spacy.vocab import Vocab -from ..util import apply_transition_sequence, make_tempdir from ...pipeline import DependencyParser from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL +from ..util import apply_transition_sequence, make_tempdir TRAIN_DATA = [ ( @@ -59,6 +61,94 @@ PARTIAL_DATA = [ eps = 0.1 +@pytest.fixture +def vocab(): + return Vocab(lex_attr_getters={NORM: lambda s: s}) + + +@pytest.fixture +def parser(vocab): + vocab.strings.add("ROOT") + cfg = {"model": DEFAULT_PARSER_MODEL} + model = registry.resolve(cfg, validate=True)["model"] + parser = DependencyParser(vocab, model) + parser.cfg["token_vector_width"] = 4 + parser.cfg["hidden_width"] = 32 + # parser.add_label('right') + parser.add_label("left") + parser.initialize(lambda: [_parser_example(parser)]) + sgd = Adam(0.001) + + for i in range(10): + losses = {} + doc = Doc(vocab, words=["a", "b", "c", "d"]) + example = Example.from_dict( + doc, {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]} + ) + parser.update([example], sgd=sgd, losses=losses) + return parser + + +def _parser_example(parser): + doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) + gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]} + 
return Example.from_dict(doc, gold) + + +@pytest.mark.issue(2772) +def test_issue2772(en_vocab): + """Test that deprojectivization doesn't mess up sentence boundaries.""" + # fmt: off + words = ["When", "we", "write", "or", "communicate", "virtually", ",", "we", "can", "hide", "our", "true", "feelings", "."] + # fmt: on + # A tree with a non-projective (i.e. crossing) arc + # The arcs (0, 4) and (2, 9) cross. + heads = [4, 2, 9, 2, 2, 4, 9, 9, 9, 9, 12, 12, 9, 9] + deps = ["dep"] * len(heads) + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) + assert doc[1].is_sent_start is False + + +@pytest.mark.issue(3830) +def test_issue3830_no_subtok(): + """Test that the parser doesn't have subtok label if not learn_tokens""" + config = { + "learn_tokens": False, + } + model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"] + parser = DependencyParser(Vocab(), model, **config) + parser.add_label("nsubj") + assert "subtok" not in parser.labels + parser.initialize(lambda: [_parser_example(parser)]) + assert "subtok" not in parser.labels + + +@pytest.mark.issue(3830) +def test_issue3830_with_subtok(): + """Test that the parser does have subtok label if learn_tokens=True.""" + config = { + "learn_tokens": True, + } + model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"] + parser = DependencyParser(Vocab(), model, **config) + parser.add_label("nsubj") + assert "subtok" not in parser.labels + parser.initialize(lambda: [_parser_example(parser)]) + assert "subtok" in parser.labels + + +@pytest.mark.issue(7716) +@pytest.mark.xfail(reason="Not fixed yet") +def test_partial_annotation(parser): + doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) + doc[2].is_sent_start = False + # Note that if the following line is used, then doc[2].is_sent_start == False + # doc[3].is_sent_start = False + + doc = parser(doc) + assert doc[2].is_sent_start == False + + def test_parser_root(en_vocab): words = ["i", "do", "n't", "have", 
"other", "assistance"] heads = [3, 3, 3, 3, 5, 3] diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index a98d01964..3740e430e 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -1,18 +1,20 @@ from typing import Callable, Iterable + import pytest from numpy.testing import assert_equal + +from spacy import registry, util from spacy.attrs import ENT_KB_ID from spacy.compat import pickle -from spacy.kb import KnowledgeBase, get_candidates, Candidate -from spacy.vocab import Vocab - -from spacy import util, registry +from spacy.kb import Candidate, KnowledgeBase, get_candidates +from spacy.lang.en import English from spacy.ml import load_kb from spacy.scorer import Scorer -from spacy.training import Example -from spacy.lang.en import English from spacy.tests.util import make_tempdir from spacy.tokens import Span +from spacy.training import Example +from spacy.util import ensure_path +from spacy.vocab import Vocab @pytest.fixture @@ -25,6 +27,198 @@ def assert_almost_equal(a, b): assert a - delta <= b <= a + delta +@pytest.mark.issue(4674) +def test_issue4674(): + """Test that setting entities with overlapping identifiers does not mess up IO""" + nlp = English() + kb = KnowledgeBase(nlp.vocab, entity_vector_length=3) + vector1 = [0.9, 1.1, 1.01] + vector2 = [1.8, 2.25, 2.01] + with pytest.warns(UserWarning): + kb.set_entities( + entity_list=["Q1", "Q1"], + freq_list=[32, 111], + vector_list=[vector1, vector2], + ) + assert kb.get_size_entities() == 1 + # dumping to file & loading back in + with make_tempdir() as d: + dir_path = ensure_path(d) + if not dir_path.exists(): + dir_path.mkdir() + file_path = dir_path / "kb" + kb.to_disk(str(file_path)) + kb2 = KnowledgeBase(nlp.vocab, entity_vector_length=3) + kb2.from_disk(str(file_path)) + assert kb2.get_size_entities() == 1 + + +@pytest.mark.issue(6730) +def test_issue6730(en_vocab): + """Ensure that the KB does not 
accept empty strings, but otherwise IO works fine.""" + from spacy.kb import KnowledgeBase + + kb = KnowledgeBase(en_vocab, entity_vector_length=3) + kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3]) + + with pytest.raises(ValueError): + kb.add_alias(alias="", entities=["1"], probabilities=[0.4]) + assert kb.contains_alias("") is False + + kb.add_alias(alias="x", entities=["1"], probabilities=[0.2]) + kb.add_alias(alias="y", entities=["1"], probabilities=[0.1]) + + with make_tempdir() as tmp_dir: + kb.to_disk(tmp_dir) + kb.from_disk(tmp_dir) + assert kb.get_size_aliases() == 2 + assert set(kb.get_alias_strings()) == {"x", "y"} + + +@pytest.mark.issue(7065) +def test_issue7065(): + text = "Kathleen Battle sang in Mahler 's Symphony No. 8 at the Cincinnati Symphony Orchestra 's May Festival." + nlp = English() + nlp.add_pipe("sentencizer") + ruler = nlp.add_pipe("entity_ruler") + patterns = [ + { + "label": "THING", + "pattern": [ + {"LOWER": "symphony"}, + {"LOWER": "no"}, + {"LOWER": "."}, + {"LOWER": "8"}, + ], + } + ] + ruler.add_patterns(patterns) + + doc = nlp(text) + sentences = [s for s in doc.sents] + assert len(sentences) == 2 + sent0 = sentences[0] + ent = doc.ents[0] + assert ent.start < sent0.end < ent.end + assert sentences.index(ent.sent) == 0 + + +@pytest.mark.issue(7065) +def test_issue7065_b(): + # Test that the NEL doesn't crash when an entity crosses a sentence boundary + nlp = English() + vector_length = 3 + nlp.add_pipe("sentencizer") + text = "Mahler 's Symphony No. 8 was beautiful." 
+ entities = [(0, 6, "PERSON"), (10, 24, "WORK")] + links = { + (0, 6): {"Q7304": 1.0, "Q270853": 0.0}, + (10, 24): {"Q7304": 0.0, "Q270853": 1.0}, + } + sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0] + doc = nlp(text) + example = Example.from_dict( + doc, {"entities": entities, "links": links, "sent_starts": sent_starts} + ) + train_examples = [example] + + def create_kb(vocab): + # create artificial KB + mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) + mykb.add_entity(entity="Q270853", freq=12, entity_vector=[9, 1, -7]) + mykb.add_alias( + alias="No. 8", + entities=["Q270853"], + probabilities=[1.0], + ) + mykb.add_entity(entity="Q7304", freq=12, entity_vector=[6, -4, 3]) + mykb.add_alias( + alias="Mahler", + entities=["Q7304"], + probabilities=[1.0], + ) + return mykb + + # Create the Entity Linker component and add it to the pipeline + entity_linker = nlp.add_pipe("entity_linker", last=True) + entity_linker.set_kb(create_kb) + # train the NEL pipe + optimizer = nlp.initialize(get_examples=lambda: train_examples) + for i in range(2): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + + # Add a custom rule-based component to mimic NER + patterns = [ + {"label": "PERSON", "pattern": [{"LOWER": "mahler"}]}, + { + "label": "WORK", + "pattern": [ + {"LOWER": "symphony"}, + {"LOWER": "no"}, + {"LOWER": "."}, + {"LOWER": "8"}, + ], + }, + ] + ruler = nlp.add_pipe("entity_ruler", before="entity_linker") + ruler.add_patterns(patterns) + # test the trained model - this should not throw E148 + doc = nlp(text) + assert doc + + +def test_partial_links(): + # Test that having some entities on the doc without gold links, doesn't crash + TRAIN_DATA = [ + ( + "Russ Cochran his reprints include EC Comics.", + { + "links": {(0, 12): {"Q2146908": 1.0}}, + "entities": [(0, 12, "PERSON")], + "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0], + }, + ) + ] + nlp = English() + vector_length = 3 + train_examples = [] + for text, annotation in TRAIN_DATA: +
doc = nlp(text) + train_examples.append(Example.from_dict(doc, annotation)) + + def create_kb(vocab): + # create artificial KB + mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) + mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) + mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9]) + return mykb + + # Create and train the Entity Linker + entity_linker = nlp.add_pipe("entity_linker", last=True) + entity_linker.set_kb(create_kb) + optimizer = nlp.initialize(get_examples=lambda: train_examples) + for i in range(2): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + + # adding additional components that are required for the entity_linker + nlp.add_pipe("sentencizer", first=True) + patterns = [ + {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}, + {"label": "ORG", "pattern": [{"LOWER": "ec"}, {"LOWER": "comics"}]}, + ] + ruler = nlp.add_pipe("entity_ruler", before="entity_linker") + ruler.add_patterns(patterns) + + # this will run the pipeline on the examples and shouldn't crash + results = nlp.evaluate(train_examples) + assert "PERSON" in results["ents_per_type"] + assert "PERSON" in results["nel_f_per_type"] + assert "ORG" in results["ents_per_type"] + assert "ORG" not in results["nel_f_per_type"] + + def test_kb_valid_entities(nlp): """Test the valid construction of a KB with 3 entities and two aliases""" mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3) diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py index e66b49518..0cecafff3 100644 --- a/spacy/tests/pipeline/test_entity_ruler.py +++ b/spacy/tests/pipeline/test_entity_ruler.py @@ -1,9 +1,11 @@ import pytest from spacy import registry -from spacy.tokens import Span +from spacy.tokens import Doc, Span from spacy.language import Language -from spacy.pipeline import EntityRuler +from spacy.lang.en import English +from spacy.pipeline import EntityRuler, EntityRecognizer, 
merge_entities +from spacy.pipeline.ner import DEFAULT_NER_MODEL from spacy.errors import MatchPatternError from spacy.tests.util import make_tempdir @@ -34,6 +36,117 @@ def add_ent_component(doc): return doc +@pytest.mark.issue(3345) +def test_issue3345(): + """Test case where preset entity crosses sentence boundary.""" + nlp = English() + doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"]) + doc[4].is_sent_start = True + ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}]) + cfg = {"model": DEFAULT_NER_MODEL} + model = registry.resolve(cfg, validate=True)["model"] + ner = EntityRecognizer(doc.vocab, model) + # Add the OUT action. I wouldn't have thought this would be necessary... + ner.moves.add_action(5, "") + ner.add_label("GPE") + doc = ruler(doc) + # Get into the state just before "New" + state = ner.moves.init_batch([doc])[0] + ner.moves.apply_transition(state, "O") + ner.moves.apply_transition(state, "O") + ner.moves.apply_transition(state, "O") + # Check that B-GPE is valid. + assert ner.moves.is_valid(state, "B-GPE") + + +@pytest.mark.issue(4849) +def test_issue4849(): + nlp = English() + patterns = [ + {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"}, + {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"}, + ] + ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"}) + ruler.add_patterns(patterns) + text = """ + The left is starting to take aim at Democratic front-runner Joe Biden. + Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy." 
+ """ + # USING 1 PROCESS + count_ents = 0 + for doc in nlp.pipe([text], n_process=1): + count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) + assert count_ents == 2 + # USING 2 PROCESSES (only when the current ops are NumpyOps) + if isinstance(get_current_ops(), NumpyOps): + count_ents = 0 + for doc in nlp.pipe([text], n_process=2): + count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) + assert count_ents == 2 + + +@pytest.mark.issue(5918) +def test_issue5918(): + # Test edge case when merging entities. + nlp = English() + ruler = nlp.add_pipe("entity_ruler") + patterns = [ + {"label": "ORG", "pattern": "Digicon Inc"}, + {"label": "ORG", "pattern": "Rotan Mosle Inc's"}, + {"label": "ORG", "pattern": "Rotan Mosle Technology Partners Ltd"}, + ] + ruler.add_patterns(patterns) + + text = """ + Digicon Inc said it has completed the previously-announced disposition + of its computer systems division to an investment group led by + Rotan Mosle Inc's Rotan Mosle Technology Partners Ltd affiliate. + """ + doc = nlp(text) + assert len(doc.ents) == 3 + # make it so that the third span's head is within the entity (ent_iob=I) + # bug #5918 would wrongly transfer that I to the full entity, resulting in 2 instead of 3 final ents.
+ # TODO: test for logging here + # with pytest.warns(UserWarning): + # doc[29].head = doc[33] + doc = merge_entities(doc) + assert len(doc.ents) == 3 + + +@pytest.mark.issue(8168) +def test_issue8168(): + nlp = English() + ruler = nlp.add_pipe("entity_ruler") + patterns = [ + {"label": "ORG", "pattern": "Apple"}, + { + "label": "GPE", + "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], + "id": "san-francisco", + }, + { + "label": "GPE", + "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], + "id": "san-francisco", + }, + ] + ruler.add_patterns(patterns) + + assert ruler._ent_ids == {8043148519967183733: ("GPE", "san-francisco")} + + +@pytest.mark.issue(8216) +def test_entity_ruler_fix8216(nlp, patterns): + """Test that patterns don't get added excessively.""" + ruler = nlp.add_pipe("entity_ruler", config={"validate": True}) + ruler.add_patterns(patterns) + pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values()) + assert pattern_count > 0 + ruler.add_patterns([]) + after_count = sum(len(mm) for mm in ruler.matcher._patterns.values()) + assert after_count == pattern_count + + def test_entity_ruler_init(nlp, patterns): ruler = EntityRuler(nlp, patterns=patterns) assert len(ruler) == len(patterns) diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index 0c2554727..4128e2a48 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -1,4 +1,6 @@ import pytest + +import spacy from spacy.language import Language from spacy.lang.en import English from spacy.lang.de import German @@ -11,6 +13,37 @@ from pydantic import StrictInt, StrictStr from ..util import make_tempdir +@pytest.mark.issue(5137) +def test_issue5137(): + factory_name = "test_issue5137" + pipe_name = "my_component" + + @Language.factory(factory_name) + class MyComponent: + def __init__(self, nlp, name=pipe_name, categories="all_categories"): + self.nlp = nlp + self.categories = 
categories + self.name = name + + def __call__(self, doc): + pass + + def to_disk(self, path, **kwargs): + pass + + def from_disk(self, path, **cfg): + pass + + nlp = English() + my_component = nlp.add_pipe(factory_name, name=pipe_name) + assert my_component.categories == "all_categories" + with make_tempdir() as tmpdir: + nlp.to_disk(tmpdir) + overrides = {"components": {pipe_name: {"categories": "my_categories"}}} + nlp2 = spacy.load(tmpdir, config=overrides) + assert nlp2.get_pipe(pipe_name).categories == "my_categories" + + def test_pipe_function_component(): name = "test_component" diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index 87fd64307..4b8fb8ebc 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -1,9 +1,17 @@ +import gc + +import numpy import pytest +from thinc.api import get_current_ops + +from spacy.lang.en import English +from spacy.lang.en.syntax_iterators import noun_chunks from spacy.language import Language from spacy.pipeline import TrainablePipe +from spacy.tokens import Doc from spacy.training import Example from spacy.util import SimpleFrozenList, get_arg_names -from spacy.lang.en import English +from spacy.vocab import Vocab @pytest.fixture @@ -21,6 +29,138 @@ def other_pipe(doc): return doc +@pytest.mark.issue(1506) +def test_issue1506(): + def string_generator(): + for _ in range(10001): + yield "It's sentence produced by that bug." + for _ in range(10001): + yield "I erase some hbdsaj lemmas." + for _ in range(10001): + yield "I erase lemmas." + for _ in range(10001): + yield "It's sentence produced by that bug." + for _ in range(10001): + yield "It's sentence produced by that bug." + + nlp = English() + for i, d in enumerate(nlp.pipe(string_generator())): + # We should run cleanup more than one time to actually cleanup data. + # In first run — clean up only mark strings as «not hitted». 
+ if i == 10000 or i == 20000 or i == 30000: + gc.collect() + for t in d: + str(t.lemma_) + + +@pytest.mark.issue(1654) +def test_issue1654(): + nlp = Language(Vocab()) + assert not nlp.pipeline + + @Language.component("component") + def component(doc): + return doc + + nlp.add_pipe("component", name="1") + nlp.add_pipe("component", name="2", after="1") + nlp.add_pipe("component", name="3", after="2") + assert nlp.pipe_names == ["1", "2", "3"] + nlp2 = Language(Vocab()) + assert not nlp2.pipeline + nlp2.add_pipe("component", name="3") + nlp2.add_pipe("component", name="2", before="3") + nlp2.add_pipe("component", name="1", before="2") + assert nlp2.pipe_names == ["1", "2", "3"] + + +@pytest.mark.issue(3880) +def test_issue3880(): + """Test that `nlp.pipe()` works when an empty string ends the batch. + + Fixed in v7.0.5 of Thinc. + """ + texts = ["hello", "world", "", ""] + nlp = English() + nlp.add_pipe("parser").add_label("dep") + nlp.add_pipe("ner").add_label("PERSON") + nlp.add_pipe("tagger").add_label("NN") + nlp.initialize() + for doc in nlp.pipe(texts): + pass + + +@pytest.mark.issue(5082) +def test_issue5082(): + # Ensure the 'merge_entities' pipeline does something sensible for the vectors of the merged tokens + nlp = English() + vocab = nlp.vocab + array1 = numpy.asarray([0.1, 0.5, 0.8], dtype=numpy.float32) + array2 = numpy.asarray([-0.2, -0.6, -0.9], dtype=numpy.float32) + array3 = numpy.asarray([0.3, -0.1, 0.7], dtype=numpy.float32) + array4 = numpy.asarray([0.5, 0, 0.3], dtype=numpy.float32) + array34 = numpy.asarray([0.4, -0.05, 0.5], dtype=numpy.float32) + vocab.set_vector("I", array1) + vocab.set_vector("like", array2) + vocab.set_vector("David", array3) + vocab.set_vector("Bowie", array4) + text = "I like David Bowie" + patterns = [ + {"label": "PERSON", "pattern": [{"LOWER": "david"}, {"LOWER": "bowie"}]} + ] + ruler = nlp.add_pipe("entity_ruler") + ruler.add_patterns(patterns) + parsed_vectors_1 = [t.vector for t in nlp(text)] + assert 
len(parsed_vectors_1) == 4 + ops = get_current_ops() + numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[0]), array1) + numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[1]), array2) + numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[2]), array3) + numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[3]), array4) + nlp.add_pipe("merge_entities") + parsed_vectors_2 = [t.vector for t in nlp(text)] + assert len(parsed_vectors_2) == 3 + numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[0]), array1) + numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[1]), array2) + numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[2]), array34) + + +@pytest.mark.issue(5458) +def test_issue5458(): + # Test that the noun chunker does not generate overlapping spans + # fmt: off + words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."] + vocab = Vocab(strings=words) + deps = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"] + pos = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"] + heads = [0, 2, 0, 9, 6, 6, 2, 6, 7, 7, 0] + # fmt: on + en_doc = Doc(vocab, words=words, pos=pos, heads=heads, deps=deps) + en_doc.noun_chunks_iterator = noun_chunks + + # if there are overlapping spans, this will fail with an E102 error "Can't merge non-disjoint spans" + nlp = English() + merge_nps = nlp.create_pipe("merge_noun_chunks") + merge_nps(en_doc) + + +def test_multiple_predictions(): + class DummyPipe(TrainablePipe): + def __init__(self): + self.model = "dummy_model" + + def predict(self, docs): + return ([1, 2, 3], [4, 5, 6]) + + def set_annotations(self, docs, scores): + return docs + + nlp = Language() + doc = nlp.make_doc("foo") + dummy_pipe = DummyPipe() + dummy_pipe(doc) + + def test_add_pipe_no_name(nlp): nlp.add_pipe("new_pipe") assert "new_pipe" in nlp.pipe_names diff --git
a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index ec14b70da..96e75851e 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -6,10 +6,27 @@ from spacy import util from spacy.training import Example from spacy.lang.en import English from spacy.language import Language +from thinc.api import compounding from ..util import make_tempdir +@pytest.mark.issue(4348) +def test_issue4348(): + """Test that training the tagger with empty data, doesn't throw errors""" + nlp = English() + example = Example.from_dict(nlp.make_doc(""), {"tags": []}) + TRAIN_DATA = [example, example] + tagger = nlp.add_pipe("tagger") + tagger.add_label("A") + optimizer = nlp.initialize() + for i in range(5): + losses = {} + batches = util.minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) + for batch in batches: + nlp.update(batch, sgd=optimizer, losses=losses) + + def test_label_types(): nlp = Language() tagger = nlp.add_pipe("tagger") diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index b134b8508..282789f2b 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -1,20 +1,31 @@ -import pytest import random + import numpy.random +import pytest from numpy.testing import assert_almost_equal -from thinc.api import fix_random_seed +from thinc.api import Config, compounding, fix_random_seed, get_current_ops +from wasabi import msg + +import spacy from spacy import util +from spacy.cli.evaluate import print_prf_per_type, print_textcats_auc_per_cat from spacy.lang.en import English from spacy.language import Language from spacy.pipeline import TextCategorizer -from spacy.tokens import Doc +from spacy.pipeline.textcat import single_label_bow_config +from spacy.pipeline.textcat import single_label_cnn_config +from spacy.pipeline.textcat import single_label_default_config +from spacy.pipeline.textcat_multilabel import multi_label_bow_config +from 
spacy.pipeline.textcat_multilabel import multi_label_cnn_config +from spacy.pipeline.textcat_multilabel import multi_label_default_config from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from spacy.scorer import Scorer +from spacy.tokens import Doc, DocBin from spacy.training import Example +from spacy.training.initialize import init_nlp from ..util import make_tempdir - TRAIN_DATA_SINGLE_LABEL = [ ("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}), ("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}), @@ -48,6 +59,224 @@ def make_get_examples_multi_label(nlp): return get_examples +@pytest.mark.issue(3611) +def test_issue3611(): + """Test whether adding n-grams in the textcat works even when n > token length of some docs""" + unique_classes = ["offensive", "inoffensive"] + x_train = [ + "This is an offensive text", + "This is the second offensive text", + "inoff", + ] + y_train = ["offensive", "offensive", "inoffensive"] + nlp = spacy.blank("en") + # preparing the data + train_data = [] + for text, train_instance in zip(x_train, y_train): + cat_dict = {label: label == train_instance for label in unique_classes} + train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict})) + # add a text categorizer component + model = { + "@architectures": "spacy.TextCatBOW.v1", + "exclusive_classes": True, + "ngram_size": 2, + "no_output_layer": False, + } + textcat = nlp.add_pipe("textcat", config={"model": model}, last=True) + for label in unique_classes: + textcat.add_label(label) + # training the network + with nlp.select_pipes(enable="textcat"): + optimizer = nlp.initialize() + for i in range(3): + losses = {} + batches = util.minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) + + for batch in batches: + nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses) + + +@pytest.mark.issue(4030) +def test_issue4030(): + """Test whether textcat works fine with empty doc""" + unique_classes = ["offensive", 
"inoffensive"] + x_train = [ + "This is an offensive text", + "This is the second offensive text", + "inoff", + ] + y_train = ["offensive", "offensive", "inoffensive"] + nlp = spacy.blank("en") + # preparing the data + train_data = [] + for text, train_instance in zip(x_train, y_train): + cat_dict = {label: label == train_instance for label in unique_classes} + train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict})) + # add a text categorizer component + model = { + "@architectures": "spacy.TextCatBOW.v1", + "exclusive_classes": True, + "ngram_size": 2, + "no_output_layer": False, + } + textcat = nlp.add_pipe("textcat", config={"model": model}, last=True) + for label in unique_classes: + textcat.add_label(label) + # training the network + with nlp.select_pipes(enable="textcat"): + optimizer = nlp.initialize() + for i in range(3): + losses = {} + batches = util.minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) + + for batch in batches: + nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses) + # processing of an empty doc should result in 0.0 for all categories + doc = nlp("") + assert doc.cats["offensive"] == 0.0 + assert doc.cats["inoffensive"] == 0.0 + + +@pytest.mark.parametrize( + "textcat_config", + [ + single_label_default_config, + single_label_bow_config, + single_label_cnn_config, + multi_label_default_config, + multi_label_bow_config, + multi_label_cnn_config, + ], +) +@pytest.mark.issue(5551) +def test_issue5551(textcat_config): + """Test that after fixing the random seed, the results of the pipeline are truly identical""" + component = "textcat" + + pipe_cfg = Config().from_str(textcat_config) + results = [] + for i in range(3): + fix_random_seed(0) + nlp = English() + text = "Once hot, form ping-pong-ball-sized balls of the mixture, each weighing roughly 25 g." 
+ annots = {"cats": {"Labe1": 1.0, "Label2": 0.0, "Label3": 0.0}} + pipe = nlp.add_pipe(component, config=pipe_cfg, last=True) + for label in set(annots["cats"]): + pipe.add_label(label) + # Train + nlp.initialize() + doc = nlp.make_doc(text) + nlp.update([Example.from_dict(doc, annots)]) + # Store the result of each iteration + result = pipe.model.predict([doc]) + results.append(result[0]) + # All results should be the same because of the fixed seed + assert len(results) == 3 + ops = get_current_ops() + assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1]), decimal=5) + assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]), decimal=5) + + +CONFIG_ISSUE_6908 = """ +[paths] +train = "TRAIN_PLACEHOLDER" +raw = null +init_tok2vec = null +vectors = null + +[system] +seed = 0 +gpu_allocator = null + +[nlp] +lang = "en" +pipeline = ["textcat"] +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} +disabled = [] +before_creation = null +after_creation = null +after_pipeline_creation = null +batch_size = 1000 + +[components] + +[components.textcat] +factory = "TEXTCAT_PLACEHOLDER" + +[corpora] + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths:train} + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths:train} + + +[training] +train_corpus = "corpora.train" +dev_corpus = "corpora.dev" +seed = ${system.seed} +gpu_allocator = ${system.gpu_allocator} +frozen_components = [] +before_to_disk = null + +[pretraining] + +[initialize] +vectors = ${paths.vectors} +init_tok2vec = ${paths.init_tok2vec} +vocab_data = null +lookups = null +before_init = null +after_init = null + +[initialize.components] + +[initialize.components.textcat] +labels = ['label1', 'label2'] + +[initialize.tokenizer] +""" + + +@pytest.mark.parametrize( + "component_name", + ["textcat", "textcat_multilabel"], +) +@pytest.mark.issue(6908) +def test_issue6908(component_name): + """Test initializing textcat with labels in a list""" + + def create_data(out_file):
+ nlp = spacy.blank("en") + doc = nlp.make_doc("Some text") + doc.cats = {"label1": 0, "label2": 1} + out_data = DocBin(docs=[doc]).to_bytes() + with out_file.open("wb") as file_: + file_.write(out_data) + + with make_tempdir() as tmp_path: + train_path = tmp_path / "train.spacy" + create_data(train_path) + config_str = CONFIG_ISSUE_6908.replace("TEXTCAT_PLACEHOLDER", component_name) + config_str = config_str.replace("TRAIN_PLACEHOLDER", train_path.as_posix()) + config = util.load_config_from_str(config_str) + init_nlp(config) + + +@pytest.mark.issue(7019) +def test_issue7019(): + scores = {"LABEL_A": 0.39829102, "LABEL_B": 0.938298329382, "LABEL_C": None} + print_textcats_auc_per_cat(msg, scores) + scores = { + "LABEL_A": {"p": 0.3420302, "r": 0.3929020, "f": 0.49823928932}, + "LABEL_B": {"p": None, "r": None, "f": None}, + } + print_prf_per_type(msg, scores, name="foo", type="bar") + + @pytest.mark.skip(reason="Test is flakey when run with others") def test_simple_train(): nlp = Language() diff --git a/spacy/tests/regression/__init__.py b/spacy/tests/regression/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py deleted file mode 100644 index 4846d2075..000000000 --- a/spacy/tests/regression/test_issue1-1000.py +++ /dev/null @@ -1,486 +0,0 @@ -import pytest -import random -from spacy import util -from spacy.training import Example -from spacy.matcher import Matcher -from spacy.attrs import IS_PUNCT, ORTH, LOWER -from spacy.vocab import Vocab -from spacy.lang.en import English -from spacy.lookups import Lookups -from spacy.tokens import Doc, Span - -from ..util import make_tempdir - - -@pytest.mark.issue(118) -@pytest.mark.parametrize( - "patterns", - [ - [[{"LOWER": "celtics"}], [{"LOWER": "boston"}, {"LOWER": "celtics"}]], - [[{"LOWER": "boston"}, {"LOWER": "celtics"}], [{"LOWER": "celtics"}]], - ], -) -def test_issue118(en_tokenizer, patterns): - 
"""Test a bug that arose from having overlapping matches""" - text = ( - "how many points did lebron james score against the boston celtics last night" - ) - doc = en_tokenizer(text) - ORG = doc.vocab.strings["ORG"] - matcher = Matcher(doc.vocab) - matcher.add("BostonCeltics", patterns) - assert len(list(doc.ents)) == 0 - matches = [(ORG, start, end) for _, start, end in matcher(doc)] - assert matches == [(ORG, 9, 11), (ORG, 10, 11)] - doc.ents = matches[:1] - ents = list(doc.ents) - assert len(ents) == 1 - assert ents[0].label == ORG - assert ents[0].start == 9 - assert ents[0].end == 11 - - -@pytest.mark.issue(118) -@pytest.mark.parametrize( - "patterns", - [ - [[{"LOWER": "boston"}], [{"LOWER": "boston"}, {"LOWER": "celtics"}]], - [[{"LOWER": "boston"}, {"LOWER": "celtics"}], [{"LOWER": "boston"}]], - ], -) -def test_issue118_prefix_reorder(en_tokenizer, patterns): - """Test a bug that arose from having overlapping matches""" - text = ( - "how many points did lebron james score against the boston celtics last night" - ) - doc = en_tokenizer(text) - ORG = doc.vocab.strings["ORG"] - matcher = Matcher(doc.vocab) - matcher.add("BostonCeltics", patterns) - assert len(list(doc.ents)) == 0 - matches = [(ORG, start, end) for _, start, end in matcher(doc)] - doc.ents += tuple(matches)[1:] - assert matches == [(ORG, 9, 10), (ORG, 9, 11)] - ents = doc.ents - assert len(ents) == 1 - assert ents[0].label == ORG - assert ents[0].start == 9 - assert ents[0].end == 11 - - -@pytest.mark.issue(242) -def test_issue242(en_tokenizer): - """Test overlapping multi-word phrases.""" - text = "There are different food safety standards in different countries." 
- patterns = [ - [{"LOWER": "food"}, {"LOWER": "safety"}], - [{"LOWER": "safety"}, {"LOWER": "standards"}], - ] - doc = en_tokenizer(text) - matcher = Matcher(doc.vocab) - matcher.add("FOOD", patterns) - matches = [(ent_type, start, end) for ent_type, start, end in matcher(doc)] - match1, match2 = matches - assert match1[1] == 3 - assert match1[2] == 5 - assert match2[1] == 4 - assert match2[2] == 6 - with pytest.raises(ValueError): - # One token can only be part of one entity, so test that the matches - # can't be added as entities - doc.ents += tuple(matches) - - -@pytest.mark.issue(309) -def test_issue309(en_vocab): - """Test Issue #309: SBD fails on empty string""" - doc = Doc(en_vocab, words=[" "], heads=[0], deps=["ROOT"]) - assert len(doc) == 1 - sents = list(doc.sents) - assert len(sents) == 1 - - -@pytest.mark.issue(351) -def test_issue351(en_tokenizer): - doc = en_tokenizer(" This is a cat.") - assert doc[0].idx == 0 - assert len(doc[0]) == 3 - assert doc[1].idx == 3 - - -@pytest.mark.issue(360) -def test_issue360(en_tokenizer): - """Test tokenization of big ellipsis""" - tokens = en_tokenizer("$45...............Asking") - assert len(tokens) > 2 - - -@pytest.mark.issue(361) -@pytest.mark.parametrize("text1,text2", [("cat", "dog")]) -def test_issue361(en_vocab, text1, text2): - """Test Issue #361: Equality of lexemes""" - assert en_vocab[text1] == en_vocab[text1] - assert en_vocab[text1] != en_vocab[text2] - - -@pytest.mark.issue(587) -def test_issue587(en_tokenizer): - """Test that Matcher doesn't segfault on particular input""" - doc = en_tokenizer("a b; c") - matcher = Matcher(doc.vocab) - matcher.add("TEST1", [[{ORTH: "a"}, {ORTH: "b"}]]) - matches = matcher(doc) - assert len(matches) == 1 - matcher.add("TEST2", [[{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "c"}]]) - matches = matcher(doc) - assert len(matches) == 2 - matcher.add("TEST3", [[{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "d"}]]) - matches = matcher(doc) - assert len(matches) 
== 2 - - -@pytest.mark.issue(588) -def test_issue588(en_vocab): - matcher = Matcher(en_vocab) - with pytest.raises(ValueError): - matcher.add("TEST", [[]]) - - -@pytest.mark.issue(590) -def test_issue590(en_vocab): - """Test overlapping matches""" - doc = Doc(en_vocab, words=["n", "=", "1", ";", "a", ":", "5", "%"]) - matcher = Matcher(en_vocab) - matcher.add( - "ab", [[{"IS_ALPHA": True}, {"ORTH": ":"}, {"LIKE_NUM": True}, {"ORTH": "%"}]] - ) - matcher.add("ab", [[{"IS_ALPHA": True}, {"ORTH": "="}, {"LIKE_NUM": True}]]) - matches = matcher(doc) - assert len(matches) == 2 - - -@pytest.mark.issue(595) -@pytest.mark.skip(reason="Old vocab-based lemmatization") -def test_issue595(): - """Test lemmatization of base forms""" - words = ["Do", "n't", "feed", "the", "dog"] - lookups = Lookups() - lookups.add_table("lemma_rules", {"verb": [["ed", "e"]]}) - lookups.add_table("lemma_index", {"verb": {}}) - lookups.add_table("lemma_exc", {"verb": {}}) - vocab = Vocab() - doc = Doc(vocab, words=words) - doc[2].tag_ = "VB" - assert doc[2].text == "feed" - assert doc[2].lemma_ == "feed" - - -@pytest.mark.issue(599) -def test_issue599(en_vocab): - doc = Doc(en_vocab) - doc2 = Doc(doc.vocab) - doc2.from_bytes(doc.to_bytes()) - assert doc2.has_annotation("DEP") - - -@pytest.mark.issue(600) -def test_issue600(): - vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}}) - doc = Doc(vocab, words=["hello"]) - doc[0].tag_ = "NN" - - -@pytest.mark.issue(615) -def test_issue615(en_tokenizer): - def merge_phrases(matcher, doc, i, matches): - """Merge a phrase. We have to be careful here because we'll change the - token indices. 
To avoid problems, merge all the phrases once we're called - on the last match.""" - if i != len(matches) - 1: - return None - spans = [Span(doc, start, end, label=label) for label, start, end in matches] - with doc.retokenize() as retokenizer: - for span in spans: - tag = "NNP" if span.label_ else span.root.tag_ - attrs = {"tag": tag, "lemma": span.text} - retokenizer.merge(span, attrs=attrs) - doc.ents = doc.ents + (span,) - - text = "The golf club is broken" - pattern = [{"ORTH": "golf"}, {"ORTH": "club"}] - label = "Sport_Equipment" - doc = en_tokenizer(text) - matcher = Matcher(doc.vocab) - matcher.add(label, [pattern], on_match=merge_phrases) - matcher(doc) - entities = list(doc.ents) - assert entities != [] - assert entities[0].label != 0 - - -@pytest.mark.issue(736) -@pytest.mark.parametrize("text,number", [("7am", "7"), ("11p.m.", "11")]) -def test_issue736(en_tokenizer, text, number): - """Test that times like "7am" are tokenized correctly and that numbers are - converted to string.""" - tokens = en_tokenizer(text) - assert len(tokens) == 2 - assert tokens[0].text == number - - -@pytest.mark.issue(740) -@pytest.mark.parametrize("text", ["3/4/2012", "01/12/1900"]) -def test_issue740(en_tokenizer, text): - """Test that dates are not split and kept as one token. This behaviour is - currently inconsistent, since dates separated by hyphens are still split. 
- This will be hard to prevent without causing clashes with numeric ranges.""" - tokens = en_tokenizer(text) - assert len(tokens) == 1 - - -@pytest.mark.issue(743) -def test_issue743(): - doc = Doc(Vocab(), ["hello", "world"]) - token = doc[0] - s = set([token]) - items = list(s) - assert items[0] is token - - -@pytest.mark.issue(744) -@pytest.mark.parametrize("text", ["We were scared", "We Were Scared"]) -def test_issue744(en_tokenizer, text): - """Test that 'were' and 'Were' are excluded from the contractions - generated by the English tokenizer exceptions.""" - tokens = en_tokenizer(text) - assert len(tokens) == 3 - assert tokens[1].text.lower() == "were" - - -@pytest.mark.issue(759) -@pytest.mark.parametrize( - "text,is_num", [("one", True), ("ten", True), ("teneleven", False)] -) -def test_issue759(en_tokenizer, text, is_num): - tokens = en_tokenizer(text) - assert tokens[0].like_num == is_num - - -@pytest.mark.issue(775) -@pytest.mark.parametrize("text", ["Shell", "shell", "Shed", "shed"]) -def test_issue775(en_tokenizer, text): - """Test that 'Shell' and 'shell' are excluded from the contractions - generated by the English tokenizer exceptions.""" - tokens = en_tokenizer(text) - assert len(tokens) == 1 - assert tokens[0].text == text - - -@pytest.mark.issue(792) -@pytest.mark.parametrize("text", ["This is a string ", "This is a string\u0020"]) -def test_issue792(en_tokenizer, text): - """Test for Issue #792: Trailing whitespace is removed after tokenization.""" - doc = en_tokenizer(text) - assert "".join([token.text_with_ws for token in doc]) == text - - -@pytest.mark.issue(792) -@pytest.mark.parametrize("text", ["This is a string", "This is a string\n"]) -def test_control_issue792(en_tokenizer, text): - """Test base case for Issue #792: Non-trailing whitespace""" - doc = en_tokenizer(text) - assert "".join([token.text_with_ws for token in doc]) == text - - -@pytest.mark.issue(801) -@pytest.mark.skip( - reason="Can not be fixed unless with variable-width 
lookbehinds, cf. PR #3218" -) -@pytest.mark.parametrize( - "text,tokens", - [ - ('"deserve,"--and', ['"', "deserve", ',"--', "and"]), - ("exception;--exclusive", ["exception", ";--", "exclusive"]), - ("day.--Is", ["day", ".--", "Is"]), - ("refinement:--just", ["refinement", ":--", "just"]), - ("memories?--To", ["memories", "?--", "To"]), - ("Useful.=--Therefore", ["Useful", ".=--", "Therefore"]), - ("=Hope.=--Pandora", ["=", "Hope", ".=--", "Pandora"]), - ], -) -def test_issue801(en_tokenizer, text, tokens): - """Test that special characters + hyphens are split correctly.""" - doc = en_tokenizer(text) - assert len(doc) == len(tokens) - assert [t.text for t in doc] == tokens - - -@pytest.mark.issue(805) -@pytest.mark.parametrize( - "text,expected_tokens", - [ - ( - "Smörsåsen används bl.a. till fisk", - ["Smörsåsen", "används", "bl.a.", "till", "fisk"], - ), - ( - "Jag kommer först kl. 13 p.g.a. diverse förseningar", - ["Jag", "kommer", "först", "kl.", "13", "p.g.a.", "diverse", "förseningar"], - ), - ], -) -def test_issue805(sv_tokenizer, text, expected_tokens): - tokens = sv_tokenizer(text) - token_list = [token.text for token in tokens if not token.is_space] - assert expected_tokens == token_list - - -@pytest.mark.issue(850) -def test_issue850(): - """The variable-length pattern matches the succeeding token. 
Check we - handle the ambiguity correctly.""" - vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()}) - matcher = Matcher(vocab) - pattern = [{"LOWER": "bob"}, {"OP": "*"}, {"LOWER": "frank"}] - matcher.add("FarAway", [pattern]) - doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"]) - match = matcher(doc) - assert len(match) == 1 - ent_id, start, end = match[0] - assert start == 0 - assert end == 4 - - -@pytest.mark.issue(850) -def test_issue850_basic(): - """Test Matcher matches with '*' operator and Boolean flag""" - vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()}) - matcher = Matcher(vocab) - pattern = [{"LOWER": "bob"}, {"OP": "*", "LOWER": "and"}, {"LOWER": "frank"}] - matcher.add("FarAway", [pattern]) - doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"]) - match = matcher(doc) - assert len(match) == 1 - ent_id, start, end = match[0] - assert start == 0 - assert end == 4 - - -@pytest.mark.issue(852) -@pytest.mark.skip( - reason="French exception list is not enabled in the default tokenizer anymore" -) -@pytest.mark.parametrize( - "text", ["au-delàs", "pair-programmâmes", "terra-formées", "σ-compacts"] -) -def test_issue852(fr_tokenizer, text): - """Test that French tokenizer exceptions are imported correctly.""" - tokens = fr_tokenizer(text) - assert len(tokens) == 1 - - -@pytest.mark.issue(859) -@pytest.mark.parametrize( - "text", ["aaabbb@ccc.com\nThank you!", "aaabbb@ccc.com \nThank you!"] -) -def test_issue859(en_tokenizer, text): - """Test that no extra space is added in doc.text method.""" - doc = en_tokenizer(text) - assert doc.text == text - - -@pytest.mark.issue(886) -@pytest.mark.parametrize("text", ["Datum:2014-06-02\nDokument:76467"]) -def test_issue886(en_tokenizer, text): - """Test that token.idx matches the original text index for texts with newlines.""" - doc = en_tokenizer(text) - for token in doc: - assert len(token.text) == len(token.text_with_ws) - assert text[token.idx] == 
token.text[0] - - -@pytest.mark.issue(891) -@pytest.mark.parametrize("text", ["want/need"]) -def test_issue891(en_tokenizer, text): - """Test that / infixes are split correctly.""" - tokens = en_tokenizer(text) - assert len(tokens) == 3 - assert tokens[1].text == "/" - - -@pytest.mark.issue(912) -@pytest.mark.skip(reason="Old vocab-based lemmatization") -@pytest.mark.parametrize( - "text,tag,lemma", - [("anus", "NN", "anus"), ("princess", "NN", "princess"), ("inner", "JJ", "inner")], -) -def test_issue912(en_vocab, text, tag, lemma): - """Test base-forms are preserved.""" - doc = Doc(en_vocab, words=[text]) - doc[0].tag_ = tag - assert doc[0].lemma_ == lemma - - -@pytest.mark.issue(957) -@pytest.mark.slow -def test_issue957(en_tokenizer): - """Test that spaCy doesn't hang on many punctuation characters. - If this test hangs, check (new) regular expressions for conflicting greedy operators - """ - # Skip test if pytest-timeout is not installed - pytest.importorskip("pytest_timeout") - for punct in [".", ",", "'", '"', ":", "?", "!", ";", "-"]: - string = "0" - for i in range(1, 100): - string += punct + str(i) - doc = en_tokenizer(string) - assert doc - - -@pytest.mark.issue(999) -def test_issue999(): - """Test that adding entities and resuming training works passably OK. - There are two issues here: - 1) We have to re-add labels. This isn't very nice. - 2) There's no way to set the learning rate for the weight update, so we - end up out-of-scale, causing it to learn too fast. 
- """ - TRAIN_DATA = [ - ["hey", []], - ["howdy", []], - ["hey there", []], - ["hello", []], - ["hi", []], - ["i'm looking for a place to eat", []], - ["i'm looking for a place in the north of town", [(31, 36, "LOCATION")]], - ["show me chinese restaurants", [(8, 15, "CUISINE")]], - ["show me chines restaurants", [(8, 14, "CUISINE")]], - ] - nlp = English() - ner = nlp.add_pipe("ner") - for _, offsets in TRAIN_DATA: - for start, end, label in offsets: - ner.add_label(label) - nlp.initialize() - for itn in range(20): - random.shuffle(TRAIN_DATA) - for raw_text, entity_offsets in TRAIN_DATA: - example = Example.from_dict( - nlp.make_doc(raw_text), {"entities": entity_offsets} - ) - nlp.update([example]) - - with make_tempdir() as model_dir: - nlp.to_disk(model_dir) - nlp2 = util.load_model_from_path(model_dir) - - for raw_text, entity_offsets in TRAIN_DATA: - doc = nlp2(raw_text) - ents = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents} - for start, end, label in entity_offsets: - if (start, end) in ents: - assert ents[(start, end)] == label - break - else: - if entity_offsets: - raise Exception(ents) diff --git a/spacy/tests/regression/test_issue1001-1500.py b/spacy/tests/regression/test_issue1001-1500.py deleted file mode 100644 index 0a60e4477..000000000 --- a/spacy/tests/regression/test_issue1001-1500.py +++ /dev/null @@ -1,174 +0,0 @@ -import pytest -import re -from spacy.tokens import Doc -from spacy.vocab import Vocab -from spacy.lang.en import English -from spacy.lang.lex_attrs import LEX_ATTRS -from spacy.matcher import Matcher -from spacy.tokenizer import Tokenizer -from spacy.symbols import ORTH, LEMMA, POS - - -@pytest.mark.issue(1061) -def test_issue1061(): - """Test special-case works after tokenizing. Was caching problem.""" - text = "I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_." 
- tokenizer = English().tokenizer - doc = tokenizer(text) - assert "MATH" in [w.text for w in doc] - assert "_MATH_" not in [w.text for w in doc] - - tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}]) - doc = tokenizer(text) - assert "_MATH_" in [w.text for w in doc] - assert "MATH" not in [w.text for w in doc] - - # For sanity, check it works when pipeline is clean. - tokenizer = English().tokenizer - tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}]) - doc = tokenizer(text) - assert "_MATH_" in [w.text for w in doc] - assert "MATH" not in [w.text for w in doc] - - -@pytest.mark.skip( - reason="Can not be fixed without variable-width look-behind (which we don't want)" -) -@pytest.mark.issue(1235) -def test_issue1235(): - """Test that g is not split of if preceded by a number and a letter""" - nlp = English() - testwords = "e2g 2g 52g" - doc = nlp(testwords) - assert len(doc) == 5 - assert doc[0].text == "e2g" - assert doc[1].text == "2" - assert doc[2].text == "g" - assert doc[3].text == "52" - assert doc[4].text == "g" - - -@pytest.mark.issue(1242) -def test_issue1242(): - nlp = English() - doc = nlp("") - assert len(doc) == 0 - docs = list(nlp.pipe(["", "hello"])) - assert len(docs[0]) == 0 - assert len(docs[1]) == 1 - - -@pytest.mark.skip(reason="v3 no longer supports LEMMA/POS in tokenizer special cases") -@pytest.mark.issue(1250) -def test_issue1250(): - """Test cached special cases.""" - special_case = [{ORTH: "reimbur", LEMMA: "reimburse", POS: "VERB"}] - nlp = English() - nlp.tokenizer.add_special_case("reimbur", special_case) - lemmas = [w.lemma_ for w in nlp("reimbur, reimbur...")] - assert lemmas == ["reimburse", ",", "reimburse", "..."] - lemmas = [w.lemma_ for w in nlp("reimbur, reimbur...")] - assert lemmas == ["reimburse", ",", "reimburse", "..."] - - -@pytest.mark.issue(1257) -def test_issue1257(): - """Test that tokens compare correctly.""" - doc1 = Doc(Vocab(), words=["a", "b", "c"]) - doc2 = Doc(Vocab(), words=["a", "c", "e"]) - 
assert doc1[0] != doc2[0] - assert not doc1[0] == doc2[0] - - -@pytest.mark.issue(1375) -def test_issue1375(): - """Test that token.nbor() raises IndexError for out-of-bounds access.""" - doc = Doc(Vocab(), words=["0", "1", "2"]) - with pytest.raises(IndexError): - assert doc[0].nbor(-1) - assert doc[1].nbor(-1).text == "0" - with pytest.raises(IndexError): - assert doc[2].nbor(1) - assert doc[1].nbor(1).text == "2" - - -@pytest.mark.issue(1434) -def test_issue1434(): - """Test matches occur when optional element at end of short doc.""" - pattern = [{"ORTH": "Hello"}, {"IS_ALPHA": True, "OP": "?"}] - vocab = Vocab(lex_attr_getters=LEX_ATTRS) - hello_world = Doc(vocab, words=["Hello", "World"]) - hello = Doc(vocab, words=["Hello"]) - matcher = Matcher(vocab) - matcher.add("MyMatcher", [pattern]) - matches = matcher(hello_world) - assert matches - matches = matcher(hello) - assert matches - - -@pytest.mark.parametrize( - "string,start,end", - [ - ("a", 0, 1), - ("a b", 0, 2), - ("a c", 0, 1), - ("a b c", 0, 2), - ("a b b c", 0, 3), - ("a b b", 0, 3), - ], -) -@pytest.mark.issue(1450) -def test_issue1450(string, start, end): - """Test matcher works when patterns end with * operator.""" - pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}] - matcher = Matcher(Vocab()) - matcher.add("TSTEND", [pattern]) - doc = Doc(Vocab(), words=string.split()) - matches = matcher(doc) - if start is None or end is None: - assert matches == [] - assert matches[-1][1] == start - assert matches[-1][2] == end - - -@pytest.mark.issue(1488) -def test_issue1488(): - prefix_re = re.compile(r"""[\[\("']""") - suffix_re = re.compile(r"""[\]\)"']""") - infix_re = re.compile(r"""[-~\.]""") - simple_url_re = re.compile(r"""^https?://""") - - def my_tokenizer(nlp): - return Tokenizer( - nlp.vocab, - {}, - prefix_search=prefix_re.search, - suffix_search=suffix_re.search, - infix_finditer=infix_re.finditer, - token_match=simple_url_re.match, - ) - - nlp = English() - nlp.tokenizer = my_tokenizer(nlp) 
- doc = nlp("This is a test.") - for token in doc: - assert token.text - - -@pytest.mark.issue(1494) -def test_issue1494(): - infix_re = re.compile(r"""[^a-z]""") - test_cases = [ - ("token 123test", ["token", "1", "2", "3", "test"]), - ("token 1test", ["token", "1test"]), - ("hello...test", ["hello", ".", ".", ".", "test"]), - ] - - def new_tokenizer(nlp): - return Tokenizer(nlp.vocab, {}, infix_finditer=infix_re.finditer) - - nlp = English() - nlp.tokenizer = new_tokenizer(nlp) - for text, expected in test_cases: - assert [token.text for token in nlp(text)] == expected diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py deleted file mode 100644 index 07f173843..000000000 --- a/spacy/tests/regression/test_issue1501-2000.py +++ /dev/null @@ -1,375 +0,0 @@ -import pytest -import gc -import numpy -import copy - -from spacy.training import Example -from spacy.lang.en import English -from spacy.lang.en.stop_words import STOP_WORDS -from spacy.lang.lex_attrs import is_stop -from spacy.vectors import Vectors -from spacy.vocab import Vocab -from spacy.language import Language -from spacy.tokens import Doc, Span, Token -from spacy.attrs import HEAD, DEP -from spacy.matcher import Matcher - -from ..util import make_tempdir - - -@pytest.mark.issue(1506) -def test_issue1506(): - def string_generator(): - for _ in range(10001): - yield "It's sentence produced by that bug." - for _ in range(10001): - yield "I erase some hbdsaj lemmas." - for _ in range(10001): - yield "I erase lemmas." - for _ in range(10001): - yield "It's sentence produced by that bug." - for _ in range(10001): - yield "It's sentence produced by that bug." - - nlp = English() - for i, d in enumerate(nlp.pipe(string_generator())): - # We should run cleanup more than one time to actually cleanup data. - # In first run — clean up only mark strings as «not hitted». 
- if i == 10000 or i == 20000 or i == 30000: - gc.collect() - for t in d: - str(t.lemma_) - - -@pytest.mark.issue(1518) -def test_issue1518(): - """Test vectors.resize() works.""" - vectors = Vectors(shape=(10, 10)) - vectors.add("hello", row=2) - vectors.resize((5, 9)) - - -@pytest.mark.issue(1537) -def test_issue1537(): - """Test that Span.as_doc() doesn't segfault.""" - string = "The sky is blue . The man is pink . The dog is purple ." - doc = Doc(Vocab(), words=string.split()) - doc[0].sent_start = True - for word in doc[1:]: - if word.nbor(-1).text == ".": - word.sent_start = True - else: - word.sent_start = False - sents = list(doc.sents) - sent0 = sents[0].as_doc() - sent1 = sents[1].as_doc() - assert isinstance(sent0, Doc) - assert isinstance(sent1, Doc) - - -# TODO: Currently segfaulting, due to l_edge and r_edge misalignment -@pytest.mark.issue(1537) -# def test_issue1537_model(): -# nlp = load_spacy('en') -# doc = nlp('The sky is blue. The man is pink. The dog is purple.') -# sents = [s.as_doc() for s in doc.sents] -# print(list(sents[0].noun_chunks)) -# print(list(sents[1].noun_chunks)) - - -@pytest.mark.issue(1539) -def test_issue1539(): - """Ensure vectors.resize() doesn't try to modify dictionary during iteration.""" - v = Vectors(shape=(10, 10), keys=[5, 3, 98, 100]) - v.resize((100, 100)) - - -@pytest.mark.issue(1547) -def test_issue1547(): - """Test that entity labels still match after merging tokens.""" - words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"] - doc = Doc(Vocab(), words=words) - doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings["PRODUCT"])] - with doc.retokenize() as retokenizer: - retokenizer.merge(doc[5:7]) - assert [ent.text for ent in doc.ents] - - -@pytest.mark.issue(1612) -def test_issue1612(en_tokenizer): - doc = en_tokenizer("The black cat purrs.") - span = doc[1:3] - assert span.orth_ == span.text - - -@pytest.mark.issue(1654) -def test_issue1654(): - nlp = Language(Vocab()) - assert not 
nlp.pipeline - - @Language.component("component") - def component(doc): - return doc - - nlp.add_pipe("component", name="1") - nlp.add_pipe("component", name="2", after="1") - nlp.add_pipe("component", name="3", after="2") - assert nlp.pipe_names == ["1", "2", "3"] - nlp2 = Language(Vocab()) - assert not nlp2.pipeline - nlp2.add_pipe("component", name="3") - nlp2.add_pipe("component", name="2", before="3") - nlp2.add_pipe("component", name="1", before="2") - assert nlp2.pipe_names == ["1", "2", "3"] - - -@pytest.mark.parametrize("text", ["test@example.com", "john.doe@example.co.uk"]) -@pytest.mark.issue(1698) -def test_issue1698(en_tokenizer, text): - doc = en_tokenizer(text) - assert len(doc) == 1 - assert not doc[0].like_url - - -@pytest.mark.issue(1727) -def test_issue1727(): - """Test that models with no pretrained vectors can be deserialized - correctly after vectors are added.""" - nlp = Language(Vocab()) - data = numpy.ones((3, 300), dtype="f") - vectors = Vectors(data=data, keys=["I", "am", "Matt"]) - tagger = nlp.create_pipe("tagger") - tagger.add_label("PRP") - assert tagger.cfg.get("pretrained_dims", 0) == 0 - tagger.vocab.vectors = vectors - with make_tempdir() as path: - tagger.to_disk(path) - tagger = nlp.create_pipe("tagger").from_disk(path) - assert tagger.cfg.get("pretrained_dims", 0) == 0 - - -@pytest.mark.issue(1757) -def test_issue1757(): - """Test comparison against None doesn't cause segfault.""" - doc = Doc(Vocab(), words=["a", "b", "c"]) - assert not doc[0] < None - assert not doc[0] is None - assert doc[0] >= None - assert not doc[:2] < None - assert not doc[:2] is None - assert doc[:2] >= None - assert not doc.vocab["a"] is None - assert not doc.vocab["a"] < None - - -@pytest.mark.issue(1758) -def test_issue1758(en_tokenizer): - """Test that "would've" is handled by the English tokenizer exceptions.""" - tokens = en_tokenizer("would've") - assert len(tokens) == 2 - - -@pytest.mark.issue(1773) -def test_issue1773(en_tokenizer): - """Test 
that spaces don't receive a POS but no TAG. This is the root cause - of the serialization issue reported in #1773.""" - doc = en_tokenizer("\n") - if doc[0].pos_ == "SPACE": - assert doc[0].tag_ != "" - - -@pytest.mark.issue(1799) -def test_issue1799(): - """Test sentence boundaries are deserialized correctly, even for - non-projective sentences.""" - heads_deps = numpy.asarray( - [ - [1, 397], - [4, 436], - [2, 426], - [1, 402], - [0, 8206900633647566924], - [18446744073709551615, 440], - [18446744073709551614, 442], - ], - dtype="uint64", - ) - doc = Doc(Vocab(), words="Just what I was looking for .".split()) - doc.vocab.strings.add("ROOT") - doc = doc.from_array([HEAD, DEP], heads_deps) - assert len(list(doc.sents)) == 1 - - -@pytest.mark.issue(1807) -def test_issue1807(): - """Test vocab.set_vector also adds the word to the vocab.""" - vocab = Vocab(vectors_name="test_issue1807") - assert "hello" not in vocab - vocab.set_vector("hello", numpy.ones((50,), dtype="f")) - assert "hello" in vocab - - -@pytest.mark.issue(1834) -def test_issue1834(): - """Test that sentence boundaries & parse/tag flags are not lost - during serialization.""" - words = ["This", "is", "a", "first", "sentence", ".", "And", "another", "one"] - doc = Doc(Vocab(), words=words) - doc[6].is_sent_start = True - new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes()) - assert new_doc[6].sent_start - assert not new_doc.has_annotation("DEP") - assert not new_doc.has_annotation("TAG") - doc = Doc( - Vocab(), - words=words, - tags=["TAG"] * len(words), - heads=[0, 0, 0, 0, 0, 0, 6, 6, 6], - deps=["dep"] * len(words), - ) - new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes()) - assert new_doc[6].sent_start - assert new_doc.has_annotation("DEP") - assert new_doc.has_annotation("TAG") - - -@pytest.mark.issue(1868) -def test_issue1868(): - """Test Vocab.__contains__ works with int keys.""" - vocab = Vocab() - lex = vocab["hello"] - assert lex.orth in vocab - assert lex.orth_ in vocab - assert "some string" 
not in vocab - int_id = vocab.strings.add("some string") - assert int_id not in vocab - - -@pytest.mark.issue(1883) -def test_issue1883(): - matcher = Matcher(Vocab()) - matcher.add("pat1", [[{"orth": "hello"}]]) - doc = Doc(matcher.vocab, words=["hello"]) - assert len(matcher(doc)) == 1 - new_matcher = copy.deepcopy(matcher) - new_doc = Doc(new_matcher.vocab, words=["hello"]) - assert len(new_matcher(new_doc)) == 1 - - -@pytest.mark.parametrize("word", ["the"]) -@pytest.mark.issue(1889) -def test_issue1889(word): - assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS) - - -@pytest.mark.skip(reason="obsolete with the config refactor of v.3") -@pytest.mark.issue(1915) -def test_issue1915(): - cfg = {"hidden_depth": 2} # should error out - nlp = Language() - ner = nlp.add_pipe("ner") - ner.add_label("answer") - with pytest.raises(ValueError): - nlp.initialize(**cfg) - - -@pytest.mark.issue(1945) -def test_issue1945(): - """Test regression in Matcher introduced in v2.0.6.""" - matcher = Matcher(Vocab()) - matcher.add("MWE", [[{"orth": "a"}, {"orth": "a"}]]) - doc = Doc(matcher.vocab, words=["a", "a", "a"]) - matches = matcher(doc) # we should see two overlapping matches here - assert len(matches) == 2 - assert matches[0][1:] == (0, 2) - assert matches[1][1:] == (1, 3) - - -@pytest.mark.issue(1963) -def test_issue1963(en_tokenizer): - """Test that doc.merge() resizes doc.tensor""" - doc = en_tokenizer("a b c d") - doc.tensor = numpy.ones((len(doc), 128), dtype="f") - with doc.retokenize() as retokenizer: - retokenizer.merge(doc[0:2]) - assert len(doc) == 3 - assert doc.tensor.shape == (3, 128) - - -@pytest.mark.parametrize("label", ["U-JOB-NAME"]) -@pytest.mark.issue(1967) -def test_issue1967(label): - nlp = Language() - config = {} - ner = nlp.create_pipe("ner", config=config) - example = Example.from_dict( - Doc(ner.vocab, words=["word"]), - { - "ids": [0], - "words": ["word"], - "tags": ["tag"], - "heads": [0], - "deps": ["dep"], - "entities": 
[label], - }, - ) - assert "JOB-NAME" in ner.moves.get_actions(examples=[example])[1] - - -@pytest.mark.issue(1971) -def test_issue1971(en_vocab): - # Possibly related to #2675 and #2671? - matcher = Matcher(en_vocab) - pattern = [ - {"ORTH": "Doe"}, - {"ORTH": "!", "OP": "?"}, - {"_": {"optional": True}, "OP": "?"}, - {"ORTH": "!", "OP": "?"}, - ] - Token.set_extension("optional", default=False) - matcher.add("TEST", [pattern]) - doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"]) - # We could also assert length 1 here, but this is more conclusive, because - # the real problem here is that it returns a duplicate match for a match_id - # that's not actually in the vocab! - matches = matcher(doc) - assert all([match_id in en_vocab.strings for match_id, start, end in matches]) - - -def test_issue_1971_2(en_vocab): - matcher = Matcher(en_vocab) - pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}] - pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}] # {"IN": ["EUR"]}}] - doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"]) - matcher.add("TEST1", [pattern1, pattern2]) - matches = matcher(doc) - assert len(matches) == 2 - - -def test_issue_1971_3(en_vocab): - """Test that pattern matches correctly for multiple extension attributes.""" - Token.set_extension("a", default=1, force=True) - Token.set_extension("b", default=2, force=True) - doc = Doc(en_vocab, words=["hello", "world"]) - matcher = Matcher(en_vocab) - matcher.add("A", [[{"_": {"a": 1}}]]) - matcher.add("B", [[{"_": {"b": 2}}]]) - matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc)) - assert len(matches) == 4 - assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)]) - - -def test_issue_1971_4(en_vocab): - """Test that pattern matches correctly with multiple extension attribute - values on a single token. 
- """ - Token.set_extension("ext_a", default="str_a", force=True) - Token.set_extension("ext_b", default="str_b", force=True) - matcher = Matcher(en_vocab) - doc = Doc(en_vocab, words=["this", "is", "text"]) - pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3 - matcher.add("TEST", [pattern]) - matches = matcher(doc) - # Uncommenting this caused a segmentation fault - assert len(matches) == 1 - assert matches[0] == (en_vocab.strings["TEST"], 0, 3) diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py deleted file mode 100644 index a07360c2c..000000000 --- a/spacy/tests/regression/test_issue2001-2500.py +++ /dev/null @@ -1,152 +0,0 @@ -import pytest -import numpy -from spacy.tokens import Doc -from spacy.matcher import Matcher -from spacy.displacy import render -from spacy.training import iob_to_biluo -from spacy.lang.it import Italian -from spacy.lang.en import English - -from ..util import add_vecs_to_vocab - - -@pytest.mark.skip( - reason="Can not be fixed without iterative looping between prefix/suffix and infix" -) -@pytest.mark.issue(2070) -def test_issue2070(): - """Test that checks that a dot followed by a quote is handled - appropriately. - """ - # Problem: The dot is now properly split off, but the prefix/suffix rules - # are not applied again afterwards. This means that the quote will still be - # attached to the remaining token. 
- nlp = English() - doc = nlp('First sentence."A quoted sentence" he said ...') - assert len(doc) == 11 - - -@pytest.mark.issue(2179) -def test_issue2179(): - """Test that spurious 'extra_labels' aren't created when initializing NER.""" - nlp = Italian() - ner = nlp.add_pipe("ner") - ner.add_label("CITIZENSHIP") - nlp.initialize() - nlp2 = Italian() - nlp2.add_pipe("ner") - assert len(nlp2.get_pipe("ner").labels) == 0 - model = nlp2.get_pipe("ner").model - model.attrs["resize_output"](model, nlp.get_pipe("ner").moves.n_moves) - nlp2.from_bytes(nlp.to_bytes()) - assert "extra_labels" not in nlp2.get_pipe("ner").cfg - assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP",) - - -@pytest.mark.issue(2203) -def test_issue2203(en_vocab): - """Test that lemmas are set correctly in doc.from_array.""" - words = ["I", "'ll", "survive"] - tags = ["PRP", "MD", "VB"] - lemmas = ["-PRON-", "will", "survive"] - tag_ids = [en_vocab.strings.add(tag) for tag in tags] - lemma_ids = [en_vocab.strings.add(lemma) for lemma in lemmas] - doc = Doc(en_vocab, words=words) - # Work around lemma corruption problem and set lemmas after tags - doc.from_array("TAG", numpy.array(tag_ids, dtype="uint64")) - doc.from_array("LEMMA", numpy.array(lemma_ids, dtype="uint64")) - assert [t.tag_ for t in doc] == tags - assert [t.lemma_ for t in doc] == lemmas - # We need to serialize both tag and lemma, since this is what causes the bug - doc_array = doc.to_array(["TAG", "LEMMA"]) - new_doc = Doc(doc.vocab, words=words).from_array(["TAG", "LEMMA"], doc_array) - assert [t.tag_ for t in new_doc] == tags - assert [t.lemma_ for t in new_doc] == lemmas - - -@pytest.mark.issue(2219) -def test_issue2219(en_vocab): - vectors = [("a", [1, 2, 3]), ("letter", [4, 5, 6])] - add_vecs_to_vocab(en_vocab, vectors) - [(word1, vec1), (word2, vec2)] = vectors - doc = Doc(en_vocab, words=[word1, word2]) - assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0]) - - -@pytest.mark.issue(2361) -def test_issue2361(de_vocab): - 
chars = ("<", ">", "&", """) - words = ["<", ">", "&", '"'] - doc = Doc(de_vocab, words=words, deps=["dep"] * len(words)) - html = render(doc) - for char in chars: - assert char in html - - -@pytest.mark.issue(2385) -def test_issue2385(): - """Test that IOB tags are correctly converted to BILUO tags.""" - # fix bug in labels with a 'b' character - tags1 = ("B-BRAWLER", "I-BRAWLER", "I-BRAWLER") - assert iob_to_biluo(tags1) == ["B-BRAWLER", "I-BRAWLER", "L-BRAWLER"] - # maintain support for iob1 format - tags2 = ("I-ORG", "I-ORG", "B-ORG") - assert iob_to_biluo(tags2) == ["B-ORG", "L-ORG", "U-ORG"] - # maintain support for iob2 format - tags3 = ("B-PERSON", "I-PERSON", "B-PERSON") - assert iob_to_biluo(tags3) == ["B-PERSON", "L-PERSON", "U-PERSON"] - - -@pytest.mark.parametrize( - "tags", - [ - ("B-ORG", "L-ORG"), - ("B-PERSON", "I-PERSON", "L-PERSON"), - ("U-BRAWLER", "U-BRAWLER"), - ], -) -@pytest.mark.issue(2385) -def test_issue2385_biluo(tags): - """Test that BILUO-compatible tags aren't modified.""" - assert iob_to_biluo(tags) == list(tags) - - -@pytest.mark.issue(2396) -def test_issue2396(en_vocab): - words = ["She", "created", "a", "test", "for", "spacy"] - heads = [1, 1, 3, 1, 3, 4] - deps = ["dep"] * len(heads) - matrix = numpy.array( - [ - [0, 1, 1, 1, 1, 1], - [1, 1, 1, 1, 1, 1], - [1, 1, 2, 3, 3, 3], - [1, 1, 3, 3, 3, 3], - [1, 1, 3, 3, 4, 4], - [1, 1, 3, 3, 4, 5], - ], - dtype=numpy.int32, - ) - doc = Doc(en_vocab, words=words, heads=heads, deps=deps) - span = doc[:] - assert (doc.get_lca_matrix() == matrix).all() - assert (span.get_lca_matrix() == matrix).all() - - -@pytest.mark.issue(2464) -def test_issue2464(en_vocab): - """Test problem with successive ?. 
This is the same bug, so putting it here.""" - matcher = Matcher(en_vocab) - doc = Doc(en_vocab, words=["a", "b"]) - matcher.add("4", [[{"OP": "?"}, {"OP": "?"}]]) - matches = matcher(doc) - assert len(matches) == 3 - - -@pytest.mark.issue(2482) -def test_issue2482(): - """Test we can serialize and deserialize a blank NER or parser model.""" - nlp = Italian() - nlp.add_pipe("ner") - b = nlp.to_bytes() - Italian().from_bytes(b) diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py deleted file mode 100644 index cbb7f0621..000000000 --- a/spacy/tests/regression/test_issue2501-3000.py +++ /dev/null @@ -1,238 +0,0 @@ -import pytest -from spacy import displacy -from spacy.training import Example -from spacy.lang.en import English -from spacy.lang.ja import Japanese -from spacy.lang.xx import MultiLanguage -from spacy.language import Language -from spacy.matcher import Matcher -from spacy.tokens import Doc, Span -from spacy.vocab import Vocab -from spacy.compat import pickle -import numpy -import random - - -@pytest.mark.issue(2564) -def test_issue2564(): - """Test the tagger sets has_annotation("TAG") correctly when used via Language.pipe.""" - nlp = Language() - tagger = nlp.add_pipe("tagger") - tagger.add_label("A") - nlp.initialize() - doc = nlp("hello world") - assert doc.has_annotation("TAG") - docs = nlp.pipe(["hello", "world"]) - piped_doc = next(docs) - assert piped_doc.has_annotation("TAG") - - -@pytest.mark.issue(2569) -def test_issue2569(en_tokenizer): - """Test that operator + is greedy.""" - doc = en_tokenizer("It is May 15, 1993.") - doc.ents = [Span(doc, 2, 6, label=doc.vocab.strings["DATE"])] - matcher = Matcher(doc.vocab) - matcher.add("RULE", [[{"ENT_TYPE": "DATE", "OP": "+"}]]) - matched = [doc[start:end] for _, start, end in matcher(doc)] - matched = sorted(matched, key=len, reverse=True) - assert len(matched) == 10 - assert len(matched[0]) == 4 - assert matched[0].text == "May 15, 1993" - - 
-@pytest.mark.parametrize( - "text", - [ - "ABLEItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume TABLE ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume", - "oow.jspsearch.eventoracleopenworldsearch.technologyoraclesolarissearch.technologystoragesearch.technologylinuxsearch.technologyserverssearch.technologyvirtualizationsearch.technologyengineeredsystemspcodewwmkmppscem:", - ], -) -@pytest.mark.issue(2626) -def test_issue2626_2835(en_tokenizer, text): - """Check that sentence doesn't cause an infinite loop in the tokenizer.""" - doc = en_tokenizer(text) - assert doc - - -@pytest.mark.issue(2656) -def test_issue2656(en_tokenizer): - 
"""Test that tokenizer correctly splits off punctuation after numbers with - decimal points. - """ - doc = en_tokenizer("I went for 40.3, and got home by 10.0.") - assert len(doc) == 11 - assert doc[0].text == "I" - assert doc[1].text == "went" - assert doc[2].text == "for" - assert doc[3].text == "40.3" - assert doc[4].text == "," - assert doc[5].text == "and" - assert doc[6].text == "got" - assert doc[7].text == "home" - assert doc[8].text == "by" - assert doc[9].text == "10.0" - assert doc[10].text == "." - - -@pytest.mark.issue(2671) -def test_issue2671(): - """Ensure the correct entity ID is returned for matches with quantifiers. - See also #2675 - """ - nlp = English() - matcher = Matcher(nlp.vocab) - pattern_id = "test_pattern" - pattern = [ - {"LOWER": "high"}, - {"IS_PUNCT": True, "OP": "?"}, - {"LOWER": "adrenaline"}, - ] - matcher.add(pattern_id, [pattern]) - doc1 = nlp("This is a high-adrenaline situation.") - doc2 = nlp("This is a high adrenaline situation.") - matches1 = matcher(doc1) - for match_id, start, end in matches1: - assert nlp.vocab.strings[match_id] == pattern_id - matches2 = matcher(doc2) - for match_id, start, end in matches2: - assert nlp.vocab.strings[match_id] == pattern_id - - -@pytest.mark.issue(2728) -def test_issue2728(en_vocab): - """Test that displaCy ENT visualizer escapes HTML correctly.""" - doc = Doc(en_vocab, words=["test", "", "test"]) - doc.ents = [Span(doc, 0, 1, label="TEST")] - html = displacy.render(doc, style="ent") - assert "<RELEASE>" in html - doc.ents = [Span(doc, 1, 2, label="TEST")] - html = displacy.render(doc, style="ent") - assert "<RELEASE>" in html - - -@pytest.mark.issue(2754) -def test_issue2754(en_tokenizer): - """Test that words like 'a' and 'a.m.' 
don't get exceptional norm values.""" - a = en_tokenizer("a") - assert a[0].norm_ == "a" - am = en_tokenizer("am") - assert am[0].norm_ == "am" - - -@pytest.mark.issue(2772) -def test_issue2772(en_vocab): - """Test that deprojectivization doesn't mess up sentence boundaries.""" - # fmt: off - words = ["When", "we", "write", "or", "communicate", "virtually", ",", "we", "can", "hide", "our", "true", "feelings", "."] - # fmt: on - # A tree with a non-projective (i.e. crossing) arc - # The arcs (0, 4) and (2, 9) cross. - heads = [4, 2, 9, 2, 2, 4, 9, 9, 9, 9, 12, 12, 9, 9] - deps = ["dep"] * len(heads) - doc = Doc(en_vocab, words=words, heads=heads, deps=deps) - assert doc[1].is_sent_start is False - - -@pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"]) -@pytest.mark.parametrize("lang_cls", [English, MultiLanguage]) -@pytest.mark.issue(2782) -def test_issue2782(text, lang_cls): - """Check that like_num handles + and - before number.""" - nlp = lang_cls() - doc = nlp(text) - assert len(doc) == 1 - assert doc[0].like_num - - -@pytest.mark.issue(2800) -def test_issue2800(): - """Test issue that arises when too many labels are added to NER model. - Used to cause segfault. 
- """ - nlp = English() - train_data = [] - train_data.extend( - [Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})] - ) - entity_types = [str(i) for i in range(1000)] - ner = nlp.add_pipe("ner") - for entity_type in list(entity_types): - ner.add_label(entity_type) - optimizer = nlp.initialize() - for i in range(20): - losses = {} - random.shuffle(train_data) - for example in train_data: - nlp.update([example], sgd=optimizer, losses=losses, drop=0.5) - - -@pytest.mark.issue(2822) -def test_issue2822(it_tokenizer): - """Test that the abbreviation of poco is kept as one word.""" - doc = it_tokenizer("Vuoi un po' di zucchero?") - assert len(doc) == 6 - assert doc[0].text == "Vuoi" - assert doc[1].text == "un" - assert doc[2].text == "po'" - assert doc[3].text == "di" - assert doc[4].text == "zucchero" - assert doc[5].text == "?" - - -@pytest.mark.issue(2833) -def test_issue2833(en_vocab): - """Test that a custom error is raised if a token or span is pickled.""" - doc = Doc(en_vocab, words=["Hello", "world"]) - with pytest.raises(NotImplementedError): - pickle.dumps(doc[0]) - with pytest.raises(NotImplementedError): - pickle.dumps(doc[0:2]) - - -@pytest.mark.issue(2871) -def test_issue2871(): - """Test that vectors recover the correct key for spaCy reserved words.""" - words = ["dog", "cat", "SUFFIX"] - vocab = Vocab(vectors_name="test_issue2871") - vocab.vectors.resize(shape=(3, 10)) - vector_data = numpy.zeros((3, 10), dtype="f") - for word in words: - _ = vocab[word] # noqa: F841 - vocab.set_vector(word, vector_data[0]) - vocab.vectors.name = "dummy_vectors" - assert vocab["dog"].rank == 0 - assert vocab["cat"].rank == 1 - assert vocab["SUFFIX"].rank == 2 - assert vocab.vectors.find(key="dog") == 0 - assert vocab.vectors.find(key="cat") == 1 - assert vocab.vectors.find(key="SUFFIX") == 2 - - -@pytest.mark.issue(2901) -def test_issue2901(): - """Test that `nlp` doesn't fail.""" - try: - nlp = Japanese() - except ImportError: - pytest.skip() - - doc = 
nlp("pythonが大好きです") - assert doc - - -@pytest.mark.issue(2926) -def test_issue2926(fr_tokenizer): - """Test that the tokenizer correctly splits tokens separated by a slash (/) - ending in a digit. - """ - doc = fr_tokenizer("Learn html5/css3/javascript/jquery") - assert len(doc) == 8 - assert doc[0].text == "Learn" - assert doc[1].text == "html5" - assert doc[2].text == "/" - assert doc[3].text == "css3" - assert doc[4].text == "/" - assert doc[5].text == "javascript" - assert doc[6].text == "/" - assert doc[7].text == "jquery" diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py deleted file mode 100644 index 6220003dc..000000000 --- a/spacy/tests/regression/test_issue3001-3500.py +++ /dev/null @@ -1,272 +0,0 @@ -import pytest -from spacy import registry -from spacy.lang.en import English -from spacy.lang.de import German -from spacy.pipeline.ner import DEFAULT_NER_MODEL -from spacy.pipeline import EntityRuler, EntityRecognizer -from spacy.matcher import Matcher, PhraseMatcher -from spacy.tokens import Doc -from spacy.vocab import Vocab -from spacy.attrs import ENT_IOB, ENT_TYPE -from spacy.compat import pickle -from spacy import displacy -from spacy.vectors import Vectors -import numpy - - -@pytest.mark.issue(3002) -def test_issue3002(): - """Test that the tokenizer doesn't hang on a long list of dots""" - nlp = German() - doc = nlp( - "880.794.982.218.444.893.023.439.794.626.120.190.780.624.990.275.671 ist eine lange Zahl" - ) - assert len(doc) == 5 - - -@pytest.mark.issue(3009) -def test_issue3009(en_vocab): - """Test problem with matcher quantifiers""" - patterns = [ - [{"ORTH": "has"}, {"LOWER": "to"}, {"LOWER": "do"}, {"TAG": "IN"}], - [ - {"ORTH": "has"}, - {"IS_ASCII": True, "IS_PUNCT": False, "OP": "*"}, - {"LOWER": "to"}, - {"LOWER": "do"}, - {"TAG": "IN"}, - ], - [ - {"ORTH": "has"}, - {"IS_ASCII": True, "IS_PUNCT": False, "OP": "?"}, - {"LOWER": "to"}, - {"LOWER": "do"}, - {"TAG": "IN"}, - ], - ] - 
words = ["also", "has", "to", "do", "with"] - tags = ["RB", "VBZ", "TO", "VB", "IN"] - pos = ["ADV", "VERB", "ADP", "VERB", "ADP"] - doc = Doc(en_vocab, words=words, tags=tags, pos=pos) - matcher = Matcher(en_vocab) - for i, pattern in enumerate(patterns): - matcher.add(str(i), [pattern]) - matches = matcher(doc) - assert matches - - -@pytest.mark.issue(3012) -def test_issue3012(en_vocab): - """Test that the is_tagged attribute doesn't get overwritten when we from_array - without tag information.""" - words = ["This", "is", "10", "%", "."] - tags = ["DT", "VBZ", "CD", "NN", "."] - pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"] - ents = ["O", "O", "B-PERCENT", "I-PERCENT", "O"] - doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents) - assert doc.has_annotation("TAG") - expected = ("10", "NUM", "CD", "PERCENT") - assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected - header = [ENT_IOB, ENT_TYPE] - ent_array = doc.to_array(header) - doc.from_array(header, ent_array) - assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected - # Serializing then deserializing - doc_bytes = doc.to_bytes() - doc2 = Doc(en_vocab).from_bytes(doc_bytes) - assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_, doc2[2].ent_type_) == expected - - -@pytest.mark.issue(3199) -def test_issue3199(): - """Test that Span.noun_chunks works correctly if no noun chunks iterator - is available. To make this test future-proof, we're constructing a Doc - with a new Vocab here and a parse tree to make sure the noun chunks run. - """ - words = ["This", "is", "a", "sentence"] - doc = Doc(Vocab(), words=words, heads=[0] * len(words), deps=["dep"] * len(words)) - with pytest.raises(NotImplementedError): - list(doc[0:3].noun_chunks) - - -@pytest.mark.issue(3209) -def test_issue3209(): - """Test issue that occurred in spaCy nightly where NER labels were being - mapped to classes incorrectly after loading the model, when the labels - were added using ner.add_label(). 
- """ - nlp = English() - ner = nlp.add_pipe("ner") - ner.add_label("ANIMAL") - nlp.initialize() - move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"] - assert ner.move_names == move_names - nlp2 = English() - ner2 = nlp2.add_pipe("ner") - model = ner2.model - model.attrs["resize_output"](model, ner.moves.n_moves) - nlp2.from_bytes(nlp.to_bytes()) - assert ner2.move_names == move_names - - -@pytest.mark.issue(3248) -def test_issue3248_1(): - """Test that the PhraseMatcher correctly reports its number of rules, not - total number of patterns.""" - nlp = English() - matcher = PhraseMatcher(nlp.vocab) - matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")]) - matcher.add("TEST2", [nlp("d")]) - assert len(matcher) == 2 - - -@pytest.mark.issue(3248) -def test_issue3248_2(): - """Test that the PhraseMatcher can be pickled correctly.""" - nlp = English() - matcher = PhraseMatcher(nlp.vocab) - matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")]) - matcher.add("TEST2", [nlp("d")]) - data = pickle.dumps(matcher) - new_matcher = pickle.loads(data) - assert len(new_matcher) == len(matcher) - - -@pytest.mark.issue(3277) -def test_issue3277(es_tokenizer): - """Test that hyphens are split correctly as prefixes.""" - doc = es_tokenizer("—Yo me llamo... 
–murmuró el niño– Emilio Sánchez Pérez.") - assert len(doc) == 14 - assert doc[0].text == "\u2014" - assert doc[5].text == "\u2013" - assert doc[9].text == "\u2013" - - -@pytest.mark.issue(3288) -def test_issue3288(en_vocab): - """Test that retokenization works correctly via displaCy when punctuation - is merged onto the preceeding token and tensor is resized.""" - words = ["Hello", "World", "!", "When", "is", "this", "breaking", "?"] - heads = [1, 1, 1, 4, 4, 6, 4, 4] - deps = ["intj", "ROOT", "punct", "advmod", "ROOT", "det", "nsubj", "punct"] - doc = Doc(en_vocab, words=words, heads=heads, deps=deps) - doc.tensor = numpy.zeros((len(words), 96), dtype="float32") - displacy.render(doc) - - -@pytest.mark.issue(3289) -def test_issue3289(): - """Test that Language.to_bytes handles serializing a pipeline component - with an uninitialized model.""" - nlp = English() - nlp.add_pipe("textcat") - bytes_data = nlp.to_bytes() - new_nlp = English() - new_nlp.add_pipe("textcat") - new_nlp.from_bytes(bytes_data) - - -@pytest.mark.issue(3328) -def test_issue3328(en_vocab): - doc = Doc(en_vocab, words=["Hello", ",", "how", "are", "you", "doing", "?"]) - matcher = Matcher(en_vocab) - patterns = [ - [{"LOWER": {"IN": ["hello", "how"]}}], - [{"LOWER": {"IN": ["you", "doing"]}}], - ] - matcher.add("TEST", patterns) - matches = matcher(doc) - assert len(matches) == 4 - matched_texts = [doc[start:end].text for _, start, end in matches] - assert matched_texts == ["Hello", "how", "you", "doing"] - - -@pytest.mark.issue(3331) -def test_issue3331(en_vocab): - """Test that duplicate patterns for different rules result in multiple - matches, one per rule. 
- """ - matcher = PhraseMatcher(en_vocab) - matcher.add("A", [Doc(en_vocab, words=["Barack", "Obama"])]) - matcher.add("B", [Doc(en_vocab, words=["Barack", "Obama"])]) - doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"]) - matches = matcher(doc) - assert len(matches) == 2 - match_ids = [en_vocab.strings[matches[0][0]], en_vocab.strings[matches[1][0]]] - assert sorted(match_ids) == ["A", "B"] - - -@pytest.mark.issue(3345) -def test_issue3345(): - """Test case where preset entity crosses sentence boundary.""" - nlp = English() - doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"]) - doc[4].is_sent_start = True - ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}]) - cfg = {"model": DEFAULT_NER_MODEL} - model = registry.resolve(cfg, validate=True)["model"] - ner = EntityRecognizer(doc.vocab, model) - # Add the OUT action. I wouldn't have thought this would be necessary... - ner.moves.add_action(5, "") - ner.add_label("GPE") - doc = ruler(doc) - # Get into the state just before "New" - state = ner.moves.init_batch([doc])[0] - ner.moves.apply_transition(state, "O") - ner.moves.apply_transition(state, "O") - ner.moves.apply_transition(state, "O") - # Check that B-GPE is valid. - assert ner.moves.is_valid(state, "B-GPE") - - -@pytest.mark.issue(3412) -def test_issue3412(): - data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f") - vectors = Vectors(data=data, keys=["A", "B", "C"]) - keys, best_rows, scores = vectors.most_similar( - numpy.asarray([[9, 8, 7], [0, 0, 0]], dtype="f") - ) - assert best_rows[0] == 2 - - -@pytest.mark.skip(reason="default suffix rules avoid one upper-case letter before dot") -@pytest.mark.issue(3449) -def test_issue3449(): - nlp = English() - nlp.add_pipe("sentencizer") - text1 = "He gave the ball to I. Do you want to go to the movies with I?" - text2 = "He gave the ball to I. Do you want to go to the movies with I?" 
- text3 = "He gave the ball to I.\nDo you want to go to the movies with I?" - t1 = nlp(text1) - t2 = nlp(text2) - t3 = nlp(text3) - assert t1[5].text == "I" - assert t2[5].text == "I" - assert t3[5].text == "I" - - -@pytest.mark.issue(3456) -def test_issue3456(): - # this crashed because of a padding error in layer.ops.unflatten in thinc - nlp = English() - tagger = nlp.add_pipe("tagger") - tagger.add_label("A") - nlp.initialize() - list(nlp.pipe(["hi", ""])) - - -@pytest.mark.issue(3468) -def test_issue3468(): - """Test that sentence boundaries are set correctly so Doc.has_annotation("SENT_START") can - be restored after serialization.""" - nlp = English() - nlp.add_pipe("sentencizer") - doc = nlp("Hello world") - assert doc[0].is_sent_start - assert doc.has_annotation("SENT_START") - assert len(list(doc.sents)) == 1 - doc_bytes = doc.to_bytes() - new_doc = Doc(nlp.vocab).from_bytes(doc_bytes) - assert new_doc[0].is_sent_start - assert new_doc.has_annotation("SENT_START") - assert len(list(new_doc.sents)) == 1 diff --git a/spacy/tests/regression/test_issue3501-4000.py b/spacy/tests/regression/test_issue3501-4000.py deleted file mode 100644 index 5d9bc4e83..000000000 --- a/spacy/tests/regression/test_issue3501-4000.py +++ /dev/null @@ -1,492 +0,0 @@ -import pytest -from spacy.language import Language -from spacy.vocab import Vocab -from spacy.pipeline import EntityRuler, DependencyParser -from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL -from spacy import displacy, load -from spacy.displacy import parse_deps -from spacy.tokens import Doc, Token -from spacy.matcher import Matcher, PhraseMatcher -from spacy.errors import MatchPatternError -from spacy.util import minibatch -from spacy.training import Example -from spacy.lang.hi import Hindi -from spacy.lang.es import Spanish -from spacy.lang.en import English -from spacy.attrs import IS_ALPHA -from spacy import registry -from thinc.api import compounding -import spacy -import srsly -import numpy - -from 
..util import make_tempdir - - -@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"]) -@pytest.mark.issue(3521) -def test_issue3521(en_tokenizer, word): - tok = en_tokenizer(word)[1] - # 'not' and 'would' should be stopwords, also in their abbreviated forms - assert tok.is_stop - - -def test_issue_3526_1(en_vocab): - patterns = [ - {"label": "HELLO", "pattern": "hello world"}, - {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, - {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, - {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, - {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, - ] - nlp = Language(vocab=en_vocab) - ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) - ruler_bytes = ruler.to_bytes() - assert len(ruler) == len(patterns) - assert len(ruler.labels) == 4 - assert ruler.overwrite - new_ruler = EntityRuler(nlp) - new_ruler = new_ruler.from_bytes(ruler_bytes) - assert len(new_ruler) == len(ruler) - assert len(new_ruler.labels) == 4 - assert new_ruler.overwrite == ruler.overwrite - assert new_ruler.ent_id_sep == ruler.ent_id_sep - - -def test_issue_3526_2(en_vocab): - patterns = [ - {"label": "HELLO", "pattern": "hello world"}, - {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, - {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, - {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, - {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, - ] - nlp = Language(vocab=en_vocab) - ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) - bytes_old_style = srsly.msgpack_dumps(ruler.patterns) - new_ruler = EntityRuler(nlp) - new_ruler = new_ruler.from_bytes(bytes_old_style) - assert len(new_ruler) == len(ruler) - for pattern in ruler.patterns: - assert pattern in new_ruler.patterns - assert new_ruler.overwrite is not ruler.overwrite - - -def test_issue_3526_3(en_vocab): - patterns = [ - {"label": "HELLO", "pattern": "hello world"}, - {"label": "BYE", 
"pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, - {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, - {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, - {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, - ] - nlp = Language(vocab=en_vocab) - ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) - with make_tempdir() as tmpdir: - out_file = tmpdir / "entity_ruler" - srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns) - new_ruler = EntityRuler(nlp).from_disk(out_file) - for pattern in ruler.patterns: - assert pattern in new_ruler.patterns - assert len(new_ruler) == len(ruler) - assert new_ruler.overwrite is not ruler.overwrite - - -def test_issue_3526_4(en_vocab): - nlp = Language(vocab=en_vocab) - patterns = [{"label": "ORG", "pattern": "Apple"}] - config = {"overwrite_ents": True} - ruler = nlp.add_pipe("entity_ruler", config=config) - ruler.add_patterns(patterns) - with make_tempdir() as tmpdir: - nlp.to_disk(tmpdir) - ruler = nlp.get_pipe("entity_ruler") - assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] - assert ruler.overwrite is True - nlp2 = load(tmpdir) - new_ruler = nlp2.get_pipe("entity_ruler") - assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] - assert new_ruler.overwrite is True - - -@pytest.mark.issue(3531) -def test_issue3531(): - """Test that displaCy renderer doesn't require "settings" key.""" - example_dep = { - "words": [ - {"text": "But", "tag": "CCONJ"}, - {"text": "Google", "tag": "PROPN"}, - {"text": "is", "tag": "VERB"}, - {"text": "starting", "tag": "VERB"}, - {"text": "from", "tag": "ADP"}, - {"text": "behind.", "tag": "ADV"}, - ], - "arcs": [ - {"start": 0, "end": 3, "label": "cc", "dir": "left"}, - {"start": 1, "end": 3, "label": "nsubj", "dir": "left"}, - {"start": 2, "end": 3, "label": "aux", "dir": "left"}, - {"start": 3, "end": 4, "label": "prep", "dir": "right"}, - {"start": 4, "end": 5, "label": "pcomp", "dir": "right"}, - ], - } - example_ent = { - 
"text": "But Google is starting from behind.", - "ents": [{"start": 4, "end": 10, "label": "ORG"}], - } - dep_html = displacy.render(example_dep, style="dep", manual=True) - assert dep_html - ent_html = displacy.render(example_ent, style="ent", manual=True) - assert ent_html - - -@pytest.mark.issue(3540) -def test_issue3540(en_vocab): - words = ["I", "live", "in", "NewYork", "right", "now"] - tensor = numpy.asarray( - [[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]], - dtype="f", - ) - doc = Doc(en_vocab, words=words) - doc.tensor = tensor - gold_text = ["I", "live", "in", "NewYork", "right", "now"] - assert [token.text for token in doc] == gold_text - gold_lemma = ["I", "live", "in", "NewYork", "right", "now"] - for i, lemma in enumerate(gold_lemma): - doc[i].lemma_ = lemma - assert [token.lemma_ for token in doc] == gold_lemma - vectors_1 = [token.vector for token in doc] - assert len(vectors_1) == len(doc) - - with doc.retokenize() as retokenizer: - heads = [(doc[3], 1), doc[2]] - attrs = { - "POS": ["PROPN", "PROPN"], - "LEMMA": ["New", "York"], - "DEP": ["pobj", "compound"], - } - retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs) - - gold_text = ["I", "live", "in", "New", "York", "right", "now"] - assert [token.text for token in doc] == gold_text - gold_lemma = ["I", "live", "in", "New", "York", "right", "now"] - assert [token.lemma_ for token in doc] == gold_lemma - vectors_2 = [token.vector for token in doc] - assert len(vectors_2) == len(doc) - assert vectors_1[0].tolist() == vectors_2[0].tolist() - assert vectors_1[1].tolist() == vectors_2[1].tolist() - assert vectors_1[2].tolist() == vectors_2[2].tolist() - assert vectors_1[4].tolist() == vectors_2[5].tolist() - assert vectors_1[5].tolist() == vectors_2[6].tolist() - - -@pytest.mark.issue(3549) -def test_issue3549(en_vocab): - """Test that match pattern validation doesn't raise on empty errors.""" - matcher = Matcher(en_vocab, validate=True) - pattern = 
[{"LOWER": "hello"}, {"LOWER": "world"}] - matcher.add("GOOD", [pattern]) - with pytest.raises(MatchPatternError): - matcher.add("BAD", [[{"X": "Y"}]]) - - -@pytest.mark.skip("Matching currently only works on strings and integers") -@pytest.mark.issue(3555) -def test_issue3555(en_vocab): - """Test that custom extensions with default None don't break matcher.""" - Token.set_extension("issue3555", default=None) - matcher = Matcher(en_vocab) - pattern = [{"ORTH": "have"}, {"_": {"issue3555": True}}] - matcher.add("TEST", [pattern]) - doc = Doc(en_vocab, words=["have", "apple"]) - matcher(doc) - - -@pytest.mark.issue(3611) -def test_issue3611(): - """Test whether adding n-grams in the textcat works even when n > token length of some docs""" - unique_classes = ["offensive", "inoffensive"] - x_train = [ - "This is an offensive text", - "This is the second offensive text", - "inoff", - ] - y_train = ["offensive", "offensive", "inoffensive"] - nlp = spacy.blank("en") - # preparing the data - train_data = [] - for text, train_instance in zip(x_train, y_train): - cat_dict = {label: label == train_instance for label in unique_classes} - train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict})) - # add a text categorizer component - model = { - "@architectures": "spacy.TextCatBOW.v1", - "exclusive_classes": True, - "ngram_size": 2, - "no_output_layer": False, - } - textcat = nlp.add_pipe("textcat", config={"model": model}, last=True) - for label in unique_classes: - textcat.add_label(label) - # training the network - with nlp.select_pipes(enable="textcat"): - optimizer = nlp.initialize() - for i in range(3): - losses = {} - batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) - - for batch in batches: - nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses) - - -@pytest.mark.issue(3625) -def test_issue3625(): - """Test that default punctuation rules applies to hindi unicode characters""" - nlp = Hindi() - doc = nlp("hi. how हुए. 
होटल, होटल") - expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"] - assert [token.text for token in doc] == expected - - -@pytest.mark.issue(3803) -def test_issue3803(): - """Test that spanish num-like tokens have True for like_num attribute.""" - nlp = Spanish() - text = "2 dos 1000 mil 12 doce" - doc = nlp(text) - - assert [t.like_num for t in doc] == [True, True, True, True, True, True] - - -def _parser_example(parser): - doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) - gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]} - return Example.from_dict(doc, gold) - - -@pytest.mark.issue(3830) -def test_issue3830_no_subtok(): - """Test that the parser doesn't have subtok label if not learn_tokens""" - config = { - "learn_tokens": False, - } - model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"] - parser = DependencyParser(Vocab(), model, **config) - parser.add_label("nsubj") - assert "subtok" not in parser.labels - parser.initialize(lambda: [_parser_example(parser)]) - assert "subtok" not in parser.labels - - -@pytest.mark.issue(3830) -def test_issue3830_with_subtok(): - """Test that the parser does have subtok label if learn_tokens=True.""" - config = { - "learn_tokens": True, - } - model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"] - parser = DependencyParser(Vocab(), model, **config) - parser.add_label("nsubj") - assert "subtok" not in parser.labels - parser.initialize(lambda: [_parser_example(parser)]) - assert "subtok" in parser.labels - - -@pytest.mark.issue(3839) -def test_issue3839(en_vocab): - """Test that match IDs returned by the matcher are correct, are in the string""" - doc = Doc(en_vocab, words=["terrific", "group", "of", "people"]) - matcher = Matcher(en_vocab) - match_id = "PATTERN" - pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}] - pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}] - 
matcher.add(match_id, [pattern1]) - matches = matcher(doc) - assert matches[0][0] == en_vocab.strings[match_id] - matcher = Matcher(en_vocab) - matcher.add(match_id, [pattern2]) - matches = matcher(doc) - assert matches[0][0] == en_vocab.strings[match_id] - - -@pytest.mark.parametrize( - "sentence", - [ - "The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.", - "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.", - "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one", - "Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.", - "It was a missed assignment, but it shouldn't have resulted in a turnover ...", - ], -) -@pytest.mark.issue(3869) -def test_issue3869(sentence): - """Test that the Doc's count_by function works consistently""" - nlp = English() - doc = nlp(sentence) - count = 0 - for token in doc: - count += token.is_alpha - assert count == doc.count_by(IS_ALPHA).get(1, 0) - - -@pytest.mark.issue(3879) -def test_issue3879(en_vocab): - doc = Doc(en_vocab, words=["This", "is", "a", "test", "."]) - assert len(doc) == 5 - pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}] - matcher = Matcher(en_vocab) - matcher.add("TEST", [pattern]) - assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test' - - -@pytest.mark.issue(3880) -def test_issue3880(): - """Test that `nlp.pipe()` works when an empty string ends the batch. - - Fixed in v7.0.5 of Thinc. 
- """ - texts = ["hello", "world", "", ""] - nlp = English() - nlp.add_pipe("parser").add_label("dep") - nlp.add_pipe("ner").add_label("PERSON") - nlp.add_pipe("tagger").add_label("NN") - nlp.initialize() - for doc in nlp.pipe(texts): - pass - - -@pytest.mark.issue(3882) -def test_issue3882(en_vocab): - """Test that displaCy doesn't serialize the doc.user_data when making a - copy of the Doc. - """ - doc = Doc(en_vocab, words=["Hello", "world"], deps=["dep", "dep"]) - doc.user_data["test"] = set() - parse_deps(doc) - - -@pytest.mark.issue(3951) -def test_issue3951(en_vocab): - """Test that combinations of optional rules are matched correctly.""" - matcher = Matcher(en_vocab) - pattern = [ - {"LOWER": "hello"}, - {"LOWER": "this", "OP": "?"}, - {"OP": "?"}, - {"LOWER": "world"}, - ] - matcher.add("TEST", [pattern]) - doc = Doc(en_vocab, words=["Hello", "my", "new", "world"]) - matches = matcher(doc) - assert len(matches) == 0 - - -@pytest.mark.issue(3959) -def test_issue3959(): - """Ensure that a modified pos attribute is serialized correctly.""" - nlp = English() - doc = nlp( - "displaCy uses JavaScript, SVG and CSS to show you how computers understand language" - ) - assert doc[0].pos_ == "" - doc[0].pos_ = "NOUN" - assert doc[0].pos_ == "NOUN" - # usually this is already True when starting from proper models instead of blank English - with make_tempdir() as tmp_dir: - file_path = tmp_dir / "my_doc" - doc.to_disk(file_path) - doc2 = nlp("") - doc2.from_disk(file_path) - assert doc2[0].pos_ == "NOUN" - - -@pytest.mark.issue(3962) -def test_issue3962(en_vocab): - """Ensure that as_doc does not result in out-of-bound access of tokens. 
- This is achieved by setting the head to itself if it would lie out of the span otherwise.""" - # fmt: off - words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."] - heads = [1, 7, 1, 2, 7, 7, 7, 7, 9, 7, 7] - deps = ["nsubj", "ccomp", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"] - # fmt: on - doc = Doc(en_vocab, words=words, heads=heads, deps=deps) - span2 = doc[1:5] # "jests at scars ," - doc2 = span2.as_doc() - doc2_json = doc2.to_json() - assert doc2_json - # head set to itself, being the new artificial root - assert doc2[0].head.text == "jests" - assert doc2[0].dep_ == "dep" - assert doc2[1].head.text == "jests" - assert doc2[1].dep_ == "prep" - assert doc2[2].head.text == "at" - assert doc2[2].dep_ == "pobj" - assert doc2[3].head.text == "jests" # head set to the new artificial root - assert doc2[3].dep_ == "dep" - # We should still have 1 sentence - assert len(list(doc2.sents)) == 1 - span3 = doc[6:9] # "never felt a" - doc3 = span3.as_doc() - doc3_json = doc3.to_json() - assert doc3_json - assert doc3[0].head.text == "felt" - assert doc3[0].dep_ == "neg" - assert doc3[1].head.text == "felt" - assert doc3[1].dep_ == "ROOT" - assert doc3[2].head.text == "felt" # head set to ancestor - assert doc3[2].dep_ == "dep" - # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound" - assert len(list(doc3.sents)) == 1 - - -@pytest.mark.issue(3962) -def test_issue3962_long(en_vocab): - """Ensure that as_doc does not result in out-of-bound access of tokens. 
- This is achieved by setting the head to itself if it would lie out of the span otherwise.""" - # fmt: off - words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."] - heads = [1, 1, 1, 2, 1, 7, 7, 7, 9, 7, 7] - deps = ["nsubj", "ROOT", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"] - # fmt: on - two_sent_doc = Doc(en_vocab, words=words, heads=heads, deps=deps) - span2 = two_sent_doc[1:7] # "jests at scars. They never" - doc2 = span2.as_doc() - doc2_json = doc2.to_json() - assert doc2_json - # head set to itself, being the new artificial root (in sentence 1) - assert doc2[0].head.text == "jests" - assert doc2[0].dep_ == "ROOT" - assert doc2[1].head.text == "jests" - assert doc2[1].dep_ == "prep" - assert doc2[2].head.text == "at" - assert doc2[2].dep_ == "pobj" - assert doc2[3].head.text == "jests" - assert doc2[3].dep_ == "punct" - # head set to itself, being the new artificial root (in sentence 2) - assert doc2[4].head.text == "They" - assert doc2[4].dep_ == "dep" - # head set to the new artificial head (in sentence 2) - assert doc2[4].head.text == "They" - assert doc2[4].dep_ == "dep" - # We should still have 2 sentences - sents = list(doc2.sents) - assert len(sents) == 2 - assert sents[0].text == "jests at scars ." 
- assert sents[1].text == "They never" - - -@pytest.mark.issue(3972) -def test_issue3972(en_vocab): - """Test that the PhraseMatcher returns duplicates for duplicate match IDs.""" - matcher = PhraseMatcher(en_vocab) - matcher.add("A", [Doc(en_vocab, words=["New", "York"])]) - matcher.add("B", [Doc(en_vocab, words=["New", "York"])]) - doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"]) - matches = matcher(doc) - - assert len(matches) == 2 - - # We should have a match for each of the two rules - found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches] - assert "A" in found_ids - assert "B" in found_ids diff --git a/spacy/tests/regression/test_issue4001-4500.py b/spacy/tests/regression/test_issue4001-4500.py deleted file mode 100644 index 7b7c304a3..000000000 --- a/spacy/tests/regression/test_issue4001-4500.py +++ /dev/null @@ -1,447 +0,0 @@ -import pytest -from spacy.pipeline import TrainablePipe -from spacy.matcher import PhraseMatcher, Matcher -from spacy.tokens import Doc, Span, DocBin -from spacy.training import Example, Corpus -from spacy.training.converters import json_to_docs -from spacy.vocab import Vocab -from spacy.lang.en import English -from spacy.util import minibatch, ensure_path, load_model -from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex -from spacy.tokenizer import Tokenizer -from spacy.lang.el import Greek -from spacy.language import Language -import spacy -from thinc.api import compounding - -from ..util import make_tempdir - - -@pytest.mark.issue(4002) -def test_issue4002(en_vocab): - """Test that the PhraseMatcher can match on overwritten NORM attributes.""" - matcher = PhraseMatcher(en_vocab, attr="NORM") - pattern1 = Doc(en_vocab, words=["c", "d"]) - assert [t.norm_ for t in pattern1] == ["c", "d"] - matcher.add("TEST", [pattern1]) - doc = Doc(en_vocab, words=["a", "b", "c", "d"]) - assert [t.norm_ for t in doc] == ["a", "b", "c", "d"] - matches = matcher(doc) - assert len(matches) == 
1 - matcher = PhraseMatcher(en_vocab, attr="NORM") - pattern2 = Doc(en_vocab, words=["1", "2"]) - pattern2[0].norm_ = "c" - pattern2[1].norm_ = "d" - assert [t.norm_ for t in pattern2] == ["c", "d"] - matcher.add("TEST", [pattern2]) - matches = matcher(doc) - assert len(matches) == 1 - - -@pytest.mark.issue(4030) -def test_issue4030(): - """Test whether textcat works fine with empty doc""" - unique_classes = ["offensive", "inoffensive"] - x_train = [ - "This is an offensive text", - "This is the second offensive text", - "inoff", - ] - y_train = ["offensive", "offensive", "inoffensive"] - nlp = spacy.blank("en") - # preparing the data - train_data = [] - for text, train_instance in zip(x_train, y_train): - cat_dict = {label: label == train_instance for label in unique_classes} - train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict})) - # add a text categorizer component - model = { - "@architectures": "spacy.TextCatBOW.v1", - "exclusive_classes": True, - "ngram_size": 2, - "no_output_layer": False, - } - textcat = nlp.add_pipe("textcat", config={"model": model}, last=True) - for label in unique_classes: - textcat.add_label(label) - # training the network - with nlp.select_pipes(enable="textcat"): - optimizer = nlp.initialize() - for i in range(3): - losses = {} - batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) - - for batch in batches: - nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses) - # processing of an empty doc should result in 0.0 for all categories - doc = nlp("") - assert doc.cats["offensive"] == 0.0 - assert doc.cats["inoffensive"] == 0.0 - - -@pytest.mark.issue(4042) -def test_issue4042(): - """Test that serialization of an EntityRuler before NER works fine.""" - nlp = English() - # add ner pipe - ner = nlp.add_pipe("ner") - ner.add_label("SOME_LABEL") - nlp.initialize() - # Add entity ruler - patterns = [ - {"label": "MY_ORG", "pattern": "Apple"}, - {"label": "MY_GPE", "pattern": [{"lower": 
"san"}, {"lower": "francisco"}]}, - ] - # works fine with "after" - ruler = nlp.add_pipe("entity_ruler", before="ner") - ruler.add_patterns(patterns) - doc1 = nlp("What do you think about Apple ?") - assert doc1.ents[0].label_ == "MY_ORG" - - with make_tempdir() as d: - output_dir = ensure_path(d) - if not output_dir.exists(): - output_dir.mkdir() - nlp.to_disk(output_dir) - nlp2 = load_model(output_dir) - doc2 = nlp2("What do you think about Apple ?") - assert doc2.ents[0].label_ == "MY_ORG" - - -@pytest.mark.issue(4042) -def test_issue4042_bug2(): - """ - Test that serialization of an NER works fine when new labels were added. - This is the second bug of two bugs underlying the issue 4042. - """ - nlp1 = English() - # add ner pipe - ner1 = nlp1.add_pipe("ner") - ner1.add_label("SOME_LABEL") - nlp1.initialize() - # add a new label to the doc - doc1 = nlp1("What do you think about Apple ?") - assert len(ner1.labels) == 1 - assert "SOME_LABEL" in ner1.labels - apple_ent = Span(doc1, 5, 6, label="MY_ORG") - doc1.ents = list(doc1.ents) + [apple_ent] - # Add the label explicitly. Previously we didn't require this. 
- ner1.add_label("MY_ORG") - ner1(doc1) - assert len(ner1.labels) == 2 - assert "SOME_LABEL" in ner1.labels - assert "MY_ORG" in ner1.labels - with make_tempdir() as d: - # assert IO goes fine - output_dir = ensure_path(d) - if not output_dir.exists(): - output_dir.mkdir() - ner1.to_disk(output_dir) - config = {} - ner2 = nlp1.create_pipe("ner", config=config) - ner2.from_disk(output_dir) - assert len(ner2.labels) == 2 - - -@pytest.mark.issue(4054) -def test_issue4054(en_vocab): - """Test that a new blank model can be made with a vocab from file, - and that serialization does not drop the language at any point.""" - nlp1 = English() - vocab1 = nlp1.vocab - with make_tempdir() as d: - vocab_dir = ensure_path(d / "vocab") - if not vocab_dir.exists(): - vocab_dir.mkdir() - vocab1.to_disk(vocab_dir) - vocab2 = Vocab().from_disk(vocab_dir) - nlp2 = spacy.blank("en", vocab=vocab2) - nlp_dir = ensure_path(d / "nlp") - if not nlp_dir.exists(): - nlp_dir.mkdir() - nlp2.to_disk(nlp_dir) - nlp3 = load_model(nlp_dir) - assert nlp3.lang == "en" - - -@pytest.mark.issue(4120) -def test_issue4120(en_vocab): - """Test that matches without a final {OP: ?} token are returned.""" - matcher = Matcher(en_vocab) - matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]]) - doc1 = Doc(en_vocab, words=["a"]) - assert len(matcher(doc1)) == 1 # works - doc2 = Doc(en_vocab, words=["a", "b", "c"]) - assert len(matcher(doc2)) == 2 # fixed - matcher = Matcher(en_vocab) - matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]]) - doc3 = Doc(en_vocab, words=["a", "b", "b", "c"]) - assert len(matcher(doc3)) == 2 # works - matcher = Matcher(en_vocab) - matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]]) - doc4 = Doc(en_vocab, words=["a", "b", "b", "c"]) - assert len(matcher(doc4)) == 3 # fixed - - -@pytest.mark.issue(4133) -def test_issue4133(en_vocab): - nlp = English() - vocab_bytes = nlp.vocab.to_bytes() - words = ["Apple", "is", "looking", "at", "buying", "a", 
"startup"] - pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"] - doc = Doc(en_vocab, words=words) - for i, token in enumerate(doc): - token.pos_ = pos[i] - # usually this is already True when starting from proper models instead of blank English - doc_bytes = doc.to_bytes() - vocab = Vocab() - vocab = vocab.from_bytes(vocab_bytes) - doc = Doc(vocab).from_bytes(doc_bytes) - actual = [] - for token in doc: - actual.append(token.pos_) - assert actual == pos - - -@pytest.mark.issue(4190) -def test_issue4190(): - def customize_tokenizer(nlp): - prefix_re = compile_prefix_regex(nlp.Defaults.prefixes) - suffix_re = compile_suffix_regex(nlp.Defaults.suffixes) - infix_re = compile_infix_regex(nlp.Defaults.infixes) - # Remove all exceptions where a single letter is followed by a period (e.g. 'h.') - exceptions = { - k: v - for k, v in dict(nlp.Defaults.tokenizer_exceptions).items() - if not (len(k) == 2 and k[1] == ".") - } - new_tokenizer = Tokenizer( - nlp.vocab, - exceptions, - prefix_search=prefix_re.search, - suffix_search=suffix_re.search, - infix_finditer=infix_re.finditer, - token_match=nlp.tokenizer.token_match, - ) - nlp.tokenizer = new_tokenizer - - test_string = "Test c." 
- # Load default language - nlp_1 = English() - doc_1a = nlp_1(test_string) - result_1a = [token.text for token in doc_1a] # noqa: F841 - # Modify tokenizer - customize_tokenizer(nlp_1) - doc_1b = nlp_1(test_string) - result_1b = [token.text for token in doc_1b] - # Save and Reload - with make_tempdir() as model_dir: - nlp_1.to_disk(model_dir) - nlp_2 = load_model(model_dir) - # This should be the modified tokenizer - doc_2 = nlp_2(test_string) - result_2 = [token.text for token in doc_2] - assert result_1b == result_2 - - -@pytest.mark.issue(4267) -def test_issue4267(): - """Test that running an entity_ruler after ner gives consistent results""" - nlp = English() - ner = nlp.add_pipe("ner") - ner.add_label("PEOPLE") - nlp.initialize() - assert "ner" in nlp.pipe_names - # assert that we have correct IOB annotations - doc1 = nlp("hi") - assert doc1.has_annotation("ENT_IOB") - for token in doc1: - assert token.ent_iob == 2 - # add entity ruler and run again - patterns = [{"label": "SOFTWARE", "pattern": "spacy"}] - ruler = nlp.add_pipe("entity_ruler") - ruler.add_patterns(patterns) - assert "entity_ruler" in nlp.pipe_names - assert "ner" in nlp.pipe_names - # assert that we still have correct IOB annotations - doc2 = nlp("hi") - assert doc2.has_annotation("ENT_IOB") - for token in doc2: - assert token.ent_iob == 2 - - -@pytest.mark.skip(reason="lemmatizer lookups no longer in vocab") -@pytest.mark.issue(4272) -def test_issue4272(): - """Test that lookup table can be accessed from Token.lemma if no POS tags - are available.""" - nlp = Greek() - doc = nlp("Χθες") - assert doc[0].lemma_ - - -def test_multiple_predictions(): - class DummyPipe(TrainablePipe): - def __init__(self): - self.model = "dummy_model" - - def predict(self, docs): - return ([1, 2, 3], [4, 5, 6]) - - def set_annotations(self, docs, scores): - return docs - - nlp = Language() - doc = nlp.make_doc("foo") - dummy_pipe = DummyPipe() - dummy_pipe(doc) - - -@pytest.mark.issue(4313) -def test_issue4313(): 
- """This should not crash or exit with some strange error code""" - beam_width = 16 - beam_density = 0.0001 - nlp = English() - config = { - "beam_width": beam_width, - "beam_density": beam_density, - } - ner = nlp.add_pipe("beam_ner", config=config) - ner.add_label("SOME_LABEL") - nlp.initialize() - # add a new label to the doc - doc = nlp("What do you think about Apple ?") - assert len(ner.labels) == 1 - assert "SOME_LABEL" in ner.labels - apple_ent = Span(doc, 5, 6, label="MY_ORG") - doc.ents = list(doc.ents) + [apple_ent] - - # ensure the beam_parse still works with the new label - docs = [doc] - ner.beam_parse(docs, drop=0.0, beam_width=beam_width, beam_density=beam_density) - assert len(ner.labels) == 2 - assert "MY_ORG" in ner.labels - - -@pytest.mark.issue(4348) -def test_issue4348(): - """Test that training the tagger with empty data, doesn't throw errors""" - nlp = English() - example = Example.from_dict(nlp.make_doc(""), {"tags": []}) - TRAIN_DATA = [example, example] - tagger = nlp.add_pipe("tagger") - tagger.add_label("A") - optimizer = nlp.initialize() - for i in range(5): - losses = {} - batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) - for batch in batches: - nlp.update(batch, sgd=optimizer, losses=losses) - - -@pytest.mark.issue(4367) -def test_issue4367(): - """Test that docbin init goes well""" - DocBin() - DocBin(attrs=["LEMMA"]) - DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"]) - - -@pytest.mark.issue(4373) -def test_issue4373(): - """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab).""" - matcher = Matcher(Vocab()) - assert isinstance(matcher.vocab, Vocab) - matcher = PhraseMatcher(Vocab()) - assert isinstance(matcher.vocab, Vocab) - - -@pytest.mark.issue(4402) -def test_issue4402(): - json_data = { - "id": 0, - "paragraphs": [ - { - "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.", - "sentences": [ - { - "tokens": [ - {"id": 0, "orth": "How", "ner": "O"}, - 
{"id": 1, "orth": "should", "ner": "O"}, - {"id": 2, "orth": "I", "ner": "O"}, - {"id": 3, "orth": "cook", "ner": "O"}, - {"id": 4, "orth": "bacon", "ner": "O"}, - {"id": 5, "orth": "in", "ner": "O"}, - {"id": 6, "orth": "an", "ner": "O"}, - {"id": 7, "orth": "oven", "ner": "O"}, - {"id": 8, "orth": "?", "ner": "O"}, - ], - "brackets": [], - }, - { - "tokens": [ - {"id": 9, "orth": "\n", "ner": "O"}, - {"id": 10, "orth": "I", "ner": "O"}, - {"id": 11, "orth": "'ve", "ner": "O"}, - {"id": 12, "orth": "heard", "ner": "O"}, - {"id": 13, "orth": "of", "ner": "O"}, - {"id": 14, "orth": "people", "ner": "O"}, - {"id": 15, "orth": "cooking", "ner": "O"}, - {"id": 16, "orth": "bacon", "ner": "O"}, - {"id": 17, "orth": "in", "ner": "O"}, - {"id": 18, "orth": "an", "ner": "O"}, - {"id": 19, "orth": "oven", "ner": "O"}, - {"id": 20, "orth": ".", "ner": "O"}, - ], - "brackets": [], - }, - ], - "cats": [ - {"label": "baking", "value": 1.0}, - {"label": "not_baking", "value": 0.0}, - ], - }, - { - "raw": "What is the difference between white and brown eggs?\n", - "sentences": [ - { - "tokens": [ - {"id": 0, "orth": "What", "ner": "O"}, - {"id": 1, "orth": "is", "ner": "O"}, - {"id": 2, "orth": "the", "ner": "O"}, - {"id": 3, "orth": "difference", "ner": "O"}, - {"id": 4, "orth": "between", "ner": "O"}, - {"id": 5, "orth": "white", "ner": "O"}, - {"id": 6, "orth": "and", "ner": "O"}, - {"id": 7, "orth": "brown", "ner": "O"}, - {"id": 8, "orth": "eggs", "ner": "O"}, - {"id": 9, "orth": "?", "ner": "O"}, - ], - "brackets": [], - }, - {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []}, - ], - "cats": [ - {"label": "baking", "value": 0.0}, - {"label": "not_baking", "value": 1.0}, - ], - }, - ], - } - nlp = English() - attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"] - with make_tempdir() as tmpdir: - output_file = tmpdir / "test4402.spacy" - docs = json_to_docs([json_data]) - data = DocBin(docs=docs, attrs=attrs).to_bytes() - with output_file.open("wb") as file_: 
- file_.write(data) - reader = Corpus(output_file) - train_data = list(reader(nlp)) - assert len(train_data) == 2 - - split_train_data = [] - for eg in train_data: - split_train_data.extend(eg.split_sents()) - assert len(split_train_data) == 4 diff --git a/spacy/tests/regression/test_issue4501-5000.py b/spacy/tests/regression/test_issue4501-5000.py deleted file mode 100644 index 07a00d2b7..000000000 --- a/spacy/tests/regression/test_issue4501-5000.py +++ /dev/null @@ -1,266 +0,0 @@ -import pytest -from spacy.tokens import Doc, Span, DocBin -from spacy.training import Example -from spacy.training.converters.conllu_to_docs import conllu_to_docs -from spacy.lang.en import English -from spacy.kb import KnowledgeBase -from spacy.vocab import Vocab -from spacy.language import Language -from spacy.util import ensure_path, load_model_from_path -import numpy -import pickle -from thinc.api import NumpyOps, get_current_ops - -from ..util import make_tempdir - - -@pytest.mark.issue(4528) -def test_issue4528(en_vocab): - """Test that user_data is correctly serialized in DocBin.""" - doc = Doc(en_vocab, words=["hello", "world"]) - doc.user_data["foo"] = "bar" - # This is how extension attribute values are stored in the user data - doc.user_data[("._.", "foo", None, None)] = "bar" - doc_bin = DocBin(store_user_data=True) - doc_bin.add(doc) - doc_bin_bytes = doc_bin.to_bytes() - new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes) - new_doc = list(new_doc_bin.get_docs(en_vocab))[0] - assert new_doc.user_data["foo"] == "bar" - assert new_doc.user_data[("._.", "foo", None, None)] == "bar" - - -@pytest.mark.parametrize( - "text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])] -) -def test_gold_misaligned(en_tokenizer, text, words): - doc = en_tokenizer(text) - Example.from_dict(doc, {"words": words}) - - -@pytest.mark.issue(4651) -def test_issue4651_with_phrase_matcher_attr(): - """Test that the EntityRuler PhraseMatcher is deserialized correctly using - 
the method from_disk when the EntityRuler argument phrase_matcher_attr is - specified. - """ - text = "Spacy is a python library for nlp" - nlp = English() - patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}] - ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"}) - ruler.add_patterns(patterns) - doc = nlp(text) - res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents] - nlp_reloaded = English() - with make_tempdir() as d: - file_path = d / "entityruler" - ruler.to_disk(file_path) - nlp_reloaded.add_pipe("entity_ruler").from_disk(file_path) - doc_reloaded = nlp_reloaded(text) - res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents] - assert res == res_reloaded - - -@pytest.mark.issue(4651) -def test_issue4651_without_phrase_matcher_attr(): - """Test that the EntityRuler PhraseMatcher is deserialized correctly using - the method from_disk when the EntityRuler argument phrase_matcher_attr is - not specified. - """ - text = "Spacy is a python library for nlp" - nlp = English() - patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}] - ruler = nlp.add_pipe("entity_ruler") - ruler.add_patterns(patterns) - doc = nlp(text) - res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents] - nlp_reloaded = English() - with make_tempdir() as d: - file_path = d / "entityruler" - ruler.to_disk(file_path) - nlp_reloaded.add_pipe("entity_ruler").from_disk(file_path) - doc_reloaded = nlp_reloaded(text) - res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents] - assert res == res_reloaded - - -@pytest.mark.issue(4665) -def test_issue4665(): - """ - conllu_to_docs should not raise an exception if the HEAD column contains an - underscore - """ - input_data = """ -1 [ _ PUNCT -LRB- _ _ punct _ _ -2 This _ DET DT _ _ det _ _ -3 killing _ NOUN NN _ _ nsubj _ _ -4 of _ ADP IN _ _ case _ _ -5 a _ DET DT _ _ det _ _ -6 respected _ ADJ JJ _ _ amod _ _ -7 cleric _ NOUN 
NN _ _ nmod _ _ -8 will _ AUX MD _ _ aux _ _ -9 be _ AUX VB _ _ aux _ _ -10 causing _ VERB VBG _ _ root _ _ -11 us _ PRON PRP _ _ iobj _ _ -12 trouble _ NOUN NN _ _ dobj _ _ -13 for _ ADP IN _ _ case _ _ -14 years _ NOUN NNS _ _ nmod _ _ -15 to _ PART TO _ _ mark _ _ -16 come _ VERB VB _ _ acl _ _ -17 . _ PUNCT . _ _ punct _ _ -18 ] _ PUNCT -RRB- _ _ punct _ _ -""" - conllu_to_docs(input_data) - - -@pytest.mark.issue(4674) -def test_issue4674(): - """Test that setting entities with overlapping identifiers does not mess up IO""" - nlp = English() - kb = KnowledgeBase(nlp.vocab, entity_vector_length=3) - vector1 = [0.9, 1.1, 1.01] - vector2 = [1.8, 2.25, 2.01] - with pytest.warns(UserWarning): - kb.set_entities( - entity_list=["Q1", "Q1"], - freq_list=[32, 111], - vector_list=[vector1, vector2], - ) - assert kb.get_size_entities() == 1 - # dumping to file & loading back in - with make_tempdir() as d: - dir_path = ensure_path(d) - if not dir_path.exists(): - dir_path.mkdir() - file_path = dir_path / "kb" - kb.to_disk(str(file_path)) - kb2 = KnowledgeBase(nlp.vocab, entity_vector_length=3) - kb2.from_disk(str(file_path)) - assert kb2.get_size_entities() == 1 - - -@pytest.mark.skip(reason="API change: disable just disables, new exclude arg") -@pytest.mark.issue(4707) -def test_issue4707(): - """Tests that disabled component names are also excluded from nlp.from_disk - by default when loading a model. 
- """ - nlp = English() - nlp.add_pipe("sentencizer") - nlp.add_pipe("entity_ruler") - assert nlp.pipe_names == ["sentencizer", "entity_ruler"] - exclude = ["tokenizer", "sentencizer"] - with make_tempdir() as tmpdir: - nlp.to_disk(tmpdir, exclude=exclude) - new_nlp = load_model_from_path(tmpdir, disable=exclude) - assert "sentencizer" not in new_nlp.pipe_names - assert "entity_ruler" in new_nlp.pipe_names - - -@pytest.mark.issue(4725) -def test_issue4725_1(): - """Ensure the pickling of the NER goes well""" - vocab = Vocab(vectors_name="test_vocab_add_vector") - nlp = English(vocab=vocab) - config = { - "update_with_oracle_cut_size": 111, - } - ner = nlp.create_pipe("ner", config=config) - with make_tempdir() as tmp_path: - with (tmp_path / "ner.pkl").open("wb") as file_: - pickle.dump(ner, file_) - assert ner.cfg["update_with_oracle_cut_size"] == 111 - - with (tmp_path / "ner.pkl").open("rb") as file_: - ner2 = pickle.load(file_) - assert ner2.cfg["update_with_oracle_cut_size"] == 111 - - -@pytest.mark.issue(4725) -def test_issue4725_2(): - if isinstance(get_current_ops, NumpyOps): - # ensures that this runs correctly and doesn't hang or crash because of the global vectors - # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. 
on Windows), - # or because of issues with pickling the NER (cf test_issue4725_1) - vocab = Vocab(vectors_name="test_vocab_add_vector") - data = numpy.ndarray((5, 3), dtype="f") - data[0] = 1.0 - data[1] = 2.0 - vocab.set_vector("cat", data[0]) - vocab.set_vector("dog", data[1]) - nlp = English(vocab=vocab) - nlp.add_pipe("ner") - nlp.initialize() - docs = ["Kurt is in London."] * 10 - for _ in nlp.pipe(docs, batch_size=2, n_process=2): - pass - - -@pytest.mark.issue(4849) -def test_issue4849(): - nlp = English() - patterns = [ - {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"}, - {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"}, - ] - ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"}) - ruler.add_patterns(patterns) - text = """ - The left is starting to take aim at Democratic front-runner Joe Biden. - Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy." - """ - # USING 1 PROCESS - count_ents = 0 - for doc in nlp.pipe([text], n_process=1): - count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) - assert count_ents == 2 - # USING 2 PROCESSES - if isinstance(get_current_ops, NumpyOps): - count_ents = 0 - for doc in nlp.pipe([text], n_process=2): - count_ents += len([ent for ent in doc.ents if ent.ent_id > 0]) - assert count_ents == 2 - - -@Language.factory("my_pipe") -class CustomPipe: - def __init__(self, nlp, name="my_pipe"): - self.name = name - Span.set_extension("my_ext", getter=self._get_my_ext) - Doc.set_extension("my_ext", default=None) - - def __call__(self, doc): - gathered_ext = [] - for sent in doc.sents: - sent_ext = self._get_my_ext(sent) - sent._.set("my_ext", sent_ext) - gathered_ext.append(sent_ext) - - doc._.set("my_ext", "\n".join(gathered_ext)) - return doc - - @staticmethod - def _get_my_ext(span): - return str(span.end) - - -@pytest.mark.issue(4903) -def test_issue4903(): - """Ensure that this runs correctly and 
doesn't hang or crash on Windows / - macOS.""" - nlp = English() - nlp.add_pipe("sentencizer") - nlp.add_pipe("my_pipe", after="sentencizer") - text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."] - if isinstance(get_current_ops(), NumpyOps): - docs = list(nlp.pipe(text, n_process=2)) - assert docs[0].text == "I like bananas." - assert docs[1].text == "Do you like them?" - assert docs[2].text == "No, I prefer wasabi." - - -@pytest.mark.issue(4924) -def test_issue4924(): - nlp = Language() - example = Example.from_dict(nlp.make_doc(""), {}) - nlp.evaluate([example]) diff --git a/spacy/tests/regression/test_issue5001-5500.py b/spacy/tests/regression/test_issue5001-5500.py deleted file mode 100644 index e1f5231e7..000000000 --- a/spacy/tests/regression/test_issue5001-5500.py +++ /dev/null @@ -1,149 +0,0 @@ -import numpy -from spacy.tokens import Doc, DocBin -from spacy.attrs import DEP, POS, TAG -from spacy.lang.en import English -from spacy.language import Language -from spacy.lang.en.syntax_iterators import noun_chunks -from spacy.vocab import Vocab -import spacy -from thinc.api import get_current_ops -import pytest - -from ...util import make_tempdir - - -@pytest.mark.issue(5048) -def test_issue5048(en_vocab): - words = ["This", "is", "a", "sentence"] - pos_s = ["DET", "VERB", "DET", "NOUN"] - spaces = [" ", " ", " ", ""] - deps_s = ["dep", "adj", "nn", "atm"] - tags_s = ["DT", "VBZ", "DT", "NN"] - strings = en_vocab.strings - for w in words: - strings.add(w) - deps = [strings.add(d) for d in deps_s] - pos = [strings.add(p) for p in pos_s] - tags = [strings.add(t) for t in tags_s] - attrs = [POS, DEP, TAG] - array = numpy.array(list(zip(pos, deps, tags)), dtype="uint64") - doc = Doc(en_vocab, words=words, spaces=spaces) - doc.from_array(attrs, array) - v1 = [(token.text, token.pos_, token.tag_) for token in doc] - doc2 = Doc(en_vocab, words=words, pos=pos_s, deps=deps_s, tags=tags_s) - v2 = [(token.text, token.pos_, token.tag_) for token in doc2] 
- assert v1 == v2 - - -@pytest.mark.issue(5082) -def test_issue5082(): - # Ensure the 'merge_entities' pipeline does something sensible for the vectors of the merged tokens - nlp = English() - vocab = nlp.vocab - array1 = numpy.asarray([0.1, 0.5, 0.8], dtype=numpy.float32) - array2 = numpy.asarray([-0.2, -0.6, -0.9], dtype=numpy.float32) - array3 = numpy.asarray([0.3, -0.1, 0.7], dtype=numpy.float32) - array4 = numpy.asarray([0.5, 0, 0.3], dtype=numpy.float32) - array34 = numpy.asarray([0.4, -0.05, 0.5], dtype=numpy.float32) - vocab.set_vector("I", array1) - vocab.set_vector("like", array2) - vocab.set_vector("David", array3) - vocab.set_vector("Bowie", array4) - text = "I like David Bowie" - patterns = [ - {"label": "PERSON", "pattern": [{"LOWER": "david"}, {"LOWER": "bowie"}]} - ] - ruler = nlp.add_pipe("entity_ruler") - ruler.add_patterns(patterns) - parsed_vectors_1 = [t.vector for t in nlp(text)] - assert len(parsed_vectors_1) == 4 - ops = get_current_ops() - numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[0]), array1) - numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[1]), array2) - numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[2]), array3) - numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[3]), array4) - nlp.add_pipe("merge_entities") - parsed_vectors_2 = [t.vector for t in nlp(text)] - assert len(parsed_vectors_2) == 3 - numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[0]), array1) - numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[1]), array2) - numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[2]), array34) - - -@pytest.mark.issue(5137) -def test_issue5137(): - factory_name = "test_issue5137" - pipe_name = "my_component" - - @Language.factory(factory_name) - class MyComponent: - def __init__(self, nlp, name=pipe_name, categories="all_categories"): - self.nlp = nlp - self.categories = categories - self.name = name - - def __call__(self, doc): - pass - - def 
to_disk(self, path, **kwargs): - pass - - def from_disk(self, path, **cfg): - pass - - nlp = English() - my_component = nlp.add_pipe(factory_name, name=pipe_name) - assert my_component.categories == "all_categories" - with make_tempdir() as tmpdir: - nlp.to_disk(tmpdir) - overrides = {"components": {pipe_name: {"categories": "my_categories"}}} - nlp2 = spacy.load(tmpdir, config=overrides) - assert nlp2.get_pipe(pipe_name).categories == "my_categories" - - -@pytest.mark.issue(5141) -def test_issue5141(en_vocab): - """Ensure an empty DocBin does not crash on serialization""" - doc_bin = DocBin(attrs=["DEP", "HEAD"]) - assert list(doc_bin.get_docs(en_vocab)) == [] - doc_bin_bytes = doc_bin.to_bytes() - doc_bin_2 = DocBin().from_bytes(doc_bin_bytes) - assert list(doc_bin_2.get_docs(en_vocab)) == [] - - -@pytest.mark.issue(5152) -def test_issue5152(): - # Test that the comparison between a Span and a Token, goes well - # There was a bug when the number of tokens in the span equaled the number of characters in the token (!) 
- nlp = English() - text = nlp("Talk about being boring!") - text_var = nlp("Talk of being boring!") - y = nlp("Let") - span = text[0:3] # Talk about being - span_2 = text[0:3] # Talk about being - span_3 = text_var[0:3] # Talk of being - token = y[0] # Let - with pytest.warns(UserWarning): - assert span.similarity(token) == 0.0 - assert span.similarity(span_2) == 1.0 - with pytest.warns(UserWarning): - assert span_2.similarity(span_3) < 1.0 - - -@pytest.mark.issue(5458) -def test_issue5458(): - # Test that the noun chuncker does not generate overlapping spans - # fmt: off - words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."] - vocab = Vocab(strings=words) - deps = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"] - pos = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"] - heads = [0, 2, 0, 9, 6, 6, 2, 6, 7, 7, 0] - # fmt: on - en_doc = Doc(vocab, words=words, pos=pos, heads=heads, deps=deps) - en_doc.noun_chunks_iterator = noun_chunks - - # if there are overlapping spans, this will fail with an E102 error "Can't merge non-disjoint spans" - nlp = English() - merge_nps = nlp.create_pipe("merge_noun_chunks") - merge_nps(en_doc) diff --git a/spacy/tests/regression/test_issue5501-6000.py b/spacy/tests/regression/test_issue5501-6000.py deleted file mode 100644 index 87c40ec2a..000000000 --- a/spacy/tests/regression/test_issue5501-6000.py +++ /dev/null @@ -1,95 +0,0 @@ -import pytest -from numpy.testing import assert_almost_equal -from thinc.api import Config, fix_random_seed, get_current_ops - -from spacy.lang.en import English -from spacy.pipeline.textcat import single_label_default_config, single_label_bow_config -from spacy.pipeline.textcat import single_label_cnn_config -from spacy.pipeline.textcat_multilabel import multi_label_default_config -from spacy.pipeline.textcat_multilabel import multi_label_bow_config -from 
spacy.pipeline.textcat_multilabel import multi_label_cnn_config -from spacy.tokens import Span -from spacy import displacy -from spacy.pipeline import merge_entities -from spacy.training import Example - - -@pytest.mark.parametrize( - "textcat_config", - [ - single_label_default_config, - single_label_bow_config, - single_label_cnn_config, - multi_label_default_config, - multi_label_bow_config, - multi_label_cnn_config, - ], -) -@pytest.mark.issue(5551) -def test_issue5551(textcat_config): - """Test that after fixing the random seed, the results of the pipeline are truly identical""" - component = "textcat" - - pipe_cfg = Config().from_str(textcat_config) - results = [] - for i in range(3): - fix_random_seed(0) - nlp = English() - text = "Once hot, form ping-pong-ball-sized balls of the mixture, each weighing roughly 25 g." - annots = {"cats": {"Labe1": 1.0, "Label2": 0.0, "Label3": 0.0}} - pipe = nlp.add_pipe(component, config=pipe_cfg, last=True) - for label in set(annots["cats"]): - pipe.add_label(label) - # Train - nlp.initialize() - doc = nlp.make_doc(text) - nlp.update([Example.from_dict(doc, annots)]) - # Store the result of each iteration - result = pipe.model.predict([doc]) - results.append(result[0]) - # All results should be the same because of the fixed seed - assert len(results) == 3 - ops = get_current_ops() - assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1]), decimal=5) - assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]), decimal=5) - - -@pytest.mark.issue(5838) -def test_issue5838(): - # Displacy's EntityRenderer break line - # not working after last entity - sample_text = "First line\nSecond line, with ent\nThird line\nFourth line\n" - nlp = English() - doc = nlp(sample_text) - doc.ents = [Span(doc, 7, 8, label="test")] - html = displacy.render(doc, style="ent") - found = html.count("
") - assert found == 4 - - -@pytest.mark.issue(5918) -def test_issue5918(): - # Test edge case when merging entities. - nlp = English() - ruler = nlp.add_pipe("entity_ruler") - patterns = [ - {"label": "ORG", "pattern": "Digicon Inc"}, - {"label": "ORG", "pattern": "Rotan Mosle Inc's"}, - {"label": "ORG", "pattern": "Rotan Mosle Technology Partners Ltd"}, - ] - ruler.add_patterns(patterns) - - text = """ - Digicon Inc said it has completed the previously-announced disposition - of its computer systems division to an investment group led by - Rotan Mosle Inc's Rotan Mosle Technology Partners Ltd affiliate. - """ - doc = nlp(text) - assert len(doc.ents) == 3 - # make it so that the third span's head is within the entity (ent_iob=I) - # bug #5918 would wrongly transfer that I to the full entity, resulting in 2 instead of 3 final ents. - # TODO: test for logging here - # with pytest.warns(UserWarning): - # doc[29].head = doc[33] - doc = merge_entities(doc) - assert len(doc.ents) == 3 diff --git a/spacy/tests/regression/test_issue6001-6500.py b/spacy/tests/regression/test_issue6001-6500.py deleted file mode 100644 index cb27d39e4..000000000 --- a/spacy/tests/regression/test_issue6001-6500.py +++ /dev/null @@ -1,30 +0,0 @@ -from spacy.util import filter_spans -from pydantic import ValidationError -from spacy.schemas import TokenPattern, TokenPatternSchema -import pytest - - -@pytest.mark.issue(6207) -def test_issue6207(en_tokenizer): - doc = en_tokenizer("zero one two three four five six") - - # Make spans - s1 = doc[:4] - s2 = doc[3:6] # overlaps with s1 - s3 = doc[5:7] # overlaps with s2, not s1 - - result = filter_spans((s1, s2, s3)) - assert s1 in result - assert s2 not in result - assert s3 in result - - -@pytest.mark.issue(6258) -def test_issue6258(): - """Test that the non-empty constraint pattern field is respected""" - # These one is valid - TokenPatternSchema(pattern=[TokenPattern()]) - # But an empty pattern list should fail to validate - # based on the 
schema's constraint - with pytest.raises(ValidationError): - TokenPatternSchema(pattern=[]) diff --git a/spacy/tests/regression/test_issue6501-7000.py b/spacy/tests/regression/test_issue6501-7000.py deleted file mode 100644 index 84517d79b..000000000 --- a/spacy/tests/regression/test_issue6501-7000.py +++ /dev/null @@ -1,238 +0,0 @@ -import pytest -from spacy.lang.en import English -import numpy as np -import spacy -from spacy.tokens import Doc -from spacy.matcher import PhraseMatcher -from spacy.tokens import DocBin -from spacy.util import load_config_from_str -from spacy.training import Example -from spacy.training.initialize import init_nlp -import pickle - -from ..util import make_tempdir - - -@pytest.mark.issue(6730) -def test_issue6730(en_vocab): - """Ensure that the KB does not accept empty strings, but otherwise IO works fine.""" - from spacy.kb import KnowledgeBase - - kb = KnowledgeBase(en_vocab, entity_vector_length=3) - kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3]) - - with pytest.raises(ValueError): - kb.add_alias(alias="", entities=["1"], probabilities=[0.4]) - assert kb.contains_alias("") is False - - kb.add_alias(alias="x", entities=["1"], probabilities=[0.2]) - kb.add_alias(alias="y", entities=["1"], probabilities=[0.1]) - - with make_tempdir() as tmp_dir: - kb.to_disk(tmp_dir) - kb.from_disk(tmp_dir) - assert kb.get_size_aliases() == 2 - assert set(kb.get_alias_strings()) == {"x", "y"} - - -@pytest.mark.issue(6755) -def test_issue6755(en_tokenizer): - doc = en_tokenizer("This is a magnificent sentence.") - span = doc[:0] - assert span.text_with_ws == "" - assert span.text == "" - - -@pytest.mark.parametrize( - "sentence, start_idx,end_idx,label", - [("Welcome to Mumbai, my friend", 11, 17, "GPE")], -) -@pytest.mark.issue(6815) -def test_issue6815_1(sentence, start_idx, end_idx, label): - nlp = English() - doc = nlp(sentence) - span = doc[:].char_span(start_idx, end_idx, label=label) - assert span.label_ == label - - 
-@pytest.mark.parametrize( - "sentence, start_idx,end_idx,kb_id", [("Welcome to Mumbai, my friend", 11, 17, 5)] -) -@pytest.mark.issue(6815) -def test_issue6815_2(sentence, start_idx, end_idx, kb_id): - nlp = English() - doc = nlp(sentence) - span = doc[:].char_span(start_idx, end_idx, kb_id=kb_id) - assert span.kb_id == kb_id - - -@pytest.mark.parametrize( - "sentence, start_idx,end_idx,vector", - [("Welcome to Mumbai, my friend", 11, 17, np.array([0.1, 0.2, 0.3]))], -) -@pytest.mark.issue(6815) -def test_issue6815_3(sentence, start_idx, end_idx, vector): - nlp = English() - doc = nlp(sentence) - span = doc[:].char_span(start_idx, end_idx, vector=vector) - assert (span.vector == vector).all() - - -@pytest.mark.issue(6839) -def test_issue6839(en_vocab): - """Ensure that PhraseMatcher accepts Span as input""" - # fmt: off - words = ["I", "like", "Spans", "and", "Docs", "in", "my", "input", ",", "and", "nothing", "else", "."] - # fmt: on - doc = Doc(en_vocab, words=words) - span = doc[:8] - pattern = Doc(en_vocab, words=["Spans", "and", "Docs"]) - matcher = PhraseMatcher(en_vocab) - matcher.add("SPACY", [pattern]) - matches = matcher(span) - assert matches - - -CONFIG_ISSUE_6908 = """ -[paths] -train = "TRAIN_PLACEHOLDER" -raw = null -init_tok2vec = null -vectors = null - -[system] -seed = 0 -gpu_allocator = null - -[nlp] -lang = "en" -pipeline = ["textcat"] -tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} -disabled = [] -before_creation = null -after_creation = null -after_pipeline_creation = null -batch_size = 1000 - -[components] - -[components.textcat] -factory = "TEXTCAT_PLACEHOLDER" - -[corpora] - -[corpora.train] -@readers = "spacy.Corpus.v1" -path = ${paths:train} - -[corpora.dev] -@readers = "spacy.Corpus.v1" -path = ${paths:train} - - -[training] -train_corpus = "corpora.train" -dev_corpus = "corpora.dev" -seed = ${system.seed} -gpu_allocator = ${system.gpu_allocator} -frozen_components = [] -before_to_disk = null - -[pretraining] - -[initialize] -vectors 
= ${paths.vectors} -init_tok2vec = ${paths.init_tok2vec} -vocab_data = null -lookups = null -before_init = null -after_init = null - -[initialize.components] - -[initialize.components.textcat] -labels = ['label1', 'label2'] - -[initialize.tokenizer] -""" - - -@pytest.mark.parametrize( - "component_name", - ["textcat", "textcat_multilabel"], -) -@pytest.mark.issue(6908) -def test_issue6908(component_name): - """Test intializing textcat with labels in a list""" - - def create_data(out_file): - nlp = spacy.blank("en") - doc = nlp.make_doc("Some text") - doc.cats = {"label1": 0, "label2": 1} - out_data = DocBin(docs=[doc]).to_bytes() - with out_file.open("wb") as file_: - file_.write(out_data) - - with make_tempdir() as tmp_path: - train_path = tmp_path / "train.spacy" - create_data(train_path) - config_str = CONFIG_ISSUE_6908.replace("TEXTCAT_PLACEHOLDER", component_name) - config_str = config_str.replace("TRAIN_PLACEHOLDER", train_path.as_posix()) - config = load_config_from_str(config_str) - init_nlp(config) - - -CONFIG_ISSUE_6950 = """ -[nlp] -lang = "en" -pipeline = ["tok2vec", "tagger"] - -[components] - -[components.tok2vec] -factory = "tok2vec" - -[components.tok2vec.model] -@architectures = "spacy.Tok2Vec.v1" - -[components.tok2vec.model.embed] -@architectures = "spacy.MultiHashEmbed.v1" -width = ${components.tok2vec.model.encode:width} -attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] -rows = [5000,2500,2500,2500] -include_static_vectors = false - -[components.tok2vec.model.encode] -@architectures = "spacy.MaxoutWindowEncoder.v1" -width = 96 -depth = 4 -window_size = 1 -maxout_pieces = 3 - -[components.ner] -factory = "ner" - -[components.tagger] -factory = "tagger" - -[components.tagger.model] -@architectures = "spacy.Tagger.v1" -nO = null - -[components.tagger.model.tok2vec] -@architectures = "spacy.Tok2VecListener.v1" -width = ${components.tok2vec.model.encode:width} -upstream = "*" -""" - - -@pytest.mark.issue(6950) -def test_issue6950(): - """Test that the nlp 
object with initialized tok2vec with listeners pickles - correctly (and doesn't have lambdas). - """ - nlp = English.from_config(load_config_from_str(CONFIG_ISSUE_6950)) - nlp.initialize(lambda: [Example.from_dict(nlp.make_doc("hello"), {"tags": ["V"]})]) - pickle.dumps(nlp) - nlp("hello") - pickle.dumps(nlp) diff --git a/spacy/tests/regression/test_issue7001-8000.py b/spacy/tests/regression/test_issue7001-8000.py deleted file mode 100644 index 1164e85b9..000000000 --- a/spacy/tests/regression/test_issue7001-8000.py +++ /dev/null @@ -1,288 +0,0 @@ -import pytest -from spacy.cli.evaluate import print_textcats_auc_per_cat, print_prf_per_type -from spacy.lang.en import English -from spacy.training import Example -from spacy.tokens.doc import Doc -from spacy.vocab import Vocab -from spacy.kb import KnowledgeBase -from spacy.pipeline._parser_internals.arc_eager import ArcEager -from spacy.util import load_config_from_str, load_config -from spacy.cli.init_config import fill_config -from thinc.api import Config -from wasabi import msg - -from ..util import make_tempdir - - -@pytest.mark.issue(7019) -def test_issue7019(): - scores = {"LABEL_A": 0.39829102, "LABEL_B": 0.938298329382, "LABEL_C": None} - print_textcats_auc_per_cat(msg, scores) - scores = { - "LABEL_A": {"p": 0.3420302, "r": 0.3929020, "f": 0.49823928932}, - "LABEL_B": {"p": None, "r": None, "f": None}, - } - print_prf_per_type(msg, scores, name="foo", type="bar") - - -CONFIG_7029 = """ -[nlp] -lang = "en" -pipeline = ["tok2vec", "tagger"] - -[components] - -[components.tok2vec] -factory = "tok2vec" - -[components.tok2vec.model] -@architectures = "spacy.Tok2Vec.v1" - -[components.tok2vec.model.embed] -@architectures = "spacy.MultiHashEmbed.v1" -width = ${components.tok2vec.model.encode:width} -attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] -rows = [5000,2500,2500,2500] -include_static_vectors = false - -[components.tok2vec.model.encode] -@architectures = "spacy.MaxoutWindowEncoder.v1" -width = 96 -depth = 4 
-window_size = 1 -maxout_pieces = 3 - -[components.tagger] -factory = "tagger" - -[components.tagger.model] -@architectures = "spacy.Tagger.v1" -nO = null - -[components.tagger.model.tok2vec] -@architectures = "spacy.Tok2VecListener.v1" -width = ${components.tok2vec.model.encode:width} -upstream = "*" -""" - - -@pytest.mark.issue(7029) -def test_issue7029(): - """Test that an empty document doesn't mess up an entire batch.""" - TRAIN_DATA = [ - ("I like green eggs", {"tags": ["N", "V", "J", "N"]}), - ("Eat blue ham", {"tags": ["V", "J", "N"]}), - ] - nlp = English.from_config(load_config_from_str(CONFIG_7029)) - train_examples = [] - for t in TRAIN_DATA: - train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) - optimizer = nlp.initialize(get_examples=lambda: train_examples) - for i in range(50): - losses = {} - nlp.update(train_examples, sgd=optimizer, losses=losses) - texts = ["first", "second", "third", "fourth", "and", "then", "some", ""] - docs1 = list(nlp.pipe(texts, batch_size=1)) - docs2 = list(nlp.pipe(texts, batch_size=4)) - assert [doc[0].tag_ for doc in docs1[:-1]] == [doc[0].tag_ for doc in docs2[:-1]] - - -@pytest.mark.issue(7055) -def test_issue7055(): - """Test that fill-config doesn't turn sourced components into factories.""" - source_cfg = { - "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger"]}, - "components": { - "tok2vec": {"factory": "tok2vec"}, - "tagger": {"factory": "tagger"}, - }, - } - source_nlp = English.from_config(source_cfg) - with make_tempdir() as dir_path: - # We need to create a loadable source pipeline - source_path = dir_path / "test_model" - source_nlp.to_disk(source_path) - base_cfg = { - "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger", "ner"]}, - "components": { - "tok2vec": {"source": str(source_path)}, - "tagger": {"source": str(source_path)}, - "ner": {"factory": "ner"}, - }, - } - base_cfg = Config(base_cfg) - base_path = dir_path / "base.cfg" - base_cfg.to_disk(base_path) - output_path = 
dir_path / "config.cfg" - fill_config(output_path, base_path, silent=True) - filled_cfg = load_config(output_path) - assert filled_cfg["components"]["tok2vec"]["source"] == str(source_path) - assert filled_cfg["components"]["tagger"]["source"] == str(source_path) - assert filled_cfg["components"]["ner"]["factory"] == "ner" - assert "model" in filled_cfg["components"]["ner"] - - -@pytest.mark.issue(7056) -def test_issue7056(): - """Test that the Unshift transition works properly, and doesn't cause - sentence segmentation errors.""" - vocab = Vocab() - ae = ArcEager( - vocab.strings, ArcEager.get_actions(left_labels=["amod"], right_labels=["pobj"]) - ) - doc = Doc(vocab, words="Severe pain , after trauma".split()) - state = ae.init_batch([doc])[0] - ae.apply_transition(state, "S") - ae.apply_transition(state, "L-amod") - ae.apply_transition(state, "S") - ae.apply_transition(state, "S") - ae.apply_transition(state, "S") - ae.apply_transition(state, "R-pobj") - ae.apply_transition(state, "D") - ae.apply_transition(state, "D") - ae.apply_transition(state, "D") - assert not state.eol() - - -def test_partial_links(): - # Test that having some entities on the doc without gold links, doesn't crash - TRAIN_DATA = [ - ( - "Russ Cochran his reprints include EC Comics.", - { - "links": {(0, 12): {"Q2146908": 1.0}}, - "entities": [(0, 12, "PERSON")], - "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0], - }, - ) - ] - nlp = English() - vector_length = 3 - train_examples = [] - for text, annotation in TRAIN_DATA: - doc = nlp(text) - train_examples.append(Example.from_dict(doc, annotation)) - - def create_kb(vocab): - # create artificial KB - mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) - mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) - mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9]) - return mykb - - # Create and train the Entity Linker - entity_linker = nlp.add_pipe("entity_linker", last=True) - entity_linker.set_kb(create_kb) - optimizer = 
nlp.initialize(get_examples=lambda: train_examples) - for i in range(2): - losses = {} - nlp.update(train_examples, sgd=optimizer, losses=losses) - - # adding additional components that are required for the entity_linker - nlp.add_pipe("sentencizer", first=True) - patterns = [ - {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}, - {"label": "ORG", "pattern": [{"LOWER": "ec"}, {"LOWER": "comics"}]}, - ] - ruler = nlp.add_pipe("entity_ruler", before="entity_linker") - ruler.add_patterns(patterns) - - # this will run the pipeline on the examples and shouldn't crash - results = nlp.evaluate(train_examples) - assert "PERSON" in results["ents_per_type"] - assert "PERSON" in results["nel_f_per_type"] - assert "ORG" in results["ents_per_type"] - assert "ORG" not in results["nel_f_per_type"] - - -@pytest.mark.issue(7065) -def test_issue7065(): - text = "Kathleen Battle sang in Mahler 's Symphony No. 8 at the Cincinnati Symphony Orchestra 's May Festival." - nlp = English() - nlp.add_pipe("sentencizer") - ruler = nlp.add_pipe("entity_ruler") - patterns = [ - { - "label": "THING", - "pattern": [ - {"LOWER": "symphony"}, - {"LOWER": "no"}, - {"LOWER": "."}, - {"LOWER": "8"}, - ], - } - ] - ruler.add_patterns(patterns) - - doc = nlp(text) - sentences = [s for s in doc.sents] - assert len(sentences) == 2 - sent0 = sentences[0] - ent = doc.ents[0] - assert ent.start < sent0.end < ent.end - assert sentences.index(ent.sent) == 0 - - -@pytest.mark.issue(7065) -def test_issue7065_b(): - # Test that the NEL doesn't crash when an entity crosses a sentence boundary - nlp = English() - vector_length = 3 - nlp.add_pipe("sentencizer") - text = "Mahler 's Symphony No. 8 was beautiful." 
- entities = [(0, 6, "PERSON"), (10, 24, "WORK")] - links = { - (0, 6): {"Q7304": 1.0, "Q270853": 0.0}, - (10, 24): {"Q7304": 0.0, "Q270853": 1.0}, - } - sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0] - doc = nlp(text) - example = Example.from_dict( - doc, {"entities": entities, "links": links, "sent_starts": sent_starts} - ) - train_examples = [example] - - def create_kb(vocab): - # create artificial KB - mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) - mykb.add_entity(entity="Q270853", freq=12, entity_vector=[9, 1, -7]) - mykb.add_alias( - alias="No. 8", - entities=["Q270853"], - probabilities=[1.0], - ) - mykb.add_entity(entity="Q7304", freq=12, entity_vector=[6, -4, 3]) - mykb.add_alias( - alias="Mahler", - entities=["Q7304"], - probabilities=[1.0], - ) - return mykb - - # Create the Entity Linker component and add it to the pipeline - entity_linker = nlp.add_pipe("entity_linker", last=True) - entity_linker.set_kb(create_kb) - # train the NEL pipe - optimizer = nlp.initialize(get_examples=lambda: train_examples) - for i in range(2): - losses = {} - nlp.update(train_examples, sgd=optimizer, losses=losses) - - # Add a custom rule-based component to mimick NER - patterns = [ - {"label": "PERSON", "pattern": [{"LOWER": "mahler"}]}, - { - "label": "WORK", - "pattern": [ - {"LOWER": "symphony"}, - {"LOWER": "no"}, - {"LOWER": "."}, - {"LOWER": "8"}, - ], - }, - ] - ruler = nlp.add_pipe("entity_ruler", before="entity_linker") - ruler.add_patterns(patterns) - # test the trained model - this should not throw E148 - doc = nlp(text) - assert doc diff --git a/spacy/tests/regression/test_issue7716.py b/spacy/tests/regression/test_issue7716.py deleted file mode 100644 index d9b3967ff..000000000 --- a/spacy/tests/regression/test_issue7716.py +++ /dev/null @@ -1,55 +0,0 @@ -import pytest -from thinc.api import Adam -from spacy.attrs import NORM -from spacy.vocab import Vocab -from spacy import registry -from spacy.training import Example -from 
spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL -from spacy.tokens import Doc -from spacy.pipeline import DependencyParser - - -@pytest.fixture -def vocab(): - return Vocab(lex_attr_getters={NORM: lambda s: s}) - - -def _parser_example(parser): - doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) - gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]} - return Example.from_dict(doc, gold) - - -@pytest.fixture -def parser(vocab): - vocab.strings.add("ROOT") - cfg = {"model": DEFAULT_PARSER_MODEL} - model = registry.resolve(cfg, validate=True)["model"] - parser = DependencyParser(vocab, model) - parser.cfg["token_vector_width"] = 4 - parser.cfg["hidden_width"] = 32 - # parser.add_label('right') - parser.add_label("left") - parser.initialize(lambda: [_parser_example(parser)]) - sgd = Adam(0.001) - - for i in range(10): - losses = {} - doc = Doc(vocab, words=["a", "b", "c", "d"]) - example = Example.from_dict( - doc, {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]} - ) - parser.update([example], sgd=sgd, losses=losses) - return parser - - -@pytest.mark.issue(7716) -@pytest.mark.xfail(reason="Not fixed yet") -def test_partial_annotation(parser): - doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) - doc[2].is_sent_start = False - # Note that if the following line is used, then doc[2].is_sent_start == False - # doc[3].is_sent_start = False - - doc = parser(doc) - assert doc[2].is_sent_start == False diff --git a/spacy/tests/regression/test_issue8168.py b/spacy/tests/regression/test_issue8168.py deleted file mode 100644 index e3f3b5cfa..000000000 --- a/spacy/tests/regression/test_issue8168.py +++ /dev/null @@ -1,24 +0,0 @@ -import pytest -from spacy.lang.en import English - - -@pytest.mark.issue(8168) -def test_issue8168(): - nlp = English() - ruler = nlp.add_pipe("entity_ruler") - patterns = [ - {"label": "ORG", "pattern": "Apple"}, - { - "label": "GPE", - "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], - "id": 
"san-francisco", - }, - { - "label": "GPE", - "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], - "id": "san-francisco", - }, - ] - ruler.add_patterns(patterns) - - assert ruler._ent_ids == {8043148519967183733: ("GPE", "san-francisco")} diff --git a/spacy/tests/regression/test_issue8190.py b/spacy/tests/regression/test_issue8190.py deleted file mode 100644 index 0b2f2824b..000000000 --- a/spacy/tests/regression/test_issue8190.py +++ /dev/null @@ -1,24 +0,0 @@ -import pytest - -import spacy -from spacy.lang.en import English -from ..util import make_tempdir - - -@pytest.mark.issue(8190) -def test_issue8190(): - """Test that config overrides are not lost after load is complete.""" - source_cfg = { - "nlp": { - "lang": "en", - }, - "custom": {"key": "value"}, - } - source_nlp = English.from_config(source_cfg) - with make_tempdir() as dir_path: - # We need to create a loadable source pipeline - source_path = dir_path / "test_model" - source_nlp.to_disk(source_path) - nlp = spacy.load(source_path, config={"custom": {"key": "updated_value"}}) - - assert nlp.config["custom"]["key"] == "updated_value" diff --git a/spacy/tests/regression/test_issue8216.py b/spacy/tests/regression/test_issue8216.py deleted file mode 100644 index 0370074fe..000000000 --- a/spacy/tests/regression/test_issue8216.py +++ /dev/null @@ -1,34 +0,0 @@ -import pytest - -from spacy import registry -from spacy.language import Language - - -@pytest.fixture -def nlp(): - return Language() - - -@pytest.fixture -@registry.misc("entity_ruler_patterns") -def patterns(): - return [ - {"label": "HELLO", "pattern": "hello world"}, - {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, - {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, - {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, - {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, - {"label": "TECH_ORG", "pattern": "Microsoft", "id": "a2"}, - ] - - -@pytest.mark.issue(8216) -def test_entity_ruler_fix8216(nlp, patterns): 
- """Test that patterns don't get added excessively.""" - ruler = nlp.add_pipe("entity_ruler", config={"validate": True}) - ruler.add_patterns(patterns) - pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values()) - assert pattern_count > 0 - ruler.add_patterns([]) - after_count = sum(len(mm) for mm in ruler.matcher._patterns.values()) - assert after_count == pattern_count diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 102989705..1d50fd1d1 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -1,20 +1,17 @@ import pytest -from thinc.api import Config, ConfigValidationError -import spacy -from spacy.lang.en import English -from spacy.lang.de import German -from spacy.language import Language, DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH -from spacy.util import ( - registry, - load_model_from_config, - load_config, - load_config_from_str, -) -from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model -from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder -from spacy.schemas import ConfigSchema, ConfigSchemaPretrain from catalogue import RegistryError +from thinc.api import Config, ConfigValidationError +import spacy +from spacy.lang.de import German +from spacy.lang.en import English +from spacy.language import DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH +from spacy.language import Language +from spacy.ml.models import MaxoutWindowEncoder, MultiHashEmbed +from spacy.ml.models import build_tb_parser_model, build_Tok2Vec_model +from spacy.schemas import ConfigSchema, ConfigSchemaPretrain +from spacy.util import load_config, load_config_from_str +from spacy.util import load_model_from_config, registry from ..util import make_tempdir @@ -187,6 +184,25 @@ def my_parser(): return parser +@pytest.mark.issue(8190) +def test_issue8190(): + """Test that config overrides are not lost after load is complete.""" + 
source_cfg = { + "nlp": { + "lang": "en", + }, + "custom": {"key": "value"}, + } + source_nlp = English.from_config(source_cfg) + with make_tempdir() as dir_path: + # We need to create a loadable source pipeline + source_path = dir_path / "test_model" + source_nlp.to_disk(source_path) + nlp = spacy.load(source_path, config={"custom": {"key": "updated_value"}}) + + assert nlp.config["custom"]["key"] == "updated_value" + + def test_create_nlp_from_config(): config = Config().from_str(nlp_config_string) with pytest.raises(ConfigValidationError): diff --git a/spacy/tests/serialize/test_serialize_doc.py b/spacy/tests/serialize/test_serialize_doc.py index 23afaf26c..15bf67bfd 100644 --- a/spacy/tests/serialize/test_serialize_doc.py +++ b/spacy/tests/serialize/test_serialize_doc.py @@ -1,13 +1,168 @@ -import pytest -from spacy.tokens.underscore import Underscore +import copy +import pickle -import spacy +import numpy +import pytest + +from spacy.attrs import DEP, HEAD from spacy.lang.en import English -from spacy.tokens import Doc, DocBin +from spacy.language import Language +from spacy.matcher import Matcher, PhraseMatcher +from spacy.tokens import Doc +from spacy.vectors import Vectors +from spacy.vocab import Vocab from ..util import make_tempdir +@pytest.mark.issue(1727) +def test_issue1727(): + """Test that models with no pretrained vectors can be deserialized + correctly after vectors are added.""" + nlp = Language(Vocab()) + data = numpy.ones((3, 300), dtype="f") + vectors = Vectors(data=data, keys=["I", "am", "Matt"]) + tagger = nlp.create_pipe("tagger") + tagger.add_label("PRP") + assert tagger.cfg.get("pretrained_dims", 0) == 0 + tagger.vocab.vectors = vectors + with make_tempdir() as path: + tagger.to_disk(path) + tagger = nlp.create_pipe("tagger").from_disk(path) + assert tagger.cfg.get("pretrained_dims", 0) == 0 + + +@pytest.mark.issue(1799) +def test_issue1799(): + """Test sentence boundaries are deserialized correctly, even for + non-projective 
sentences.""" + heads_deps = numpy.asarray( + [ + [1, 397], + [4, 436], + [2, 426], + [1, 402], + [0, 8206900633647566924], + [18446744073709551615, 440], + [18446744073709551614, 442], + ], + dtype="uint64", + ) + doc = Doc(Vocab(), words="Just what I was looking for .".split()) + doc.vocab.strings.add("ROOT") + doc = doc.from_array([HEAD, DEP], heads_deps) + assert len(list(doc.sents)) == 1 + + +@pytest.mark.issue(1834) +def test_issue1834(): + """Test that sentence boundaries & parse/tag flags are not lost + during serialization.""" + words = ["This", "is", "a", "first", "sentence", ".", "And", "another", "one"] + doc = Doc(Vocab(), words=words) + doc[6].is_sent_start = True + new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes()) + assert new_doc[6].sent_start + assert not new_doc.has_annotation("DEP") + assert not new_doc.has_annotation("TAG") + doc = Doc( + Vocab(), + words=words, + tags=["TAG"] * len(words), + heads=[0, 0, 0, 0, 0, 0, 6, 6, 6], + deps=["dep"] * len(words), + ) + new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes()) + assert new_doc[6].sent_start + assert new_doc.has_annotation("DEP") + assert new_doc.has_annotation("TAG") + + +@pytest.mark.issue(1883) +def test_issue1883(): + matcher = Matcher(Vocab()) + matcher.add("pat1", [[{"orth": "hello"}]]) + doc = Doc(matcher.vocab, words=["hello"]) + assert len(matcher(doc)) == 1 + new_matcher = copy.deepcopy(matcher) + new_doc = Doc(new_matcher.vocab, words=["hello"]) + assert len(new_matcher(new_doc)) == 1 + + +@pytest.mark.issue(2564) +def test_issue2564(): + """Test the tagger sets has_annotation("TAG") correctly when used via Language.pipe.""" + nlp = Language() + tagger = nlp.add_pipe("tagger") + tagger.add_label("A") + nlp.initialize() + doc = nlp("hello world") + assert doc.has_annotation("TAG") + docs = nlp.pipe(["hello", "world"]) + piped_doc = next(docs) + assert piped_doc.has_annotation("TAG") + + +@pytest.mark.issue(3248) +def test_issue3248_2(): + """Test that the PhraseMatcher can be 
pickled correctly.""" + nlp = English() + matcher = PhraseMatcher(nlp.vocab) + matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")]) + matcher.add("TEST2", [nlp("d")]) + data = pickle.dumps(matcher) + new_matcher = pickle.loads(data) + assert len(new_matcher) == len(matcher) + + +@pytest.mark.issue(3289) +def test_issue3289(): + """Test that Language.to_bytes handles serializing a pipeline component + with an uninitialized model.""" + nlp = English() + nlp.add_pipe("textcat") + bytes_data = nlp.to_bytes() + new_nlp = English() + new_nlp.add_pipe("textcat") + new_nlp.from_bytes(bytes_data) + + +@pytest.mark.issue(3468) +def test_issue3468(): + """Test that sentence boundaries are set correctly so Doc.has_annotation("SENT_START") can + be restored after serialization.""" + nlp = English() + nlp.add_pipe("sentencizer") + doc = nlp("Hello world") + assert doc[0].is_sent_start + assert doc.has_annotation("SENT_START") + assert len(list(doc.sents)) == 1 + doc_bytes = doc.to_bytes() + new_doc = Doc(nlp.vocab).from_bytes(doc_bytes) + assert new_doc[0].is_sent_start + assert new_doc.has_annotation("SENT_START") + assert len(list(new_doc.sents)) == 1 + + +@pytest.mark.issue(3959) +def test_issue3959(): + """Ensure that a modified pos attribute is serialized correctly.""" + nlp = English() + doc = nlp( + "displaCy uses JavaScript, SVG and CSS to show you how computers understand language" + ) + assert doc[0].pos_ == "" + doc[0].pos_ = "NOUN" + assert doc[0].pos_ == "NOUN" + # usually this is already True when starting from proper models instead of blank English + with make_tempdir() as tmp_dir: + file_path = tmp_dir / "my_doc" + doc.to_disk(file_path) + doc2 = nlp("") + doc2.from_disk(file_path) + assert doc2[0].pos_ == "NOUN" + + def test_serialize_empty_doc(en_vocab): doc = Doc(en_vocab) data = doc.to_bytes() @@ -61,69 +216,3 @@ def test_serialize_doc_span_groups(en_vocab): doc.spans["content"] = [doc[0:2]] new_doc = Doc(en_vocab).from_bytes(doc.to_bytes()) assert 
len(new_doc.spans["content"]) == 1 - - -def test_serialize_doc_bin(): - doc_bin = DocBin( - attrs=["LEMMA", "ENT_IOB", "ENT_TYPE", "NORM", "ENT_ID"], store_user_data=True - ) - texts = ["Some text", "Lots of texts...", "..."] - cats = {"A": 0.5} - nlp = English() - for doc in nlp.pipe(texts): - doc.cats = cats - doc.spans["start"] = [doc[0:2]] - doc[0].norm_ = "UNUSUAL_TOKEN_NORM" - doc[0].ent_id_ = "UNUSUAL_TOKEN_ENT_ID" - doc_bin.add(doc) - bytes_data = doc_bin.to_bytes() - - # Deserialize later, e.g. in a new process - nlp = spacy.blank("en") - doc_bin = DocBin().from_bytes(bytes_data) - reloaded_docs = list(doc_bin.get_docs(nlp.vocab)) - for i, doc in enumerate(reloaded_docs): - assert doc.text == texts[i] - assert doc.cats == cats - assert len(doc.spans) == 1 - assert doc[0].norm_ == "UNUSUAL_TOKEN_NORM" - assert doc[0].ent_id_ == "UNUSUAL_TOKEN_ENT_ID" - - -def test_serialize_doc_bin_unknown_spaces(en_vocab): - doc1 = Doc(en_vocab, words=["that", "'s"]) - assert doc1.has_unknown_spaces - assert doc1.text == "that 's " - doc2 = Doc(en_vocab, words=["that", "'s"], spaces=[False, False]) - assert not doc2.has_unknown_spaces - assert doc2.text == "that's" - - doc_bin = DocBin().from_bytes(DocBin(docs=[doc1, doc2]).to_bytes()) - re_doc1, re_doc2 = doc_bin.get_docs(en_vocab) - assert re_doc1.has_unknown_spaces - assert re_doc1.text == "that 's " - assert not re_doc2.has_unknown_spaces - assert re_doc2.text == "that's" - - -@pytest.mark.parametrize( - "writer_flag,reader_flag,reader_value", - [ - (True, True, "bar"), - (True, False, "bar"), - (False, True, "nothing"), - (False, False, "nothing"), - ], -) -def test_serialize_custom_extension(en_vocab, writer_flag, reader_flag, reader_value): - """Test that custom extensions are correctly serialized in DocBin.""" - Doc.set_extension("foo", default="nothing") - doc = Doc(en_vocab, words=["hello", "world"]) - doc._.foo = "bar" - doc_bin_1 = DocBin(store_user_data=writer_flag) - doc_bin_1.add(doc) - doc_bin_bytes = 
doc_bin_1.to_bytes() - doc_bin_2 = DocBin(store_user_data=reader_flag).from_bytes(doc_bin_bytes) - doc_2 = list(doc_bin_2.get_docs(en_vocab))[0] - assert doc_2._.foo == reader_value - Underscore.doc_extensions = {} diff --git a/spacy/tests/serialize/test_serialize_docbin.py b/spacy/tests/serialize/test_serialize_docbin.py new file mode 100644 index 000000000..9f8e5e06b --- /dev/null +++ b/spacy/tests/serialize/test_serialize_docbin.py @@ -0,0 +1,106 @@ +import pytest + +import spacy +from spacy.lang.en import English +from spacy.tokens import Doc, DocBin +from spacy.tokens.underscore import Underscore + + +@pytest.mark.issue(4367) +def test_issue4367(): + """Test that docbin init goes well""" + DocBin() + DocBin(attrs=["LEMMA"]) + DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"]) + + +@pytest.mark.issue(4528) +def test_issue4528(en_vocab): + """Test that user_data is correctly serialized in DocBin.""" + doc = Doc(en_vocab, words=["hello", "world"]) + doc.user_data["foo"] = "bar" + # This is how extension attribute values are stored in the user data + doc.user_data[("._.", "foo", None, None)] = "bar" + doc_bin = DocBin(store_user_data=True) + doc_bin.add(doc) + doc_bin_bytes = doc_bin.to_bytes() + new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes) + new_doc = list(new_doc_bin.get_docs(en_vocab))[0] + assert new_doc.user_data["foo"] == "bar" + assert new_doc.user_data[("._.", "foo", None, None)] == "bar" + + +@pytest.mark.issue(5141) +def test_issue5141(en_vocab): + """Ensure an empty DocBin does not crash on serialization""" + doc_bin = DocBin(attrs=["DEP", "HEAD"]) + assert list(doc_bin.get_docs(en_vocab)) == [] + doc_bin_bytes = doc_bin.to_bytes() + doc_bin_2 = DocBin().from_bytes(doc_bin_bytes) + assert list(doc_bin_2.get_docs(en_vocab)) == [] + + +def test_serialize_doc_bin(): + doc_bin = DocBin( + attrs=["LEMMA", "ENT_IOB", "ENT_TYPE", "NORM", "ENT_ID"], store_user_data=True + ) + texts = ["Some text", "Lots of texts...", "..."] + cats = {"A": 
0.5} + nlp = English() + for doc in nlp.pipe(texts): + doc.cats = cats + doc.spans["start"] = [doc[0:2]] + doc[0].norm_ = "UNUSUAL_TOKEN_NORM" + doc[0].ent_id_ = "UNUSUAL_TOKEN_ENT_ID" + doc_bin.add(doc) + bytes_data = doc_bin.to_bytes() + + # Deserialize later, e.g. in a new process + nlp = spacy.blank("en") + doc_bin = DocBin().from_bytes(bytes_data) + reloaded_docs = list(doc_bin.get_docs(nlp.vocab)) + for i, doc in enumerate(reloaded_docs): + assert doc.text == texts[i] + assert doc.cats == cats + assert len(doc.spans) == 1 + assert doc[0].norm_ == "UNUSUAL_TOKEN_NORM" + assert doc[0].ent_id_ == "UNUSUAL_TOKEN_ENT_ID" + + +def test_serialize_doc_bin_unknown_spaces(en_vocab): + doc1 = Doc(en_vocab, words=["that", "'s"]) + assert doc1.has_unknown_spaces + assert doc1.text == "that 's " + doc2 = Doc(en_vocab, words=["that", "'s"], spaces=[False, False]) + assert not doc2.has_unknown_spaces + assert doc2.text == "that's" + + doc_bin = DocBin().from_bytes(DocBin(docs=[doc1, doc2]).to_bytes()) + re_doc1, re_doc2 = doc_bin.get_docs(en_vocab) + assert re_doc1.has_unknown_spaces + assert re_doc1.text == "that 's " + assert not re_doc2.has_unknown_spaces + assert re_doc2.text == "that's" + + +@pytest.mark.parametrize( + "writer_flag,reader_flag,reader_value", + [ + (True, True, "bar"), + (True, False, "bar"), + (False, True, "nothing"), + (False, False, "nothing"), + ], +) +def test_serialize_custom_extension(en_vocab, writer_flag, reader_flag, reader_value): + """Test that custom extensions are correctly serialized in DocBin.""" + Doc.set_extension("foo", default="nothing") + doc = Doc(en_vocab, words=["hello", "world"]) + doc._.foo = "bar" + doc_bin_1 = DocBin(store_user_data=writer_flag) + doc_bin_1.add(doc) + doc_bin_bytes = doc_bin_1.to_bytes() + doc_bin_2 = DocBin(store_user_data=reader_flag).from_bytes(doc_bin_bytes) + doc_2 = list(doc_bin_2.get_docs(en_vocab))[0] + assert doc_2._.foo == reader_value + Underscore.doc_extensions = {} diff --git 
a/spacy/tests/serialize/test_serialize_language.py b/spacy/tests/serialize/test_serialize_language.py index 05529f9d1..6e7fa0e4e 100644 --- a/spacy/tests/serialize/test_serialize_language.py +++ b/spacy/tests/serialize/test_serialize_language.py @@ -1,8 +1,14 @@ -import pytest import re +import pickle + +import pytest from spacy.language import Language +from spacy.lang.it import Italian +from spacy.lang.en import English from spacy.tokenizer import Tokenizer +from spacy.training import Example +from spacy.util import load_config_from_str from ..util import make_tempdir @@ -21,6 +27,71 @@ def meta_data(): } +@pytest.mark.issue(2482) +def test_issue2482(): + """Test we can serialize and deserialize a blank NER or parser model.""" + nlp = Italian() + nlp.add_pipe("ner") + b = nlp.to_bytes() + Italian().from_bytes(b) + + +CONFIG_ISSUE_6950 = """ +[nlp] +lang = "en" +pipeline = ["tok2vec", "tagger"] + +[components] + +[components.tok2vec] +factory = "tok2vec" + +[components.tok2vec.model] +@architectures = "spacy.Tok2Vec.v1" + +[components.tok2vec.model.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = ${components.tok2vec.model.encode:width} +attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] +rows = [5000,2500,2500,2500] +include_static_vectors = false + +[components.tok2vec.model.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +width = 96 +depth = 4 +window_size = 1 +maxout_pieces = 3 + +[components.ner] +factory = "ner" + +[components.tagger] +factory = "tagger" + +[components.tagger.model] +@architectures = "spacy.Tagger.v1" +nO = null + +[components.tagger.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode:width} +upstream = "*" +""" + + +@pytest.mark.issue(6950) +def test_issue6950(): + """Test that the nlp object with initialized tok2vec with listeners pickles + correctly (and doesn't have lambdas). 
+ """ + nlp = English.from_config(load_config_from_str(CONFIG_ISSUE_6950)) + nlp.initialize(lambda: [Example.from_dict(nlp.make_doc("hello"), {"tags": ["V"]})]) + pickle.dumps(nlp) + nlp("hello") + pickle.dumps(nlp) + + def test_serialize_language_meta_disk(meta_data): language = Language(meta=meta_data) with make_tempdir() as d: diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index eebf72638..9fcf18e2d 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -1,18 +1,25 @@ +import pickle + import pytest -from spacy import registry, Vocab, load -from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer -from spacy.pipeline import TextCategorizer, SentenceRecognizer, TrainablePipe +import srsly +from thinc.api import Linear + +import spacy +from spacy import Vocab, load, registry +from spacy.lang.en import English +from spacy.language import Language +from spacy.pipeline import DependencyParser, EntityRecognizer, EntityRuler +from spacy.pipeline import SentenceRecognizer, Tagger, TextCategorizer +from spacy.pipeline import TrainablePipe from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL +from spacy.pipeline.senter import DEFAULT_SENTER_MODEL from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL from spacy.pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL -from spacy.pipeline.senter import DEFAULT_SENTER_MODEL -from spacy.lang.en import English -from thinc.api import Linear -import spacy +from spacy.util import ensure_path, load_model +from spacy.tokens import Span from ..util import make_tempdir - test_parsers = [DependencyParser, EntityRecognizer] @@ -58,6 +65,181 @@ def taggers(en_vocab): return tagger1, tagger2 +@pytest.mark.issue(3456) +def test_issue3456(): + # this crashed because of a padding error in layer.ops.unflatten in thinc + nlp = English() + tagger = nlp.add_pipe("tagger") + tagger.add_label("A") + 
nlp.initialize() + list(nlp.pipe(["hi", ""])) + + +@pytest.mark.issue(3526) +def test_issue_3526_1(en_vocab): + patterns = [ + {"label": "HELLO", "pattern": "hello world"}, + {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, + {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, + {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, + {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, + ] + nlp = Language(vocab=en_vocab) + ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) + ruler_bytes = ruler.to_bytes() + assert len(ruler) == len(patterns) + assert len(ruler.labels) == 4 + assert ruler.overwrite + new_ruler = EntityRuler(nlp) + new_ruler = new_ruler.from_bytes(ruler_bytes) + assert len(new_ruler) == len(ruler) + assert len(new_ruler.labels) == 4 + assert new_ruler.overwrite == ruler.overwrite + assert new_ruler.ent_id_sep == ruler.ent_id_sep + + +@pytest.mark.issue(3526) +def test_issue_3526_2(en_vocab): + patterns = [ + {"label": "HELLO", "pattern": "hello world"}, + {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, + {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, + {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, + {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, + ] + nlp = Language(vocab=en_vocab) + ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) + bytes_old_style = srsly.msgpack_dumps(ruler.patterns) + new_ruler = EntityRuler(nlp) + new_ruler = new_ruler.from_bytes(bytes_old_style) + assert len(new_ruler) == len(ruler) + for pattern in ruler.patterns: + assert pattern in new_ruler.patterns + assert new_ruler.overwrite is not ruler.overwrite + + +@pytest.mark.issue(3526) +def test_issue_3526_3(en_vocab): + patterns = [ + {"label": "HELLO", "pattern": "hello world"}, + {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, + {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, + {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, + 
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, + ] + nlp = Language(vocab=en_vocab) + ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) + with make_tempdir() as tmpdir: + out_file = tmpdir / "entity_ruler" + srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns) + new_ruler = EntityRuler(nlp).from_disk(out_file) + for pattern in ruler.patterns: + assert pattern in new_ruler.patterns + assert len(new_ruler) == len(ruler) + assert new_ruler.overwrite is not ruler.overwrite + + +@pytest.mark.issue(3526) +def test_issue_3526_4(en_vocab): + nlp = Language(vocab=en_vocab) + patterns = [{"label": "ORG", "pattern": "Apple"}] + config = {"overwrite_ents": True} + ruler = nlp.add_pipe("entity_ruler", config=config) + ruler.add_patterns(patterns) + with make_tempdir() as tmpdir: + nlp.to_disk(tmpdir) + ruler = nlp.get_pipe("entity_ruler") + assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] + assert ruler.overwrite is True + nlp2 = load(tmpdir) + new_ruler = nlp2.get_pipe("entity_ruler") + assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] + assert new_ruler.overwrite is True + + +@pytest.mark.issue(4042) +def test_issue4042(): + """Test that serialization of an EntityRuler before NER works fine.""" + nlp = English() + # add ner pipe + ner = nlp.add_pipe("ner") + ner.add_label("SOME_LABEL") + nlp.initialize() + # Add entity ruler + patterns = [ + {"label": "MY_ORG", "pattern": "Apple"}, + {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]}, + ] + # works fine with "after" + ruler = nlp.add_pipe("entity_ruler", before="ner") + ruler.add_patterns(patterns) + doc1 = nlp("What do you think about Apple ?") + assert doc1.ents[0].label_ == "MY_ORG" + + with make_tempdir() as d: + output_dir = ensure_path(d) + if not output_dir.exists(): + output_dir.mkdir() + nlp.to_disk(output_dir) + nlp2 = load_model(output_dir) + doc2 = nlp2("What do you think about Apple ?") + assert doc2.ents[0].label_ == 
"MY_ORG" + + +@pytest.mark.issue(4042) +def test_issue4042_bug2(): + """ + Test that serialization of an NER works fine when new labels were added. + This is the second bug of two bugs underlying the issue 4042. + """ + nlp1 = English() + # add ner pipe + ner1 = nlp1.add_pipe("ner") + ner1.add_label("SOME_LABEL") + nlp1.initialize() + # add a new label to the doc + doc1 = nlp1("What do you think about Apple ?") + assert len(ner1.labels) == 1 + assert "SOME_LABEL" in ner1.labels + apple_ent = Span(doc1, 5, 6, label="MY_ORG") + doc1.ents = list(doc1.ents) + [apple_ent] + # Add the label explicitly. Previously we didn't require this. + ner1.add_label("MY_ORG") + ner1(doc1) + assert len(ner1.labels) == 2 + assert "SOME_LABEL" in ner1.labels + assert "MY_ORG" in ner1.labels + with make_tempdir() as d: + # assert IO goes fine + output_dir = ensure_path(d) + if not output_dir.exists(): + output_dir.mkdir() + ner1.to_disk(output_dir) + config = {} + ner2 = nlp1.create_pipe("ner", config=config) + ner2.from_disk(output_dir) + assert len(ner2.labels) == 2 + + +@pytest.mark.issue(4725) +def test_issue4725_1(): + """Ensure the pickling of the NER goes well""" + vocab = Vocab(vectors_name="test_vocab_add_vector") + nlp = English(vocab=vocab) + config = { + "update_with_oracle_cut_size": 111, + } + ner = nlp.create_pipe("ner", config=config) + with make_tempdir() as tmp_path: + with (tmp_path / "ner.pkl").open("wb") as file_: + pickle.dump(ner, file_) + assert ner.cfg["update_with_oracle_cut_size"] == 111 + + with (tmp_path / "ner.pkl").open("rb") as file_: + ner2 = pickle.load(file_) + assert ner2.cfg["update_with_oracle_cut_size"] == 111 + + @pytest.mark.parametrize("Parser", test_parsers) def test_serialize_parser_roundtrip_bytes(en_vocab, Parser): cfg = {"model": DEFAULT_PARSER_MODEL} diff --git a/spacy/tests/serialize/test_serialize_tokenizer.py b/spacy/tests/serialize/test_serialize_tokenizer.py index a9450cd04..e271f7707 100644 --- 
a/spacy/tests/serialize/test_serialize_tokenizer.py +++ b/spacy/tests/serialize/test_serialize_tokenizer.py @@ -1,9 +1,16 @@ -import pytest +import pickle import re -from spacy.util import get_lang_class -from spacy.tokenizer import Tokenizer -from ..util import make_tempdir, assert_packed_msg_equal +import pytest + +from spacy.attrs import ENT_IOB, ENT_TYPE +from spacy.lang.en import English +from spacy.tokenizer import Tokenizer +from spacy.tokens import Doc +from spacy.util import compile_infix_regex, compile_prefix_regex +from spacy.util import compile_suffix_regex, get_lang_class, load_model + +from ..util import assert_packed_msg_equal, make_tempdir def load_tokenizer(b): @@ -12,6 +19,79 @@ def load_tokenizer(b): return tok +@pytest.mark.issue(2833) +def test_issue2833(en_vocab): + """Test that a custom error is raised if a token or span is pickled.""" + doc = Doc(en_vocab, words=["Hello", "world"]) + with pytest.raises(NotImplementedError): + pickle.dumps(doc[0]) + with pytest.raises(NotImplementedError): + pickle.dumps(doc[0:2]) + + +@pytest.mark.issue(3012) +def test_issue3012(en_vocab): + """Test that the is_tagged attribute doesn't get overwritten when we from_array + without tag information.""" + words = ["This", "is", "10", "%", "."] + tags = ["DT", "VBZ", "CD", "NN", "."] + pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"] + ents = ["O", "O", "B-PERCENT", "I-PERCENT", "O"] + doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents) + assert doc.has_annotation("TAG") + expected = ("10", "NUM", "CD", "PERCENT") + assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected + header = [ENT_IOB, ENT_TYPE] + ent_array = doc.to_array(header) + doc.from_array(header, ent_array) + assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected + # Serializing then deserializing + doc_bytes = doc.to_bytes() + doc2 = Doc(en_vocab).from_bytes(doc_bytes) + assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_, doc2[2].ent_type_) == 
expected + + +@pytest.mark.issue(4190) +def test_issue4190(): + def customize_tokenizer(nlp): + prefix_re = compile_prefix_regex(nlp.Defaults.prefixes) + suffix_re = compile_suffix_regex(nlp.Defaults.suffixes) + infix_re = compile_infix_regex(nlp.Defaults.infixes) + # Remove all exceptions where a single letter is followed by a period (e.g. 'h.') + exceptions = { + k: v + for k, v in dict(nlp.Defaults.tokenizer_exceptions).items() + if not (len(k) == 2 and k[1] == ".") + } + new_tokenizer = Tokenizer( + nlp.vocab, + exceptions, + prefix_search=prefix_re.search, + suffix_search=suffix_re.search, + infix_finditer=infix_re.finditer, + token_match=nlp.tokenizer.token_match, + ) + nlp.tokenizer = new_tokenizer + + test_string = "Test c." + # Load default language + nlp_1 = English() + doc_1a = nlp_1(test_string) + result_1a = [token.text for token in doc_1a] # noqa: F841 + # Modify tokenizer + customize_tokenizer(nlp_1) + doc_1b = nlp_1(test_string) + result_1b = [token.text for token in doc_1b] + # Save and Reload + with make_tempdir() as model_dir: + nlp_1.to_disk(model_dir) + nlp_2 = load_model(model_dir) + # This should be the modified tokenizer + doc_2 = nlp_2(test_string) + result_2 = [token.text for token in doc_2] + assert result_1b == result_2 + + def test_serialize_custom_tokenizer(en_vocab, en_tokenizer): """Test that custom tokenizer with not all functions defined or empty properties can be serialized and deserialized correctly (see #2494, diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py index ab403ab54..fd80c3d8e 100644 --- a/spacy/tests/serialize/test_serialize_vocab_strings.py +++ b/spacy/tests/serialize/test_serialize_vocab_strings.py @@ -1,17 +1,71 @@ -import pytest import pickle + +import pytest from thinc.api import get_current_ops -from spacy.vocab import Vocab + +import spacy +from spacy.lang.en import English from spacy.strings import StringStore +from spacy.tokens import Doc 
+from spacy.util import ensure_path, load_model from spacy.vectors import Vectors +from spacy.vocab import Vocab from ..util import make_tempdir - test_strings = [([], []), (["rats", "are", "cute"], ["i", "like", "rats"])] test_strings_attrs = [(["rats", "are", "cute"], "Hello")] +@pytest.mark.issue(599) +def test_issue599(en_vocab): + doc = Doc(en_vocab) + doc2 = Doc(doc.vocab) + doc2.from_bytes(doc.to_bytes()) + assert doc2.has_annotation("DEP") + + +@pytest.mark.issue(4054) +def test_issue4054(en_vocab): + """Test that a new blank model can be made with a vocab from file, + and that serialization does not drop the language at any point.""" + nlp1 = English() + vocab1 = nlp1.vocab + with make_tempdir() as d: + vocab_dir = ensure_path(d / "vocab") + if not vocab_dir.exists(): + vocab_dir.mkdir() + vocab1.to_disk(vocab_dir) + vocab2 = Vocab().from_disk(vocab_dir) + nlp2 = spacy.blank("en", vocab=vocab2) + nlp_dir = ensure_path(d / "nlp") + if not nlp_dir.exists(): + nlp_dir.mkdir() + nlp2.to_disk(nlp_dir) + nlp3 = load_model(nlp_dir) + assert nlp3.lang == "en" + + +@pytest.mark.issue(4133) +def test_issue4133(en_vocab): + nlp = English() + vocab_bytes = nlp.vocab.to_bytes() + words = ["Apple", "is", "looking", "at", "buying", "a", "startup"] + pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"] + doc = Doc(en_vocab, words=words) + for i, token in enumerate(doc): + token.pos_ = pos[i] + # usually this is already True when starting from proper models instead of blank English + doc_bytes = doc.to_bytes() + vocab = Vocab() + vocab = vocab.from_bytes(vocab_bytes) + doc = Doc(vocab).from_bytes(doc_bytes) + actual = [] + for token in doc: + actual.append(token.pos_) + assert actual == pos + + @pytest.mark.parametrize("text", ["rat"]) def test_serialize_vocab(en_vocab, text): text_hash = en_vocab.strings.add(text) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index c6b00b140..b0862eab6 100644 --- a/spacy/tests/test_cli.py +++ 
b/spacy/tests/test_cli.py @@ -1,28 +1,103 @@ -import pytest -from click import NoSuchOption -from packaging.specifiers import SpecifierSet -from spacy.training import docs_to_json, offsets_to_biluo_tags -from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs -from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate -from spacy.lang.nl import Dutch -from spacy.util import ENV_VARS, load_model_from_config -from spacy.cli import info -from spacy.cli.init_config import init_config, RECOMMENDATIONS -from spacy.cli._util import validate_project_commands, parse_config_overrides -from spacy.cli._util import load_project_config, substitute_project_variables -from spacy.cli._util import is_subpath_of -from spacy.cli._util import string_to_list -from spacy import about -from spacy.util import get_minor_version -from spacy.cli.validate import get_model_pkgs -from spacy.cli.download import get_compatibility, get_version -from spacy.cli.package import get_third_party_dependencies -from thinc.api import ConfigValidationError, Config -import srsly import os -from .util import make_tempdir +import pytest +import srsly +from click import NoSuchOption +from packaging.specifiers import SpecifierSet +from thinc.api import Config, ConfigValidationError + +from spacy import about +from spacy.cli import info +from spacy.cli._util import is_subpath_of, load_project_config +from spacy.cli._util import parse_config_overrides, string_to_list +from spacy.cli._util import substitute_project_variables +from spacy.cli._util import validate_project_commands +from spacy.cli.download import get_compatibility, get_version +from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config +from spacy.cli.package import get_third_party_dependencies +from spacy.cli.validate import get_model_pkgs +from spacy.lang.en import English +from spacy.lang.nl import Dutch +from spacy.language import Language +from spacy.schemas import 
ProjectConfigSchema, RecommendationSchema, validate +from spacy.training import Example, docs_to_json, offsets_to_biluo_tags +from spacy.training.converters import conll_ner_to_docs, conllu_to_docs +from spacy.training.converters import iob_to_docs +from spacy.util import ENV_VARS, get_minor_version, load_model_from_config, load_config + from ..cli.init_pipeline import _init_labels +from .util import make_tempdir + + +@pytest.mark.issue(4665) +def test_issue4665(): + """ + conllu_to_docs should not raise an exception if the HEAD column contains an + underscore + """ + input_data = """ +1 [ _ PUNCT -LRB- _ _ punct _ _ +2 This _ DET DT _ _ det _ _ +3 killing _ NOUN NN _ _ nsubj _ _ +4 of _ ADP IN _ _ case _ _ +5 a _ DET DT _ _ det _ _ +6 respected _ ADJ JJ _ _ amod _ _ +7 cleric _ NOUN NN _ _ nmod _ _ +8 will _ AUX MD _ _ aux _ _ +9 be _ AUX VB _ _ aux _ _ +10 causing _ VERB VBG _ _ root _ _ +11 us _ PRON PRP _ _ iobj _ _ +12 trouble _ NOUN NN _ _ dobj _ _ +13 for _ ADP IN _ _ case _ _ +14 years _ NOUN NNS _ _ nmod _ _ +15 to _ PART TO _ _ mark _ _ +16 come _ VERB VB _ _ acl _ _ +17 . _ PUNCT . 
_ _ punct _ _ +18 ] _ PUNCT -RRB- _ _ punct _ _ +""" + conllu_to_docs(input_data) + + +@pytest.mark.issue(4924) +def test_issue4924(): + nlp = Language() + example = Example.from_dict(nlp.make_doc(""), {}) + nlp.evaluate([example]) + + +@pytest.mark.issue(7055) +def test_issue7055(): + """Test that fill-config doesn't turn sourced components into factories.""" + source_cfg = { + "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger"]}, + "components": { + "tok2vec": {"factory": "tok2vec"}, + "tagger": {"factory": "tagger"}, + }, + } + source_nlp = English.from_config(source_cfg) + with make_tempdir() as dir_path: + # We need to create a loadable source pipeline + source_path = dir_path / "test_model" + source_nlp.to_disk(source_path) + base_cfg = { + "nlp": {"lang": "en", "pipeline": ["tok2vec", "tagger", "ner"]}, + "components": { + "tok2vec": {"source": str(source_path)}, + "tagger": {"source": str(source_path)}, + "ner": {"factory": "ner"}, + }, + } + base_cfg = Config(base_cfg) + base_path = dir_path / "base.cfg" + base_cfg.to_disk(base_path) + output_path = dir_path / "config.cfg" + fill_config(output_path, base_path, silent=True) + filled_cfg = load_config(output_path) + assert filled_cfg["components"]["tok2vec"]["source"] == str(source_path) + assert filled_cfg["components"]["tagger"]["source"] == str(source_path) + assert filled_cfg["components"]["ner"]["factory"] == "ner" + assert "model" in filled_cfg["components"]["ner"] def test_cli_info(): diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index 790925888..392c95e42 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -1,11 +1,101 @@ +import numpy import pytest from spacy import displacy from spacy.displacy.render import DependencyRenderer, EntityRenderer +from spacy.lang.en import English from spacy.lang.fa import Persian from spacy.tokens import Span, Doc +@pytest.mark.issue(2361) +def test_issue2361(de_vocab): + """Test if < is escaped when 
rendering""" + chars = ("<", ">", "&", """) + words = ["<", ">", "&", '"'] + doc = Doc(de_vocab, words=words, deps=["dep"] * len(words)) + html = displacy.render(doc) + for char in chars: + assert char in html + + +@pytest.mark.issue(2728) +def test_issue2728(en_vocab): + """Test that displaCy ENT visualizer escapes HTML correctly.""" + doc = Doc(en_vocab, words=["test", "", "test"]) + doc.ents = [Span(doc, 0, 1, label="TEST")] + html = displacy.render(doc, style="ent") + assert "<RELEASE>" in html + doc.ents = [Span(doc, 1, 2, label="TEST")] + html = displacy.render(doc, style="ent") + assert "<RELEASE>" in html + + +@pytest.mark.issue(3288) +def test_issue3288(en_vocab): + """Test that retokenization works correctly via displaCy when punctuation + is merged onto the preceeding token and tensor is resized.""" + words = ["Hello", "World", "!", "When", "is", "this", "breaking", "?"] + heads = [1, 1, 1, 4, 4, 6, 4, 4] + deps = ["intj", "ROOT", "punct", "advmod", "ROOT", "det", "nsubj", "punct"] + doc = Doc(en_vocab, words=words, heads=heads, deps=deps) + doc.tensor = numpy.zeros((len(words), 96), dtype="float32") + displacy.render(doc) + + +@pytest.mark.issue(3531) +def test_issue3531(): + """Test that displaCy renderer doesn't require "settings" key.""" + example_dep = { + "words": [ + {"text": "But", "tag": "CCONJ"}, + {"text": "Google", "tag": "PROPN"}, + {"text": "is", "tag": "VERB"}, + {"text": "starting", "tag": "VERB"}, + {"text": "from", "tag": "ADP"}, + {"text": "behind.", "tag": "ADV"}, + ], + "arcs": [ + {"start": 0, "end": 3, "label": "cc", "dir": "left"}, + {"start": 1, "end": 3, "label": "nsubj", "dir": "left"}, + {"start": 2, "end": 3, "label": "aux", "dir": "left"}, + {"start": 3, "end": 4, "label": "prep", "dir": "right"}, + {"start": 4, "end": 5, "label": "pcomp", "dir": "right"}, + ], + } + example_ent = { + "text": "But Google is starting from behind.", + "ents": [{"start": 4, "end": 10, "label": "ORG"}], + } + dep_html = 
displacy.render(example_dep, style="dep", manual=True) + assert dep_html + ent_html = displacy.render(example_ent, style="ent", manual=True) + assert ent_html + + +@pytest.mark.issue(3882) +def test_issue3882(en_vocab): + """Test that displaCy doesn't serialize the doc.user_data when making a + copy of the Doc. + """ + doc = Doc(en_vocab, words=["Hello", "world"], deps=["dep", "dep"]) + doc.user_data["test"] = set() + displacy.parse_deps(doc) + + +@pytest.mark.issue(5838) +def test_issue5838(): + # Displacy's EntityRenderer break line + # not working after last entity + sample_text = "First line\nSecond line, with ent\nThird line\nFourth line\n" + nlp = English() + doc = nlp(sample_text) + doc.ents = [Span(doc, 7, 8, label="test")] + html = displacy.render(doc, style="ent") + found = html.count("
") + assert found == 4 + + def test_displacy_parse_ents(en_vocab): """Test that named entities on a Doc are converted into displaCy's format.""" doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"]) diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index f17d5e62e..d8743d322 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -15,7 +15,8 @@ from spacy.training.batchers import minibatch_by_words from spacy.lang.en import English from spacy.lang.nl import Dutch from spacy.language import DEFAULT_CONFIG_PATH -from spacy.schemas import ConfigSchemaTraining +from spacy.schemas import ConfigSchemaTraining, TokenPattern, TokenPatternSchema +from pydantic import ValidationError from thinc.api import get_current_ops, NumpyOps, CupyOps @@ -33,6 +34,32 @@ def is_admin(): return admin +@pytest.mark.issue(6207) +def test_issue6207(en_tokenizer): + doc = en_tokenizer("zero one two three four five six") + + # Make spans + s1 = doc[:4] + s2 = doc[3:6] # overlaps with s1 + s3 = doc[5:7] # overlaps with s2, not s1 + + result = util.filter_spans((s1, s2, s3)) + assert s1 in result + assert s2 not in result + assert s3 in result + + +@pytest.mark.issue(6258) +def test_issue6258(): + """Test that the non-empty constraint pattern field is respected""" + # These one is valid + TokenPatternSchema(pattern=[TokenPattern()]) + # But an empty pattern list should fail to validate + # based on the schema's constraint + with pytest.raises(ValidationError): + TokenPatternSchema(pattern=[]) + + @pytest.mark.parametrize("text", ["hello/world", "hello world"]) def test_util_ensure_path_succeeds(text): path = util.ensure_path(text) diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index 452bcc079..c2aeffcb5 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -1,9 +1,283 @@ -import pytest import re -from spacy.vocab import Vocab -from 
spacy.tokenizer import Tokenizer -from spacy.util import ensure_path, compile_prefix_regex, compile_suffix_regex + +import numpy +import pytest + from spacy.lang.en import English +from spacy.lang.de import German +from spacy.tokenizer import Tokenizer +from spacy.tokens import Doc +from spacy.training import Example +from spacy.util import compile_prefix_regex, compile_suffix_regex, ensure_path +from spacy.vocab import Vocab +from spacy.symbols import ORTH + + +@pytest.mark.issue(743) +def test_issue743(): + doc = Doc(Vocab(), ["hello", "world"]) + token = doc[0] + s = set([token]) + items = list(s) + assert items[0] is token + + +@pytest.mark.issue(801) +@pytest.mark.skip( + reason="Can not be fixed unless with variable-width lookbehinds, cf. PR #3218" +) +@pytest.mark.parametrize( + "text,tokens", + [ + ('"deserve,"--and', ['"', "deserve", ',"--', "and"]), + ("exception;--exclusive", ["exception", ";--", "exclusive"]), + ("day.--Is", ["day", ".--", "Is"]), + ("refinement:--just", ["refinement", ":--", "just"]), + ("memories?--To", ["memories", "?--", "To"]), + ("Useful.=--Therefore", ["Useful", ".=--", "Therefore"]), + ("=Hope.=--Pandora", ["=", "Hope", ".=--", "Pandora"]), + ], +) +def test_issue801(en_tokenizer, text, tokens): + """Test that special characters + hyphens are split correctly.""" + doc = en_tokenizer(text) + assert len(doc) == len(tokens) + assert [t.text for t in doc] == tokens + + +@pytest.mark.issue(1061) +def test_issue1061(): + """Test special-case works after tokenizing. Was caching problem.""" + text = "I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_." 
+ tokenizer = English().tokenizer + doc = tokenizer(text) + assert "MATH" in [w.text for w in doc] + assert "_MATH_" not in [w.text for w in doc] + + tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}]) + doc = tokenizer(text) + assert "_MATH_" in [w.text for w in doc] + assert "MATH" not in [w.text for w in doc] + + # For sanity, check it works when pipeline is clean. + tokenizer = English().tokenizer + tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}]) + doc = tokenizer(text) + assert "_MATH_" in [w.text for w in doc] + assert "MATH" not in [w.text for w in doc] + + +@pytest.mark.issue(1963) +def test_issue1963(en_tokenizer): + """Test that doc.merge() resizes doc.tensor""" + doc = en_tokenizer("a b c d") + doc.tensor = numpy.ones((len(doc), 128), dtype="f") + with doc.retokenize() as retokenizer: + retokenizer.merge(doc[0:2]) + assert len(doc) == 3 + assert doc.tensor.shape == (3, 128) + + +@pytest.mark.skip( + reason="Can not be fixed without variable-width look-behind (which we don't want)" +) +@pytest.mark.issue(1235) +def test_issue1235(): + """Test that g is not split of if preceded by a number and a letter""" + nlp = English() + testwords = "e2g 2g 52g" + doc = nlp(testwords) + assert len(doc) == 5 + assert doc[0].text == "e2g" + assert doc[1].text == "2" + assert doc[2].text == "g" + assert doc[3].text == "52" + assert doc[4].text == "g" + + +@pytest.mark.issue(1242) +def test_issue1242(): + nlp = English() + doc = nlp("") + assert len(doc) == 0 + docs = list(nlp.pipe(["", "hello"])) + assert len(docs[0]) == 0 + assert len(docs[1]) == 1 + + +@pytest.mark.issue(1257) +def test_issue1257(): + """Test that tokens compare correctly.""" + doc1 = Doc(Vocab(), words=["a", "b", "c"]) + doc2 = Doc(Vocab(), words=["a", "c", "e"]) + assert doc1[0] != doc2[0] + assert not doc1[0] == doc2[0] + + +@pytest.mark.issue(1375) +def test_issue1375(): + """Test that token.nbor() raises IndexError for out-of-bounds access.""" + doc = Doc(Vocab(), words=["0", "1", 
"2"]) + with pytest.raises(IndexError): + assert doc[0].nbor(-1) + assert doc[1].nbor(-1).text == "0" + with pytest.raises(IndexError): + assert doc[2].nbor(1) + assert doc[1].nbor(1).text == "2" + + +@pytest.mark.issue(1488) +def test_issue1488(): + """Test that tokenizer can parse DOT inside non-whitespace separators""" + prefix_re = re.compile(r"""[\[\("']""") + suffix_re = re.compile(r"""[\]\)"']""") + infix_re = re.compile(r"""[-~\.]""") + simple_url_re = re.compile(r"""^https?://""") + + def my_tokenizer(nlp): + return Tokenizer( + nlp.vocab, + {}, + prefix_search=prefix_re.search, + suffix_search=suffix_re.search, + infix_finditer=infix_re.finditer, + token_match=simple_url_re.match, + ) + + nlp = English() + nlp.tokenizer = my_tokenizer(nlp) + doc = nlp("This is a test.") + for token in doc: + assert token.text + + +@pytest.mark.issue(1494) +def test_issue1494(): + """Test if infix_finditer works correctly""" + infix_re = re.compile(r"""[^a-z]""") + test_cases = [ + ("token 123test", ["token", "1", "2", "3", "test"]), + ("token 1test", ["token", "1test"]), + ("hello...test", ["hello", ".", ".", ".", "test"]), + ] + + def new_tokenizer(nlp): + return Tokenizer(nlp.vocab, {}, infix_finditer=infix_re.finditer) + + nlp = English() + nlp.tokenizer = new_tokenizer(nlp) + for text, expected in test_cases: + assert [token.text for token in nlp(text)] == expected + + +@pytest.mark.skip( + reason="Can not be fixed without iterative looping between prefix/suffix and infix" +) +@pytest.mark.issue(2070) +def test_issue2070(): + """Test that checks that a dot followed by a quote is handled + appropriately. + """ + # Problem: The dot is now properly split off, but the prefix/suffix rules + # are not applied again afterwards. This means that the quote will still be + # attached to the remaining token. 
+ nlp = English() + doc = nlp('First sentence."A quoted sentence" he said ...') + assert len(doc) == 11 + + +@pytest.mark.issue(2926) +def test_issue2926(fr_tokenizer): + """Test that the tokenizer correctly splits tokens separated by a slash (/) + ending in a digit. + """ + doc = fr_tokenizer("Learn html5/css3/javascript/jquery") + assert len(doc) == 8 + assert doc[0].text == "Learn" + assert doc[1].text == "html5" + assert doc[2].text == "/" + assert doc[3].text == "css3" + assert doc[4].text == "/" + assert doc[5].text == "javascript" + assert doc[6].text == "/" + assert doc[7].text == "jquery" + + +@pytest.mark.parametrize( + "text", + [ + "ABLEItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume TABLE ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 
10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume", + "oow.jspsearch.eventoracleopenworldsearch.technologyoraclesolarissearch.technologystoragesearch.technologylinuxsearch.technologyserverssearch.technologyvirtualizationsearch.technologyengineeredsystemspcodewwmkmppscem:", + ], +) +@pytest.mark.issue(2626) +def test_issue2626_2835(en_tokenizer, text): + """Check that sentence doesn't cause an infinite loop in the tokenizer.""" + doc = en_tokenizer(text) + assert doc + + +@pytest.mark.issue(2656) +def test_issue2656(en_tokenizer): + """Test that tokenizer correctly splits off punctuation after numbers with + decimal points. + """ + doc = en_tokenizer("I went for 40.3, and got home by 10.0.") + assert len(doc) == 11 + assert doc[0].text == "I" + assert doc[1].text == "went" + assert doc[2].text == "for" + assert doc[3].text == "40.3" + assert doc[4].text == "," + assert doc[5].text == "and" + assert doc[6].text == "got" + assert doc[7].text == "home" + assert doc[8].text == "by" + assert doc[9].text == "10.0" + assert doc[10].text == "." + + +@pytest.mark.issue(2754) +def test_issue2754(en_tokenizer): + """Test that words like 'a' and 'a.m.' don't get exceptional norm values.""" + a = en_tokenizer("a") + assert a[0].norm_ == "a" + am = en_tokenizer("am") + assert am[0].norm_ == "am" + + +@pytest.mark.issue(3002) +def test_issue3002(): + """Test that the tokenizer doesn't hang on a long list of dots""" + nlp = German() + doc = nlp( + "880.794.982.218.444.893.023.439.794.626.120.190.780.624.990.275.671 ist eine lange Zahl" + ) + assert len(doc) == 5 + + +@pytest.mark.skip(reason="default suffix rules avoid one upper-case letter before dot") +@pytest.mark.issue(3449) +def test_issue3449(): + nlp = English() + nlp.add_pipe("sentencizer") + text1 = "He gave the ball to I. Do you want to go to the movies with I?" 
+ text2 = "He gave the ball to I. Do you want to go to the movies with I?" + text3 = "He gave the ball to I.\nDo you want to go to the movies with I?" + t1 = nlp(text1) + t2 = nlp(text2) + t3 = nlp(text3) + assert t1[5].text == "I" + assert t2[5].text == "I" + assert t3[5].text == "I" + + +@pytest.mark.parametrize( + "text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])] +) +def test_gold_misaligned(en_tokenizer, text, words): + doc = en_tokenizer(text) + Example.from_dict(doc, {"words": words}) def test_tokenizer_handles_no_word(tokenizer): diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index 68f86190b..0d73300d8 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -1,15 +1,18 @@ +import random + import numpy -from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, Alignment -from spacy.training import biluo_tags_to_spans, iob_to_biluo -from spacy.training import Corpus, docs_to_json, Example -from spacy.training.align import get_alignments -from spacy.training.converters import json_to_docs -from spacy.lang.en import English -from spacy.tokens import Doc, DocBin -from spacy.util import get_words_and_spaces, minibatch -from thinc.api import compounding import pytest import srsly +from spacy.lang.en import English +from spacy.tokens import Doc, DocBin +from spacy.training import Alignment, Corpus, Example, biluo_tags_to_offsets +from spacy.training import biluo_tags_to_spans, docs_to_json, iob_to_biluo +from spacy.training import offsets_to_biluo_tags +from spacy.training.align import get_alignments +from spacy.training.converters import json_to_docs +from spacy.util import get_words_and_spaces, load_model_from_path, minibatch +from spacy.util import load_config_from_str +from thinc.api import compounding from ..util import make_tempdir @@ -68,6 +71,207 @@ def vocab(): return nlp.vocab +@pytest.mark.issue(999) +def test_issue999(): + """Test that 
adding entities and resuming training works passably OK. + There are two issues here: + 1) We have to re-add labels. This isn't very nice. + 2) There's no way to set the learning rate for the weight update, so we + end up out-of-scale, causing it to learn too fast. + """ + TRAIN_DATA = [ + ["hey", []], + ["howdy", []], + ["hey there", []], + ["hello", []], + ["hi", []], + ["i'm looking for a place to eat", []], + ["i'm looking for a place in the north of town", [(31, 36, "LOCATION")]], + ["show me chinese restaurants", [(8, 15, "CUISINE")]], + ["show me chines restaurants", [(8, 14, "CUISINE")]], + ] + nlp = English() + ner = nlp.add_pipe("ner") + for _, offsets in TRAIN_DATA: + for start, end, label in offsets: + ner.add_label(label) + nlp.initialize() + for itn in range(20): + random.shuffle(TRAIN_DATA) + for raw_text, entity_offsets in TRAIN_DATA: + example = Example.from_dict( + nlp.make_doc(raw_text), {"entities": entity_offsets} + ) + nlp.update([example]) + + with make_tempdir() as model_dir: + nlp.to_disk(model_dir) + nlp2 = load_model_from_path(model_dir) + + for raw_text, entity_offsets in TRAIN_DATA: + doc = nlp2(raw_text) + ents = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents} + for start, end, label in entity_offsets: + if (start, end) in ents: + assert ents[(start, end)] == label + break + else: + if entity_offsets: + raise Exception(ents) + + +@pytest.mark.issue(4402) +def test_issue4402(): + json_data = { + "id": 0, + "paragraphs": [ + { + "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.", + "sentences": [ + { + "tokens": [ + {"id": 0, "orth": "How", "ner": "O"}, + {"id": 1, "orth": "should", "ner": "O"}, + {"id": 2, "orth": "I", "ner": "O"}, + {"id": 3, "orth": "cook", "ner": "O"}, + {"id": 4, "orth": "bacon", "ner": "O"}, + {"id": 5, "orth": "in", "ner": "O"}, + {"id": 6, "orth": "an", "ner": "O"}, + {"id": 7, "orth": "oven", "ner": "O"}, + {"id": 8, "orth": "?", "ner": "O"}, + ], + 
"brackets": [], + }, + { + "tokens": [ + {"id": 9, "orth": "\n", "ner": "O"}, + {"id": 10, "orth": "I", "ner": "O"}, + {"id": 11, "orth": "'ve", "ner": "O"}, + {"id": 12, "orth": "heard", "ner": "O"}, + {"id": 13, "orth": "of", "ner": "O"}, + {"id": 14, "orth": "people", "ner": "O"}, + {"id": 15, "orth": "cooking", "ner": "O"}, + {"id": 16, "orth": "bacon", "ner": "O"}, + {"id": 17, "orth": "in", "ner": "O"}, + {"id": 18, "orth": "an", "ner": "O"}, + {"id": 19, "orth": "oven", "ner": "O"}, + {"id": 20, "orth": ".", "ner": "O"}, + ], + "brackets": [], + }, + ], + "cats": [ + {"label": "baking", "value": 1.0}, + {"label": "not_baking", "value": 0.0}, + ], + }, + { + "raw": "What is the difference between white and brown eggs?\n", + "sentences": [ + { + "tokens": [ + {"id": 0, "orth": "What", "ner": "O"}, + {"id": 1, "orth": "is", "ner": "O"}, + {"id": 2, "orth": "the", "ner": "O"}, + {"id": 3, "orth": "difference", "ner": "O"}, + {"id": 4, "orth": "between", "ner": "O"}, + {"id": 5, "orth": "white", "ner": "O"}, + {"id": 6, "orth": "and", "ner": "O"}, + {"id": 7, "orth": "brown", "ner": "O"}, + {"id": 8, "orth": "eggs", "ner": "O"}, + {"id": 9, "orth": "?", "ner": "O"}, + ], + "brackets": [], + }, + {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []}, + ], + "cats": [ + {"label": "baking", "value": 0.0}, + {"label": "not_baking", "value": 1.0}, + ], + }, + ], + } + nlp = English() + attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"] + with make_tempdir() as tmpdir: + output_file = tmpdir / "test4402.spacy" + docs = json_to_docs([json_data]) + data = DocBin(docs=docs, attrs=attrs).to_bytes() + with output_file.open("wb") as file_: + file_.write(data) + reader = Corpus(output_file) + train_data = list(reader(nlp)) + assert len(train_data) == 2 + + split_train_data = [] + for eg in train_data: + split_train_data.extend(eg.split_sents()) + assert len(split_train_data) == 4 + + +CONFIG_7029 = """ +[nlp] +lang = "en" +pipeline = ["tok2vec", "tagger"] + 
+[components] + +[components.tok2vec] +factory = "tok2vec" + +[components.tok2vec.model] +@architectures = "spacy.Tok2Vec.v1" + +[components.tok2vec.model.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = ${components.tok2vec.model.encode:width} +attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] +rows = [5000,2500,2500,2500] +include_static_vectors = false + +[components.tok2vec.model.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +width = 96 +depth = 4 +window_size = 1 +maxout_pieces = 3 + +[components.tagger] +factory = "tagger" + +[components.tagger.model] +@architectures = "spacy.Tagger.v1" +nO = null + +[components.tagger.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode:width} +upstream = "*" +""" + + +@pytest.mark.issue(7029) +def test_issue7029(): + """Test that an empty document doesn't mess up an entire batch.""" + TRAIN_DATA = [ + ("I like green eggs", {"tags": ["N", "V", "J", "N"]}), + ("Eat blue ham", {"tags": ["V", "J", "N"]}), + ] + nlp = English.from_config(load_config_from_str(CONFIG_7029)) + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + optimizer = nlp.initialize(get_examples=lambda: train_examples) + for i in range(50): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + texts = ["first", "second", "third", "fourth", "and", "then", "some", ""] + docs1 = list(nlp.pipe(texts, batch_size=1)) + docs2 = list(nlp.pipe(texts, batch_size=4)) + assert [doc[0].tag_ for doc in docs1[:-1]] == [doc[0].tag_ for doc in docs2[:-1]] + + def test_gold_biluo_U(en_vocab): words = ["I", "flew", "to", "London", "."] spaces = [True, True, True, False, True] diff --git a/spacy/tests/vocab_vectors/test_lexeme.py b/spacy/tests/vocab_vectors/test_lexeme.py index b6fee6628..d91f41db3 100644 --- a/spacy/tests/vocab_vectors/test_lexeme.py +++ b/spacy/tests/vocab_vectors/test_lexeme.py @@ -1,7 +1,25 @@ -import pytest 
import numpy +import pytest from spacy.attrs import IS_ALPHA, IS_DIGIT +from spacy.lookups import Lookups +from spacy.tokens import Doc from spacy.util import OOV_RANK +from spacy.vocab import Vocab + + +@pytest.mark.issue(361) +@pytest.mark.parametrize("text1,text2", [("cat", "dog")]) +def test_issue361(en_vocab, text1, text2): + """Test Issue #361: Equality of lexemes""" + assert en_vocab[text1] == en_vocab[text1] + assert en_vocab[text1] != en_vocab[text2] + + +@pytest.mark.issue(600) +def test_issue600(): + vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}}) + doc = Doc(vocab, words=["hello"]) + doc[0].tag_ = "NN" @pytest.mark.parametrize("text1,prob1,text2,prob2", [("NOUN", -1, "opera", -2)]) diff --git a/spacy/tests/vocab_vectors/test_similarity.py b/spacy/tests/vocab_vectors/test_similarity.py index b5f7303b5..3b9308f4d 100644 --- a/spacy/tests/vocab_vectors/test_similarity.py +++ b/spacy/tests/vocab_vectors/test_similarity.py @@ -16,6 +16,16 @@ def vocab(en_vocab, vectors): return en_vocab +@pytest.mark.issue(2219) +def test_issue2219(en_vocab): + """Test if indexing issue still occurs during Token-Token similarity""" + vectors = [("a", [1, 2, 3]), ("letter", [4, 5, 6])] + add_vecs_to_vocab(en_vocab, vectors) + [(word1, vec1), (word2, vec2)] = vectors + doc = Doc(en_vocab, words=[word1, word2]) + assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0]) + + def test_vectors_similarity_LL(vocab, vectors): [(word1, vec1), (word2, vec2)] = vectors lex1 = vocab[word1] diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index f2e74c3c9..9dc40b499 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -1,14 +1,15 @@ -import pytest import numpy -from numpy.testing import assert_allclose, assert_equal, assert_almost_equal -from thinc.api import get_current_ops +import pytest +from numpy.testing import assert_allclose, assert_almost_equal, assert_equal +from thinc.api 
import NumpyOps, get_current_ops + from spacy.lang.en import English -from spacy.vocab import Vocab -from spacy.vectors import Vectors -from spacy.tokenizer import Tokenizer from spacy.strings import hash_string # type: ignore +from spacy.tokenizer import Tokenizer from spacy.tokens import Doc from spacy.training.initialize import convert_vectors +from spacy.vectors import Vectors +from spacy.vocab import Vocab from ..util import add_vecs_to_vocab, get_cosine, make_tempdir @@ -65,6 +66,79 @@ def tokenizer_v(vocab): return Tokenizer(vocab, {}, None, None, None) +@pytest.mark.issue(1518) +def test_issue1518(): + """Test vectors.resize() works.""" + vectors = Vectors(shape=(10, 10)) + vectors.add("hello", row=2) + vectors.resize((5, 9)) + + +@pytest.mark.issue(1539) +def test_issue1539(): + """Ensure vectors.resize() doesn't try to modify dictionary during iteration.""" + v = Vectors(shape=(10, 10), keys=[5, 3, 98, 100]) + v.resize((100, 100)) + + +@pytest.mark.issue(1807) +def test_issue1807(): + """Test vocab.set_vector also adds the word to the vocab.""" + vocab = Vocab(vectors_name="test_issue1807") + assert "hello" not in vocab + vocab.set_vector("hello", numpy.ones((50,), dtype="f")) + assert "hello" in vocab + + +@pytest.mark.issue(2871) +def test_issue2871(): + """Test that vectors recover the correct key for spaCy reserved words.""" + words = ["dog", "cat", "SUFFIX"] + vocab = Vocab(vectors_name="test_issue2871") + vocab.vectors.resize(shape=(3, 10)) + vector_data = numpy.zeros((3, 10), dtype="f") + for word in words: + _ = vocab[word] # noqa: F841 + vocab.set_vector(word, vector_data[0]) + vocab.vectors.name = "dummy_vectors" + assert vocab["dog"].rank == 0 + assert vocab["cat"].rank == 1 + assert vocab["SUFFIX"].rank == 2 + assert vocab.vectors.find(key="dog") == 0 + assert vocab.vectors.find(key="cat") == 1 + assert vocab.vectors.find(key="SUFFIX") == 2 + + +@pytest.mark.issue(3412) +def test_issue3412(): + data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 
8, 7]], dtype="f") + vectors = Vectors(data=data, keys=["A", "B", "C"]) + keys, best_rows, scores = vectors.most_similar( + numpy.asarray([[9, 8, 7], [0, 0, 0]], dtype="f") + ) + assert best_rows[0] == 2 + + +@pytest.mark.issue(4725) +def test_issue4725_2(): + if isinstance(get_current_ops(), NumpyOps): + # ensures that this runs correctly and doesn't hang or crash because of the global vectors + # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows), + # or because of issues with pickling the NER (cf test_issue4725_1) + vocab = Vocab(vectors_name="test_vocab_add_vector") + data = numpy.ndarray((5, 3), dtype="f") + data[0] = 1.0 + data[1] = 2.0 + vocab.set_vector("cat", data[0]) + vocab.set_vector("dog", data[1]) + nlp = English(vocab=vocab) + nlp.add_pipe("ner") + nlp.initialize() + docs = ["Kurt is in London."] * 10 + for _ in nlp.pipe(docs, batch_size=2, n_process=2): + pass + + def test_init_vectors_with_resize_shape(strings, resize_data): v = Vectors(shape=(len(strings), 3)) v.resize(shape=resize_data.shape) diff --git a/spacy/tests/vocab_vectors/test_vocab_api.py b/spacy/tests/vocab_vectors/test_vocab_api.py index 56ef1d108..16cf80a08 100644 --- a/spacy/tests/vocab_vectors/test_vocab_api.py +++ b/spacy/tests/vocab_vectors/test_vocab_api.py @@ -1,6 +1,19 @@ import pytest -from spacy.attrs import LEMMA, ORTH, IS_ALPHA +from spacy.attrs import IS_ALPHA, LEMMA, ORTH from spacy.parts_of_speech import NOUN, VERB +from spacy.vocab import Vocab + + +@pytest.mark.issue(1868) +def test_issue1868(): + """Test Vocab.__contains__ works with int keys.""" + vocab = Vocab() + lex = vocab["hello"] + assert lex.orth in vocab + assert lex.orth_ in vocab + assert "some string" not in vocab + int_id = vocab.strings.add("some string") + assert int_id not in vocab @pytest.mark.parametrize(