import pytest import gc import numpy import copy from spacy.gold import Example from spacy.lang.en import English from spacy.lang.en.stop_words import STOP_WORDS from spacy.lang.lex_attrs import is_stop from spacy.vectors import Vectors from spacy.vocab import Vocab from spacy.language import Language from spacy.pipeline.defaults import default_ner, default_tagger from spacy.tokens import Doc, Span, Token from spacy.pipeline import Tagger, EntityRecognizer from spacy.attrs import HEAD, DEP from spacy.matcher import Matcher from ..util import make_tempdir def test_issue1506(): def string_generator(): for _ in range(10001): yield "It's sentence produced by that bug." for _ in range(10001): yield "I erase some hbdsaj lemmas." for _ in range(10001): yield "I erase lemmas." for _ in range(10001): yield "It's sentence produced by that bug." for _ in range(10001): yield "It's sentence produced by that bug." nlp = English() for i, d in enumerate(nlp.pipe(string_generator())): # We should run cleanup more than one time to actually cleanup data. # In first run — clean up only mark strings as «not hitted». if i == 10000 or i == 20000 or i == 30000: gc.collect() for t in d: str(t.lemma_) def test_issue1518(): """Test vectors.resize() works.""" vectors = Vectors(shape=(10, 10)) vectors.add("hello", row=2) vectors.resize((5, 9)) def test_issue1537(): """Test that Span.as_doc() doesn't segfault.""" string = "The sky is blue . The man is pink . The dog is purple ." doc = Doc(Vocab(), words=string.split()) doc[0].sent_start = True for word in doc[1:]: if word.nbor(-1).text == ".": word.sent_start = True else: word.sent_start = False sents = list(doc.sents) sent0 = sents[0].as_doc() sent1 = sents[1].as_doc() assert isinstance(sent0, Doc) assert isinstance(sent1, Doc) # TODO: Currently segfaulting, due to l_edge and r_edge misalignment # def test_issue1537_model(): # nlp = load_spacy('en') # doc = nlp('The sky is blue. The man is pink. The dog is purple.') # sents = [s.as_doc() for s in doc.sents] # print(list(sents[0].noun_chunks)) # print(list(sents[1].noun_chunks)) def test_issue1539(): """Ensure vectors.resize() doesn't try to modify dictionary during iteration.""" v = Vectors(shape=(10, 10), keys=[5, 3, 98, 100]) v.resize((100, 100)) def test_issue1547(): """Test that entity labels still match after merging tokens.""" words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"] doc = Doc(Vocab(), words=words) doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings["PRODUCT"])] with doc.retokenize() as retokenizer: retokenizer.merge(doc[5:7]) assert [ent.text for ent in doc.ents] def test_issue1612(en_tokenizer): doc = en_tokenizer("The black cat purrs.") span = doc[1:3] assert span.orth_ == span.text def test_issue1654(): nlp = Language(Vocab()) assert not nlp.pipeline nlp.add_pipe(lambda doc: doc, name="1") nlp.add_pipe(lambda doc: doc, name="2", after="1") nlp.add_pipe(lambda doc: doc, name="3", after="2") assert nlp.pipe_names == ["1", "2", "3"] nlp2 = Language(Vocab()) assert not nlp2.pipeline nlp2.add_pipe(lambda doc: doc, name="3") nlp2.add_pipe(lambda doc: doc, name="2", before="3") nlp2.add_pipe(lambda doc: doc, name="1", before="2") assert nlp2.pipe_names == ["1", "2", "3"] @pytest.mark.parametrize("text", ["test@example.com", "john.doe@example.co.uk"]) def test_issue1698(en_tokenizer, text): doc = en_tokenizer(text) assert len(doc) == 1 assert not doc[0].like_url def test_issue1727(): """Test that models with no pretrained vectors can be deserialized correctly after vectors are added.""" data = numpy.ones((3, 300), dtype="f") vectors = Vectors(data=data, keys=["I", "am", "Matt"]) tagger = Tagger(Vocab(), default_tagger()) tagger.add_label("PRP") with pytest.warns(UserWarning): tagger.begin_training() assert tagger.cfg.get("pretrained_dims", 0) == 0 tagger.vocab.vectors = vectors with make_tempdir() as path: tagger.to_disk(path) tagger = Tagger(Vocab(), default_tagger()).from_disk(path) assert tagger.cfg.get("pretrained_dims", 0) == 0 def test_issue1757(): """Test comparison against None doesn't cause segfault.""" doc = Doc(Vocab(), words=["a", "b", "c"]) assert not doc[0] < None assert not doc[0] is None assert doc[0] >= None assert not doc[:2] < None assert not doc[:2] is None assert doc[:2] >= None assert not doc.vocab["a"] is None assert not doc.vocab["a"] < None def test_issue1758(en_tokenizer): """Test that "would've" is handled by the English tokenizer exceptions.""" tokens = en_tokenizer("would've") assert len(tokens) == 2 assert tokens[0].tag_ == "MD" assert tokens[1].lemma_ == "have" def test_issue1773(en_tokenizer): """Test that spaces don't receive a POS but no TAG. This is the root cause of the serialization issue reported in #1773.""" doc = en_tokenizer("\n") if doc[0].pos_ == "SPACE": assert doc[0].tag_ != "" def test_issue1799(): """Test sentence boundaries are deserialized correctly, even for non-projective sentences.""" heads_deps = numpy.asarray( [ [1, 397], [4, 436], [2, 426], [1, 402], [0, 8206900633647566924], [18446744073709551615, 440], [18446744073709551614, 442], ], dtype="uint64", ) doc = Doc(Vocab(), words="Just what I was looking for .".split()) doc.vocab.strings.add("ROOT") doc = doc.from_array([HEAD, DEP], heads_deps) assert len(list(doc.sents)) == 1 def test_issue1807(): """Test vocab.set_vector also adds the word to the vocab.""" vocab = Vocab(vectors_name="test_issue1807") assert "hello" not in vocab vocab.set_vector("hello", numpy.ones((50,), dtype="f")) assert "hello" in vocab def test_issue1834(): """Test that sentence boundaries & parse/tag flags are not lost during serialization.""" string = "This is a first sentence . And another one" doc = Doc(Vocab(), words=string.split()) doc[6].sent_start = True new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes()) assert new_doc[6].sent_start assert not new_doc.is_parsed assert not new_doc.is_tagged doc.is_parsed = True doc.is_tagged = True new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes()) assert new_doc.is_parsed assert new_doc.is_tagged def test_issue1868(): """Test Vocab.__contains__ works with int keys.""" vocab = Vocab() lex = vocab["hello"] assert lex.orth in vocab assert lex.orth_ in vocab assert "some string" not in vocab int_id = vocab.strings.add("some string") assert int_id not in vocab def test_issue1883(): matcher = Matcher(Vocab()) matcher.add("pat1", [[{"orth": "hello"}]]) doc = Doc(matcher.vocab, words=["hello"]) assert len(matcher(doc)) == 1 new_matcher = copy.deepcopy(matcher) new_doc = Doc(new_matcher.vocab, words=["hello"]) assert len(new_matcher(new_doc)) == 1 @pytest.mark.parametrize("word", ["the"]) def test_issue1889(word): assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS) @pytest.mark.skip(reason="obsolete with the config refactor of v.3") def test_issue1915(): cfg = {"hidden_depth": 2} # should error out nlp = Language() nlp.add_pipe(nlp.create_pipe("ner")) nlp.get_pipe("ner").add_label("answer") with pytest.raises(ValueError): nlp.begin_training(**cfg) def test_issue1945(): """Test regression in Matcher introduced in v2.0.6.""" matcher = Matcher(Vocab()) matcher.add("MWE", [[{"orth": "a"}, {"orth": "a"}]]) doc = Doc(matcher.vocab, words=["a", "a", "a"]) matches = matcher(doc) # we should see two overlapping matches here assert len(matches) == 2 assert matches[0][1:] == (0, 2) assert matches[1][1:] == (1, 3) def test_issue1963(en_tokenizer): """Test that doc.merge() resizes doc.tensor""" doc = en_tokenizer("a b c d") doc.tensor = numpy.ones((len(doc), 128), dtype="f") with doc.retokenize() as retokenizer: retokenizer.merge(doc[0:2]) assert len(doc) == 3 assert doc.tensor.shape == (3, 128) @pytest.mark.parametrize("label", ["U-JOB-NAME"]) def test_issue1967(label): config = { "learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0, } ner = EntityRecognizer(Vocab(), default_ner(), **config) example = Example(doc=None) example.set_token_annotation( ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label] ) ner.moves.get_actions(gold_parses=[example]) def test_issue1971(en_vocab): # Possibly related to #2675 and #2671? matcher = Matcher(en_vocab) pattern = [ {"ORTH": "Doe"}, {"ORTH": "!", "OP": "?"}, {"_": {"optional": True}, "OP": "?"}, {"ORTH": "!", "OP": "?"}, ] Token.set_extension("optional", default=False) matcher.add("TEST", [pattern]) doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"]) # We could also assert length 1 here, but this is more conclusive, because # the real problem here is that it returns a duplicate match for a match_id # that's not actually in the vocab! matches = matcher(doc) assert all([match_id in en_vocab.strings for match_id, start, end in matches]) def test_issue_1971_2(en_vocab): matcher = Matcher(en_vocab) pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}] pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}] # {"IN": ["EUR"]}}] doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"]) matcher.add("TEST1", [pattern1, pattern2]) matches = matcher(doc) assert len(matches) == 2 def test_issue_1971_3(en_vocab): """Test that pattern matches correctly for multiple extension attributes.""" Token.set_extension("a", default=1, force=True) Token.set_extension("b", default=2, force=True) doc = Doc(en_vocab, words=["hello", "world"]) matcher = Matcher(en_vocab) matcher.add("A", [[{"_": {"a": 1}}]]) matcher.add("B", [[{"_": {"b": 2}}]]) matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc)) assert len(matches) == 4 assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)]) def test_issue_1971_4(en_vocab): """Test that pattern matches correctly with multiple extension attribute values on a single token. """ Token.set_extension("ext_a", default="str_a", force=True) Token.set_extension("ext_b", default="str_b", force=True) matcher = Matcher(en_vocab) doc = Doc(en_vocab, words=["this", "is", "text"]) pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3 matcher.add("TEST", [pattern]) matches = matcher(doc) # Uncommenting this caused a segmentation fault assert len(matches) == 1 assert matches[0] == (en_vocab.strings["TEST"], 0, 3)