# coding: utf8
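# Regression tests for reported spaCy issues; each test name references the
# GitHub issue number it covers.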
from __future__ import unicode_literals

import pytest
import gc
import numpy
import copy

from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.lex_attrs import is_stop
from spacy.vectors import Vectors
from spacy.vocab import Vocab
from spacy.language import Language
from spacy.tokens import Doc, Span
from spacy.pipeline import Tagger, EntityRecognizer
from spacy.attrs import HEAD, DEP
from spacy.matcher import Matcher

from ..util import make_tempdir


def test_issue1506():
    def string_generator():
        for _ in range(10001):
            yield "It's sentence produced by that bug."
        for _ in range(10001):
            yield "I erase some hbdsaj lemmas."
        for _ in range(10001):
            yield "I erase lemmas."
        for _ in range(10001):
            yield "It's sentence produced by that bug."
        for _ in range(10001):
            yield "It's sentence produced by that bug."

    nlp = English()
    for i, d in enumerate(nlp.pipe(string_generator())):
        # Cleanup has to run more than once to actually free the data:
        # the first pass only marks strings as "not hit".
        if i == 10000 or i == 20000 or i == 30000:
            gc.collect()
        for t in d:
            str(t.lemma_)


def test_issue1518():
    """Test vectors.resize() works."""
    vectors = Vectors(shape=(10, 10))
    vectors.add('hello', row=2)
    vectors.resize((5, 9))


def test_issue1537():
    """Test that Span.as_doc() doesn't segfault."""
    string = 'The sky is blue . The man is pink . The dog is purple .'
    doc = Doc(Vocab(), words=string.split())
    doc[0].sent_start = True
    for word in doc[1:]:
        if word.nbor(-1).text == '.':
            word.sent_start = True
        else:
            word.sent_start = False
    sents = list(doc.sents)
    sent0 = sents[0].as_doc()
    sent1 = sents[1].as_doc()
    assert isinstance(sent0, Doc)
    assert isinstance(sent1, Doc)


# TODO: Currently segfaulting, due to l_edge and r_edge misalignment
# def test_issue1537_model():
#    nlp = load_spacy('en')
#    doc = nlp('The sky is blue. The man is pink. The dog is purple.')
#    sents = [s.as_doc() for s in doc.sents]
#    print(list(sents[0].noun_chunks))
#    print(list(sents[1].noun_chunks))


def test_issue1539():
    """Ensure vectors.resize() doesn't try to modify dictionary during iteration."""
    v = Vectors(shape=(10, 10), keys=[5, 3, 98, 100])
    v.resize((100, 100))


def test_issue1547():
    """Test that entity labels still match after merging tokens."""
    words = ['\n', 'worda', '.', '\n', 'wordb', '-', 'Biosphere', '2', '-', ' \n']
    doc = Doc(Vocab(), words=words)
    doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings['PRODUCT'])]
    doc[5:7].merge()
    assert [ent.text for ent in doc.ents]


def test_issue1612(en_tokenizer):
    doc = en_tokenizer('The black cat purrs.')
    span = doc[1:3]
    assert span.orth_ == span.text


def test_issue1654():
    nlp = Language(Vocab())
    assert not nlp.pipeline
    nlp.add_pipe(lambda doc: doc, name='1')
    nlp.add_pipe(lambda doc: doc, name='2', after='1')
    nlp.add_pipe(lambda doc: doc, name='3', after='2')
    assert nlp.pipe_names == ['1', '2', '3']
    nlp2 = Language(Vocab())
    assert not nlp2.pipeline
    nlp2.add_pipe(lambda doc: doc, name='3')
    nlp2.add_pipe(lambda doc: doc, name='2', before='3')
    nlp2.add_pipe(lambda doc: doc, name='1', before='2')
    assert nlp2.pipe_names == ['1', '2', '3']


@pytest.mark.parametrize('text', ['test@example.com', 'john.doe@example.co.uk'])
def test_issue1698(en_tokenizer, text):
    doc = en_tokenizer(text)
    assert len(doc) == 1
    assert not doc[0].like_url


def test_issue1727():
    """Test that models with no pretrained vectors can be deserialized
    correctly after vectors are added."""
    data = numpy.ones((3, 300), dtype='f')
    vectors = Vectors(data=data, keys=['I', 'am', 'Matt'])
    tagger = Tagger(Vocab())
    tagger.add_label('PRP')
    tagger.begin_training()
    assert tagger.cfg.get('pretrained_dims', 0) == 0
    tagger.vocab.vectors = vectors
    with make_tempdir() as path:
        tagger.to_disk(path)
        tagger = Tagger(Vocab()).from_disk(path)
        assert tagger.cfg.get('pretrained_dims', 0) == 0


def test_issue1757():
    """Test comparison against None doesn't cause segfault."""
    doc = Doc(Vocab(), words=['a', 'b', 'c'])
    assert not doc[0] < None
    assert not doc[0] == None
    assert doc[0] >= None
    assert not doc[:2] < None
    assert not doc[:2] == None
    assert doc[:2] >= None
    assert not doc.vocab['a'] == None
    assert not doc.vocab['a'] < None


def test_issue1758(en_tokenizer):
    """Test that "would've" is handled by the English tokenizer exceptions."""
    tokens = en_tokenizer("would've")
    assert len(tokens) == 2
    assert tokens[0].tag_ == "MD"
    assert tokens[1].lemma_ == "have"


def test_issue1799():
    """Test sentence boundaries are deserialized correctly, even for
    non-projective sentences."""
    # HEAD values are relative offsets stored as uint64, so negative offsets
    # wrap around (18446744073709551615 is -1, 18446744073709551614 is -2).
    heads_deps = numpy.asarray([[1, 397], [4, 436], [2, 426], [1, 402],
                                [0, 8206900633647566924], [18446744073709551615, 440],
                                [18446744073709551614, 442]], dtype='uint64')
    doc = Doc(Vocab(), words='Just what I was looking for .'.split())
    doc.vocab.strings.add('ROOT')
    doc = doc.from_array([HEAD, DEP], heads_deps)
    assert len(list(doc.sents)) == 1


def test_issue1807():
    """Test vocab.set_vector also adds the word to the vocab."""
    vocab = Vocab()
    assert 'hello' not in vocab
    vocab.set_vector('hello', numpy.ones((50,), dtype='f'))
    assert 'hello' in vocab


def test_issue1834():
    """Test that sentence boundaries & parse/tag flags are not lost
    during serialization."""
    string = "This is a first sentence . And another one"
    doc = Doc(Vocab(), words=string.split())
    doc[6].sent_start = True
    new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
    assert new_doc[6].sent_start
    assert not new_doc.is_parsed
    assert not new_doc.is_tagged
    doc.is_parsed = True
    doc.is_tagged = True
    new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
    assert new_doc.is_parsed
    assert new_doc.is_tagged


def test_issue1868():
    """Test Vocab.__contains__ works with int keys."""
    vocab = Vocab()
    lex = vocab['hello']
    assert lex.orth in vocab
    assert lex.orth_ in vocab
    assert 'some string' not in vocab
    int_id = vocab.strings.add('some string')
    assert int_id not in vocab


def test_issue1883():
    matcher = Matcher(Vocab())
    matcher.add('pat1', None, [{'orth': 'hello'}])
    doc = Doc(matcher.vocab, words=['hello'])
    assert len(matcher(doc)) == 1
    new_matcher = copy.deepcopy(matcher)
    new_doc = Doc(new_matcher.vocab, words=['hello'])
    assert len(new_matcher(new_doc)) == 1


@pytest.mark.parametrize('word', ['the'])
def test_issue1889(word):
    assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS)


def test_issue1915():
    cfg = {'hidden_depth': 2}  # should error out
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe('ner'))
    nlp.get_pipe('ner').add_label('answer')
    with pytest.raises(ValueError):
        nlp.begin_training(**cfg)


def test_issue1945():
    """Test regression in Matcher introduced in v2.0.6."""
    matcher = Matcher(Vocab())
    matcher.add('MWE', None, [{'orth': 'a'}, {'orth': 'a'}])
    doc = Doc(matcher.vocab, words=['a', 'a', 'a'])
    matches = matcher(doc)  # we should see two overlapping matches here
    assert len(matches) == 2
    assert matches[0][1:] == (0, 2)
    assert matches[1][1:] == (1, 3)


@pytest.mark.parametrize('label', ['U-JOB-NAME'])
def test_issue1967(label):
    ner = EntityRecognizer(Vocab())
    entry = ([0], ['word'], ['tag'], [0], ['dep'], [label])
    gold_parses = [(None, [(entry, None)])]
    ner.moves.get_actions(gold_parses=gold_parses)