Merge regression tests

Ines Montani 2019-02-24 20:31:38 +01:00
parent 3bc53905cc
commit 328b589deb
8 changed files with 129 additions and 161 deletions

View File

@@ -11,7 +11,7 @@ from spacy.lang.lex_attrs import is_stop
from spacy.vectors import Vectors
from spacy.vocab import Vocab
from spacy.language import Language
-from spacy.tokens import Doc, Span
+from spacy.tokens import Doc, Span, Token
from spacy.pipeline import Tagger, EntityRecognizer
from spacy.attrs import HEAD, DEP
from spacy.matcher import Matcher
@@ -272,3 +272,60 @@ def test_issue1967(label):
    entry = ([0], ["word"], ["tag"], [0], ["dep"], [label])
    gold_parses = [(None, [(entry, None)])]
    ner.moves.get_actions(gold_parses=gold_parses)


def test_issue1971(en_vocab):
    # Possibly related to #2675 and #2671?
    matcher = Matcher(en_vocab)
    pattern = [
        {"ORTH": "Doe"},
        {"ORTH": "!", "OP": "?"},
        {"_": {"optional": True}, "OP": "?"},
        {"ORTH": "!", "OP": "?"},
    ]
    Token.set_extension("optional", default=False)
    matcher.add("TEST", None, pattern)
    doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"])
    # We could also assert length 1 here, but this is more conclusive, because
    # the real problem here is that it returns a duplicate match for a match_id
    # that's not actually in the vocab!
    matches = matcher(doc)
    assert all([match_id in en_vocab.strings for match_id, start, end in matches])


def test_issue_1971_2(en_vocab):
    matcher = Matcher(en_vocab)
    pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}]
    pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}]  # {"IN": ["EUR"]}}]
    doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"])
    matcher.add("TEST1", None, pattern1, pattern2)
    matches = matcher(doc)
    assert len(matches) == 2


def test_issue_1971_3(en_vocab):
    """Test that pattern matches correctly for multiple extension attributes."""
    Token.set_extension("a", default=1, force=True)
    Token.set_extension("b", default=2, force=True)
    doc = Doc(en_vocab, words=["hello", "world"])
    matcher = Matcher(en_vocab)
    matcher.add("A", None, [{"_": {"a": 1}}])
    matcher.add("B", None, [{"_": {"b": 2}}])
    matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc))
    assert len(matches) == 4
    assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)])


def test_issue_1971_4(en_vocab):
    """Test that pattern matches correctly with multiple extension attribute
    values on a single token.
    """
    Token.set_extension("ext_a", default="str_a", force=True)
    Token.set_extension("ext_b", default="str_b", force=True)
    matcher = Matcher(en_vocab)
    doc = Doc(en_vocab, words=["this", "is", "text"])
    pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3
    matcher.add("TEST", None, pattern)
    matches = matcher(doc)
    # Before the underlying bug was fixed, uncommenting this assert caused
    # a segmentation fault
    assert len(matches) == 1

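For reference, the tests above exercise the Matcher's "_" pattern key, which matches against custom extension attributes registered via Token.set_extension. Below is a minimal, self-contained sketch of that technique, assuming spaCy 2.1 or later; the "is_city" attribute is illustrative and not part of this commit.

# Minimal sketch, assuming spaCy 2.1+. The "is_city" extension attribute
# is illustrative, not taken from the commit above.
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Token

nlp = spacy.blank("en")
Token.set_extension("is_city", default=False, force=True)

doc = nlp("I live in Berlin")
doc[3]._.is_city = True  # set the flag by hand for the example

matcher = Matcher(nlp.vocab)
# Extension attributes go under the "_" key of a token pattern
matcher.add("CITY", None, [{"_": {"is_city": True}}])
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)  # CITY Berlin

As in the tests, the extension has to be registered before matching; the "_" value is then treated like any other attribute dict in the pattern.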
View File

@@ -1,62 +0,0 @@
# coding: utf8
from __future__ import unicode_literals

from spacy.matcher import Matcher
from spacy.tokens import Token, Doc


def test_issue1971(en_vocab):
    # Possibly related to #2675 and #2671?
    matcher = Matcher(en_vocab)
    pattern = [
        {"ORTH": "Doe"},
        {"ORTH": "!", "OP": "?"},
        {"_": {"optional": True}, "OP": "?"},
        {"ORTH": "!", "OP": "?"},
    ]
    Token.set_extension("optional", default=False)
    matcher.add("TEST", None, pattern)
    doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"])
    # We could also assert length 1 here, but this is more conclusive, because
    # the real problem here is that it returns a duplicate match for a match_id
    # that's not actually in the vocab!
    matches = matcher(doc)
    assert all([match_id in en_vocab.strings for match_id, start, end in matches])


def test_issue_1971_2(en_vocab):
    matcher = Matcher(en_vocab)
    pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}]
    pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}]  # {"IN": ["EUR"]}}]
    doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"])
    matcher.add("TEST1", None, pattern1, pattern2)
    matches = matcher(doc)
    assert len(matches) == 2


def test_issue_1971_3(en_vocab):
    """Test that pattern matches correctly for multiple extension attributes."""
    Token.set_extension("a", default=1, force=True)
    Token.set_extension("b", default=2, force=True)
    doc = Doc(en_vocab, words=["hello", "world"])
    matcher = Matcher(en_vocab)
    matcher.add("A", None, [{"_": {"a": 1}}])
    matcher.add("B", None, [{"_": {"b": 2}}])
    matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc))
    assert len(matches) == 4
    assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)])


def test_issue_1971_4(en_vocab):
    """Test that pattern matches correctly with multiple extension attribute
    values on a single token.
    """
    Token.set_extension("ext_a", default="str_a", force=True)
    Token.set_extension("ext_b", default="str_b", force=True)
    matcher = Matcher(en_vocab)
    doc = Doc(en_vocab, words=["this", "is", "text"])
    pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3
    matcher.add("TEST", None, pattern)
    matches = matcher(doc)
    # Before the underlying bug was fixed, uncommenting this assert caused
    # a segmentation fault
    assert len(matches) == 1

View File

@@ -2,13 +2,15 @@
from __future__ import unicode_literals
import pytest
from spacy import displacy
from spacy.lang.en import English
from spacy.lang.ja import Japanese
from spacy.lang.xx import MultiLanguage
from spacy.language import Language
from spacy.matcher import Matcher
-from spacy.tokens import Span
+from spacy.tokens import Doc, Span
from spacy.vocab import Vocab
from spacy.compat import pickle
from spacy._ml import link_vectors_to_models
import numpy
@@ -54,6 +56,25 @@ def test_issue2626_2835(en_tokenizer, text):
    assert doc


def test_issue2656(en_tokenizer):
    """Test that the tokenizer correctly splits off punctuation after numbers
    with decimal points.
    """
    doc = en_tokenizer("I went for 40.3, and got home by 10.0.")
    assert len(doc) == 11
    assert doc[0].text == "I"
    assert doc[1].text == "went"
    assert doc[2].text == "for"
    assert doc[3].text == "40.3"
    assert doc[4].text == ","
    assert doc[5].text == "and"
    assert doc[6].text == "got"
    assert doc[7].text == "home"
    assert doc[8].text == "by"
    assert doc[9].text == "10.0"
    assert doc[10].text == "."


def test_issue2671():
    """Ensure the correct entity ID is returned for matches with quantifiers.
    See also #2675
@@ -77,6 +98,17 @@ def test_issue2671():
    assert nlp.vocab.strings[match_id] == pattern_id


def test_issue2728(en_vocab):
    """Test that displaCy ENT visualizer escapes HTML correctly."""
    doc = Doc(en_vocab, words=["test", "<RELEASE>", "test"])
    doc.ents = [Span(doc, 0, 1, label="TEST")]
    html = displacy.render(doc, style="ent")
    assert "&lt;RELEASE&gt;" in html
    doc.ents = [Span(doc, 1, 2, label="TEST")]
    html = displacy.render(doc, style="ent")
    assert "&lt;RELEASE&gt;" in html


def test_issue2754(en_tokenizer):
    """Test that words like 'a' and 'a.m.' don't get exceptional norm values."""
    a = en_tokenizer("a")
@@ -106,6 +138,28 @@ def test_issue2782(text, lang_cls):
    assert doc[0].like_num


def test_issue2822(it_tokenizer):
    """Test that the abbreviation of poco is kept as one word."""
    doc = it_tokenizer("Vuoi un po' di zucchero?")
    assert len(doc) == 6
    assert doc[0].text == "Vuoi"
    assert doc[1].text == "un"
    assert doc[2].text == "po'"
    assert doc[2].lemma_ == "poco"
    assert doc[3].text == "di"
    assert doc[4].text == "zucchero"
    assert doc[5].text == "?"


def test_issue2833(en_vocab):
    """Test that a custom error is raised if a token or span is pickled."""
    doc = Doc(en_vocab, words=["Hello", "world"])
    with pytest.raises(NotImplementedError):
        pickle.dumps(doc[0])
    with pytest.raises(NotImplementedError):
        pickle.dumps(doc[0:2])


def test_issue2871():
    """Test that vectors recover the correct key for spaCy reserved words."""
    words = ["dog", "cat", "SUFFIX"]
@@ -134,3 +188,19 @@ def test_issue2901():
    doc = nlp("pythonが大好きです")
    assert doc


def test_issue2926(fr_tokenizer):
    """Test that the tokenizer correctly splits tokens separated by a slash (/)
    when the token before the slash ends in a digit.
    """
    doc = fr_tokenizer("Learn html5/css3/javascript/jquery")
    assert len(doc) == 8
    assert doc[0].text == "Learn"
    assert doc[1].text == "html5"
    assert doc[2].text == "/"
    assert doc[3].text == "css3"
    assert doc[4].text == "/"
    assert doc[5].text == "javascript"
    assert doc[6].text == "/"
    assert doc[7].text == "jquery"

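A note on test_issue2833 above: Token and Span objects are thin views into their parent Doc, which is why spaCy raises NotImplementedError rather than pickling them. Below is a minimal sketch of the supported route, pickling the Doc itself, assuming spaCy 2.x; this snippet is not part of the commit.

# Minimal sketch, assuming spaCy 2.x: pickle the Doc, not its views.
import pickle

import spacy

nlp = spacy.blank("en")
doc = nlp("Hello world")
data = pickle.dumps(doc)  # supported: the Doc owns the underlying storage
restored = pickle.loads(data)
assert [t.text for t in restored] == ["Hello", "world"]
# pickle.dumps(doc[0]) raises NotImplementedError, as test_issue2833 asserts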
View File

@@ -1,24 +0,0 @@
# coding: utf8
from __future__ import unicode_literals

from spacy.lang.en import English


def test_issue2656():
    """Test that the tokenizer correctly splits off punctuation after numbers
    with decimal points.
    """
    text = "I went for 40.3, and got home by 10.0."
    nlp = English()
    doc = nlp(text)
    assert len(doc) == 11
    assert doc[0].text == "I"
    assert doc[1].text == "went"
    assert doc[2].text == "for"
    assert doc[3].text == "40.3"
    assert doc[4].text == ","
    assert doc[5].text == "and"
    assert doc[6].text == "got"
    assert doc[7].text == "home"
    assert doc[8].text == "by"
    assert doc[9].text == "10.0"
    assert doc[10].text == "."

View File

@@ -1,16 +0,0 @@
# coding: utf8
from __future__ import unicode_literals

from spacy import displacy
from spacy.tokens import Doc, Span


def test_issue2728(en_vocab):
    """Test that displaCy ENT visualizer escapes HTML correctly."""
    doc = Doc(en_vocab, words=["test", "<RELEASE>", "test"])
    doc.ents = [Span(doc, 0, 1, label="TEST")]
    html = displacy.render(doc, style="ent")
    assert "&lt;RELEASE&gt;" in html
    doc.ents = [Span(doc, 1, 2, label="TEST")]
    html = displacy.render(doc, style="ent")
    assert "&lt;RELEASE&gt;" in html

View File

@@ -1,21 +0,0 @@
# coding: utf8
from __future__ import unicode_literals

from spacy.lang.it import Italian


def test_issue2822():
    """Test that the abbreviation of poco is kept as one word."""
    nlp = Italian()
    text = "Vuoi un po' di zucchero?"
    doc = nlp(text)
    assert len(doc) == 6
    assert doc[0].text == "Vuoi"
    assert doc[1].text == "un"
    assert doc[2].text == "po'"
    assert doc[2].lemma_ == "poco"
    assert doc[3].text == "di"
    assert doc[4].text == "zucchero"
    assert doc[5].text == "?"

View File

@@ -1,15 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals

import pytest
from spacy.tokens import Doc
from spacy.compat import pickle


def test_issue2833(en_vocab):
    """Test that a custom error is raised if a token or span is pickled."""
    doc = Doc(en_vocab, words=["Hello", "world"])
    with pytest.raises(NotImplementedError):
        pickle.dumps(doc[0])
    with pytest.raises(NotImplementedError):
        pickle.dumps(doc[0:2])

View File

@@ -1,21 +0,0 @@
# coding: utf8
from __future__ import unicode_literals

from spacy.lang.fr import French


def test_issue2926():
    """Test that the tokenizer correctly splits tokens separated by a slash (/)
    when the token before the slash ends in a digit.
    """
    nlp = French()
    text = "Learn html5/css3/javascript/jquery"
    doc = nlp(text)
    assert len(doc) == 8
    assert doc[0].text == "Learn"
    assert doc[1].text == "html5"
    assert doc[2].text == "/"
    assert doc[3].text == "css3"
    assert doc[4].text == "/"
    assert doc[5].text == "javascript"
    assert doc[6].text == "/"
    assert doc[7].text == "jquery"
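
One pattern worth noting in this merge: the standalone files being deleted built a full pipeline per test (English(), Italian(), French()), while the merged versions take per-language tokenizer fixtures (en_tokenizer, it_tokenizer, fr_tokenizer) instead. Below is a minimal sketch of how such a fixture can be defined in conftest.py, assuming pytest and spaCy 2.x; the exact definition is an assumption, not copied from this commit.

# Minimal sketch of a conftest.py fixture in the spirit of the
# en_tokenizer / it_tokenizer / fr_tokenizer fixtures used above.
# Assumes pytest and spaCy 2.x.
import pytest

from spacy.util import get_lang_class


@pytest.fixture
def fr_tokenizer():
    # Build only the language's tokenizer, not a full pipeline,
    # so the regression tests stay cheap to run.
    return get_lang_class("fr").Defaults.create_tokenizer()

A tokenizer-only fixture skips full Language construction and can be shared across tests, which is presumably why the merged tests prefer it.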