Merge regression tests

Repository: https://github.com/explosion/spaCy.git
Commit: 328b589deb (parent: 3bc53905cc)
@@ -11,7 +11,7 @@ from spacy.lang.lex_attrs import is_stop
from spacy.vectors import Vectors
from spacy.vocab import Vocab
from spacy.language import Language
from spacy.tokens import Doc, Span
from spacy.tokens import Doc, Span, Token
from spacy.pipeline import Tagger, EntityRecognizer
from spacy.attrs import HEAD, DEP
from spacy.matcher import Matcher
@@ -272,3 +272,60 @@ def test_issue1967(label):
    entry = ([0], ["word"], ["tag"], [0], ["dep"], [label])
    gold_parses = [(None, [(entry, None)])]
    ner.moves.get_actions(gold_parses=gold_parses)


def test_issue1971(en_vocab):
    # Possibly related to #2675 and #2671?
    matcher = Matcher(en_vocab)
    pattern = [
        {"ORTH": "Doe"},
        {"ORTH": "!", "OP": "?"},
        {"_": {"optional": True}, "OP": "?"},
        {"ORTH": "!", "OP": "?"},
    ]
    Token.set_extension("optional", default=False)
    matcher.add("TEST", None, pattern)
    doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"])
    # We could also assert length 1 here, but this is more conclusive, because
    # the real problem here is that it returns a duplicate match for a match_id
    # that's not actually in the vocab!
    matches = matcher(doc)
    assert all([match_id in en_vocab.strings for match_id, start, end in matches])


def test_issue_1971_2(en_vocab):
    matcher = Matcher(en_vocab)
    pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}]
    pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}]  # {"IN": ["EUR"]}}]
    doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"])
    matcher.add("TEST1", None, pattern1, pattern2)
    matches = matcher(doc)
    assert len(matches) == 2


def test_issue_1971_3(en_vocab):
    """Test that pattern matches correctly for multiple extension attributes."""
    Token.set_extension("a", default=1, force=True)
    Token.set_extension("b", default=2, force=True)
    doc = Doc(en_vocab, words=["hello", "world"])
    matcher = Matcher(en_vocab)
    matcher.add("A", None, [{"_": {"a": 1}}])
    matcher.add("B", None, [{"_": {"b": 2}}])
    matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc))
    assert len(matches) == 4
    assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)])


def test_issue_1971_4(en_vocab):
    """Test that pattern matches correctly with multiple extension attribute
    values on a single token.
    """
    Token.set_extension("ext_a", default="str_a", force=True)
    Token.set_extension("ext_b", default="str_b", force=True)
    matcher = Matcher(en_vocab)
    doc = Doc(en_vocab, words=["this", "is", "text"])
    pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3
    matcher.add("TEST", None, pattern)
    matches = matcher(doc)
    # Uncommenting this caused a segmentation fault
    assert len(matches) == 1
@@ -1,62 +0,0 @@
# coding: utf8
from __future__ import unicode_literals

from spacy.matcher import Matcher
from spacy.tokens import Token, Doc


def test_issue1971(en_vocab):
    # Possibly related to #2675 and #2671?
    matcher = Matcher(en_vocab)
    pattern = [
        {"ORTH": "Doe"},
        {"ORTH": "!", "OP": "?"},
        {"_": {"optional": True}, "OP": "?"},
        {"ORTH": "!", "OP": "?"},
    ]
    Token.set_extension("optional", default=False)
    matcher.add("TEST", None, pattern)
    doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"])
    # We could also assert length 1 here, but this is more conclusive, because
    # the real problem here is that it returns a duplicate match for a match_id
    # that's not actually in the vocab!
    matches = matcher(doc)
    assert all([match_id in en_vocab.strings for match_id, start, end in matches])


def test_issue_1971_2(en_vocab):
    matcher = Matcher(en_vocab)
    pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}]
    pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}]  # {"IN": ["EUR"]}}]
    doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"])
    matcher.add("TEST1", None, pattern1, pattern2)
    matches = matcher(doc)
    assert len(matches) == 2


def test_issue_1971_3(en_vocab):
    """Test that pattern matches correctly for multiple extension attributes."""
    Token.set_extension("a", default=1, force=True)
    Token.set_extension("b", default=2, force=True)
    doc = Doc(en_vocab, words=["hello", "world"])
    matcher = Matcher(en_vocab)
    matcher.add("A", None, [{"_": {"a": 1}}])
    matcher.add("B", None, [{"_": {"b": 2}}])
    matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc))
    assert len(matches) == 4
    assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)])


def test_issue_1971_4(en_vocab):
    """Test that pattern matches correctly with multiple extension attribute
    values on a single token.
    """
    Token.set_extension("ext_a", default="str_a", force=True)
    Token.set_extension("ext_b", default="str_b", force=True)
    matcher = Matcher(en_vocab)
    doc = Doc(en_vocab, words=["this", "is", "text"])
    pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3
    matcher.add("TEST", None, pattern)
    matches = matcher(doc)
    # Uncommenting this caused a segmentation fault
    assert len(matches) == 1
@@ -2,13 +2,15 @@
from __future__ import unicode_literals

import pytest
from spacy import displacy
from spacy.lang.en import English
from spacy.lang.ja import Japanese
from spacy.lang.xx import MultiLanguage
from spacy.language import Language
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy.tokens import Doc, Span
from spacy.vocab import Vocab
from spacy.compat import pickle
from spacy._ml import link_vectors_to_models
import numpy
@@ -54,6 +56,25 @@ def test_issue2626_2835(en_tokenizer, text):
    assert doc


def test_issue2656(en_tokenizer):
    """Test that the tokenizer correctly splits off punctuation after numbers
    with decimal points.
    """
    doc = en_tokenizer("I went for 40.3, and got home by 10.0.")
    assert len(doc) == 11
    assert doc[0].text == "I"
    assert doc[1].text == "went"
    assert doc[2].text == "for"
    assert doc[3].text == "40.3"
    assert doc[4].text == ","
    assert doc[5].text == "and"
    assert doc[6].text == "got"
    assert doc[7].text == "home"
    assert doc[8].text == "by"
    assert doc[9].text == "10.0"
    assert doc[10].text == "."


def test_issue2671():
    """Ensure the correct entity ID is returned for matches with quantifiers.
    See also #2675
@@ -77,6 +98,17 @@ def test_issue2671():
    assert nlp.vocab.strings[match_id] == pattern_id


def test_issue2728(en_vocab):
    """Test that displaCy ENT visualizer escapes HTML correctly."""
    doc = Doc(en_vocab, words=["test", "<RELEASE>", "test"])
    doc.ents = [Span(doc, 0, 1, label="TEST")]
    html = displacy.render(doc, style="ent")
    assert "&lt;RELEASE&gt;" in html
    doc.ents = [Span(doc, 1, 2, label="TEST")]
    html = displacy.render(doc, style="ent")
    assert "&lt;RELEASE&gt;" in html


def test_issue2754(en_tokenizer):
    """Test that words like 'a' and 'a.m.' don't get exceptional norm values."""
    a = en_tokenizer("a")
@@ -106,6 +138,28 @@ def test_issue2782(text, lang_cls):
    assert doc[0].like_num


def test_issue2822(it_tokenizer):
    """Test that the abbreviation of poco is kept as one word."""
    doc = it_tokenizer("Vuoi un po' di zucchero?")
    assert len(doc) == 6
    assert doc[0].text == "Vuoi"
    assert doc[1].text == "un"
    assert doc[2].text == "po'"
    assert doc[2].lemma_ == "poco"
    assert doc[3].text == "di"
    assert doc[4].text == "zucchero"
    assert doc[5].text == "?"


def test_issue2833(en_vocab):
    """Test that a custom error is raised if a token or span is pickled."""
    doc = Doc(en_vocab, words=["Hello", "world"])
    with pytest.raises(NotImplementedError):
        pickle.dumps(doc[0])
    with pytest.raises(NotImplementedError):
        pickle.dumps(doc[0:2])


def test_issue2871():
    """Test that vectors recover the correct key for spaCy reserved words."""
    words = ["dog", "cat", "SUFFIX"]
@@ -134,3 +188,19 @@ def test_issue2901():

    doc = nlp("pythonが大好きです")
    assert doc


def test_issue2926(fr_tokenizer):
    """Test that the tokenizer correctly splits tokens separated by a slash (/)
    ending in a digit.
    """
    doc = fr_tokenizer("Learn html5/css3/javascript/jquery")
    assert len(doc) == 8
    assert doc[0].text == "Learn"
    assert doc[1].text == "html5"
    assert doc[2].text == "/"
    assert doc[3].text == "css3"
    assert doc[4].text == "/"
    assert doc[5].text == "javascript"
    assert doc[6].text == "/"
    assert doc[7].text == "jquery"
@@ -1,24 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.lang.en import English


def test_issue2656():
    """ Test that tokenizer correctly splits of punctuation after numbers with decimal points """
    text = "I went for 40.3, and got home by 10.0."
    nlp = English()
    doc = nlp(text)

    assert len(doc) == 11

    assert doc[0].text == "I"
    assert doc[1].text == "went"
    assert doc[2].text == "for"
    assert doc[3].text == "40.3"
    assert doc[4].text == ","
    assert doc[5].text == "and"
    assert doc[6].text == "got"
    assert doc[7].text == "home"
    assert doc[8].text == "by"
    assert doc[9].text == "10.0"
    assert doc[10].text == "."
@@ -1,16 +0,0 @@
# coding: utf8
from __future__ import unicode_literals

from spacy import displacy
from spacy.tokens import Doc, Span


def test_issue2728(en_vocab):
    """Test that displaCy ENT visualizer escapes HTML correctly."""
    doc = Doc(en_vocab, words=["test", "<RELEASE>", "test"])
    doc.ents = [Span(doc, 0, 1, label="TEST")]
    html = displacy.render(doc, style="ent")
    assert "&lt;RELEASE&gt;" in html
    doc.ents = [Span(doc, 1, 2, label="TEST")]
    html = displacy.render(doc, style="ent")
    assert "&lt;RELEASE&gt;" in html
@@ -1,21 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.lang.it import Italian


def test_issue2822():
    """ Test that the abbreviation of poco is kept as one word """
    nlp = Italian()
    text = "Vuoi un po' di zucchero?"

    doc = nlp(text)

    assert len(doc) == 6

    assert doc[0].text == "Vuoi"
    assert doc[1].text == "un"
    assert doc[2].text == "po'"
    assert doc[2].lemma_ == "poco"
    assert doc[3].text == "di"
    assert doc[4].text == "zucchero"
    assert doc[5].text == "?"
@@ -1,15 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals

import pytest
from spacy.tokens import Doc
from spacy.compat import pickle


def test_issue2833(en_vocab):
    """Test that a custom error is raised if a token or span is pickled."""
    doc = Doc(en_vocab, words=["Hello", "world"])
    with pytest.raises(NotImplementedError):
        pickle.dumps(doc[0])
    with pytest.raises(NotImplementedError):
        pickle.dumps(doc[0:2])
@@ -1,21 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.lang.fr import French


def test_issue2926():
    """ Test that the tokenizer correctly splits tokens separated by a slash (/) ending in a digit """
    nlp = French()
    text = "Learn html5/css3/javascript/jquery"
    doc = nlp(text)

    assert len(doc) == 8

    assert doc[0].text == "Learn"
    assert doc[1].text == "html5"
    assert doc[2].text == "/"
    assert doc[3].text == "css3"
    assert doc[4].text == "/"
    assert doc[5].text == "javascript"
    assert doc[6].text == "/"
    assert doc[7].text == "jquery"