mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-27 10:26:35 +03:00
75f3234404
## Description Related issues: #2379 (should be fixed by separating model tests) * **total execution time down from > 300 seconds to under 60 seconds** 🎉 * removed all model-specific tests that could only really be run manually anyway – those will now live in a separate test suite in the [`spacy-models`](https://github.com/explosion/spacy-models) repository and are already integrated into our new model training infrastructure * changed all relative imports to absolute imports to prepare for moving the test suite from `/spacy/tests` to `/tests` (it'll now always test against the installed version) * merged old regression tests into collections, e.g. `test_issue1001-1500.py` (about 90% of the regression tests are very short anyways) * tidied up and rewrote existing tests wherever possible ### Todo - [ ] move tests to `/tests` and adjust CI commands accordingly - [x] move model test suite from internal repo to `spacy-models` - [x] ~~investigate why `pipeline/test_textcat.py` is flakey~~ - [x] review old regression tests (leftover files) and see if they can be merged, simplified or deleted - [ ] update documentation on how to run tests ### Types of change enhancement, tests ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.
421 lines
14 KiB
Python
421 lines
14 KiB
Python
# coding: utf-8
|
||
from __future__ import unicode_literals
|
||
|
||
import pytest
|
||
import random
|
||
from spacy.matcher import Matcher
|
||
from spacy.attrs import IS_PUNCT, ORTH, LOWER
|
||
from spacy.symbols import POS, VERB, VerbForm_inf
|
||
from spacy.vocab import Vocab
|
||
from spacy.language import Language
|
||
from spacy.lemmatizer import Lemmatizer
|
||
from spacy.tokens import Doc
|
||
|
||
from ..util import get_doc, make_tempdir
|
||
|
||
|
||
@pytest.mark.parametrize('patterns', [
    [[{'LOWER': 'celtics'}], [{'LOWER': 'boston'}, {'LOWER': 'celtics'}]],
    [[{'LOWER': 'boston'}, {'LOWER': 'celtics'}], [{'LOWER': 'celtics'}]]])
def test_issue118(en_tokenizer, patterns):
    """Regression test for issue #118: overlapping matcher results.

    The two patterns are supplied in both orders; either way the matcher has
    to report the same two spans, and the longer one must be usable as the
    document's single entity.
    """
    doc = en_tokenizer("how many points did lebron james score against the boston celtics last night")
    org_label = doc.vocab.strings['ORG']
    matcher = Matcher(doc.vocab)
    matcher.add("BostonCeltics", None, *patterns)
    # The freshly tokenized doc carries no entities yet.
    assert not list(doc.ents)
    found = [(org_label, begin, stop) for _, begin, stop in matcher(doc)]
    assert found == [(org_label, 9, 11), (org_label, 10, 11)]
    # Only the first (longer) match is turned into an entity.
    doc.ents = found[:1]
    entities = list(doc.ents)
    assert len(entities) == 1
    entity = entities[0]
    assert entity.label == org_label
    assert entity.start == 9
    assert entity.end == 11
|
||
|
||
|
||
@pytest.mark.parametrize('patterns', [
    [[{'LOWER': 'boston'}], [{'LOWER': 'boston'}, {'LOWER': 'celtics'}]],
    [[{'LOWER': 'boston'}, {'LOWER': 'celtics'}], [{'LOWER': 'boston'}]]])
def test_issue118_prefix_reorder(en_tokenizer, patterns):
    """Regression test for issue #118 where one pattern is a prefix of the
    other.

    The patterns are supplied in both orders; the matcher has to report the
    short span first and the long span second in either case.
    """
    doc = en_tokenizer("how many points did lebron james score against the boston celtics last night")
    org_label = doc.vocab.strings['ORG']
    matcher = Matcher(doc.vocab)
    matcher.add('BostonCeltics', None, *patterns)
    assert not list(doc.ents)
    found = [(org_label, begin, stop) for _, begin, stop in matcher(doc)]
    # Skip the first match and append the remaining one as an entity.
    doc.ents = doc.ents + tuple(found[1:])
    assert found == [(org_label, 9, 10), (org_label, 9, 11)]
    entities = doc.ents
    assert len(entities) == 1
    entity = entities[0]
    assert entity.label == org_label
    assert entity.start == 9
    assert entity.end == 11
|
||
|
||
|
||
def test_issue242(en_tokenizer):
    """Regression test for issue #242: two multi-word phrases that share a
    token ("food safety" / "safety standards") must both be matched."""
    doc = en_tokenizer("There are different food safety standards in different countries.")
    food_safety = [{'LOWER': 'food'}, {'LOWER': 'safety'}]
    safety_standards = [{'LOWER': 'safety'}, {'LOWER': 'standards'}]
    matcher = Matcher(doc.vocab)
    matcher.add('FOOD', None, food_safety, safety_standards)

    found = [(label, begin, stop) for label, begin, stop in matcher(doc)]
    doc.ents = doc.ents + tuple(found)
    # Exactly two matches are expected; unpacking fails otherwise.
    first, second = found
    assert first[1] == 3
    assert first[2] == 5
    assert second[1] == 4
    assert second[2] == 6
|
||
|
||
|
||
def test_issue309(en_tokenizer):
    """Regression test for issue #309: sentence boundary detection on a
    whitespace-only text must produce exactly one sentence."""
    tokenized = en_tokenizer(" ")
    words = [token.text for token in tokenized]
    doc = get_doc(tokenized.vocab, words=words, heads=[0], deps=['ROOT'])
    # Flag the doc as parsed before doc.sents is read below.
    doc.is_parsed = True
    assert len(doc) == 1
    sentences = list(doc.sents)
    assert len(sentences) == 1
|
||
|
||
|
||
def test_issue351(en_tokenizer):
    """Regression test for issue #351: leading whitespace forms its own
    token, and the character offset of the following token accounts for it."""
    # The text must start with exactly three spaces: the assertions below
    # require the whitespace token to be three characters long and the first
    # word to begin at character offset 3.
    doc = en_tokenizer("   This is a cat.")
    assert doc[0].idx == 0
    assert len(doc[0]) == 3
    assert doc[1].idx == 3
|
||
|
||
|
||
def test_issue360(en_tokenizer):
    """Regression test for issue #360: a long run of dots between two words
    must be split off, giving more than two tokens."""
    doc = en_tokenizer('$45...............Asking')
    token_count = len(doc)
    assert token_count > 2
|
||
|
||
|
||
@pytest.mark.parametrize('text1,text2', [("cat", "dog")])
def test_issue361(en_vocab, text1, text2):
    """Regression test for issue #361: lexemes compare equal for the same
    string and unequal for different strings."""
    # Look the first string up twice so two separate lookups are compared.
    first_lookup = en_vocab[text1]
    second_lookup = en_vocab[text1]
    assert first_lookup == second_lookup
    assert en_vocab[text1] != en_vocab[text2]
|
||
|
||
|
||
def test_issue587(en_tokenizer):
    """Regression test for issue #587: the Matcher must not crash on this
    input while patterns are added one after another."""
    doc = en_tokenizer('a b; c')
    matcher = Matcher(doc.vocab)
    # (rule name, pattern, number of matches expected once the rule is added)
    steps = [
        ('TEST1', [{ORTH: 'a'}, {ORTH: 'b'}], 1),
        ('TEST2', [{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'c'}], 2),
        # The last pattern ends in 'd', which is not in the text, so the
        # match count stays unchanged.
        ('TEST3', [{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'd'}], 2),
    ]
    for rule_name, pattern, expected_count in steps:
        matcher.add(rule_name, None, pattern)
        found = matcher(doc)
        assert len(found) == expected_count
|
||
|
||
|
||
def test_issue588(en_vocab):
    """Regression test for issue #588: adding an empty pattern to the
    Matcher raises ValueError."""
    empty_pattern = []
    matcher = Matcher(en_vocab)
    with pytest.raises(ValueError):
        matcher.add('TEST', None, empty_pattern)
|
||
|
||
|
||
@pytest.mark.xfail
def test_issue589():
    """Regression test for issue #589: build a Doc containing a new word
    after the string store was frozen. Marked as an expected failure."""
    frozen_vocab = Vocab()
    frozen_vocab.strings.set_frozen(True)
    # Only construction is exercised; the resulting Doc is not inspected.
    Doc(frozen_vocab, words=['whata'])
|
||
|
||
|
||
def test_issue590(en_vocab):
    """Regression test for issue #590: two patterns registered under the
    same key must each produce their match."""
    words = ['n', '=', '1', ';', 'a', ':', '5', '%']
    doc = Doc(en_vocab, words=words)
    percent_pattern = [{'IS_ALPHA': True}, {'ORTH': ':'}, {'LIKE_NUM': True}, {'ORTH': '%'}]
    equals_pattern = [{'IS_ALPHA': True}, {'ORTH': '='}, {'LIKE_NUM': True}]
    matcher = Matcher(en_vocab)
    # Both patterns are added separately under the identical key 'ab'.
    matcher.add('ab', None, percent_pattern)
    matcher.add('ab', None, equals_pattern)
    found = matcher(doc)
    assert len(found) == 2
|
||
|
||
|
||
def test_issue595():
    """Regression test for issue #595: a verb tagged as infinitive keeps its
    text as lemma, although the "ed" -> "e" suffix rule is defined."""
    suffix_rules = {"verb": [["ed", "e"]]}
    lemmatizer = Lemmatizer({'verb': {}}, {'verb': {}}, suffix_rules)
    # 'VB' is mapped to an infinitive verb form.
    tag_map = {'VB': {POS: VERB, VerbForm_inf: True}}
    vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
    doc = Doc(vocab, words=["Do", "n't", "feed", "the", "dog"])
    verb = doc[2]
    verb.tag_ = 'VB'
    assert doc[2].text == 'feed'
    assert doc[2].lemma_ == 'feed'
|
||
|
||
|
||
def test_issue599(en_vocab):
    """Regression test for issue #599: the is_parsed flag of an empty Doc
    survives a to_bytes / from_bytes round trip."""
    original = Doc(en_vocab)
    original.is_tagged = True
    original.is_parsed = True
    serialized = original.to_bytes()
    restored = Doc(original.vocab)
    restored.from_bytes(serialized)
    assert restored.is_parsed
|
||
|
||
|
||
def test_issue600():
    """Regression test for issue #600: assign a tag from a custom tag map to
    a token. The test has no assertion; it passes if nothing raises."""
    custom_tag_map = {'NN': {'pos': 'NOUN'}}
    vocab = Vocab(tag_map=custom_tag_map)
    doc = Doc(vocab, words=["hello"])
    first_token = doc[0]
    first_token.tag_ = 'NN'
|
||
|
||
|
||
def test_issue615(en_tokenizer):
    """Regression test for issue #615: entities set from a matcher callback
    that merges the matched phrase must carry a non-zero label."""

    def merge_phrases(matcher, doc, i, matches):
        """Matcher callback that merges every matched phrase into one token.

        Merging changes token indices, so nothing is done until the callback
        is invoked for the final match; then all phrases are merged at once.
        """
        last_index = len(matches) - 1
        if i != last_index:
            return None
        # Create all spans up front, before any merge happens.
        labelled_spans = [(ent_id, doc[start:end])
                          for ent_id, start, end in matches]
        for label, span in labelled_spans:
            tag = 'NNP' if label else span.root.tag_
            span.merge(tag=tag, lemma=span.text, label=label)
            doc.ents = doc.ents + ((label, span.start, span.end),)

    label = "Sport_Equipment"
    doc = en_tokenizer("The golf club is broken")
    matcher = Matcher(doc.vocab)
    matcher.add(label, merge_phrases, [{'ORTH': "golf"}, {'ORTH': "club"}])
    # The return value is not needed; the callback does the work.
    matcher(doc)
    entities = list(doc.ents)
    assert entities != []
    assert entities[0].label != 0
|
||
|
||
|
||
@pytest.mark.parametrize('text,number', [("7am", "7"), ("11p.m.", "11")])
def test_issue736(en_tokenizer, text, number):
    """Regression test for issue #736: times such as "7am" are split into
    two tokens, the first of which is the number as a string."""
    doc = en_tokenizer(text)
    assert len(doc) == 2
    leading = doc[0]
    assert leading.text == number
|
||
|
||
|
||
@pytest.mark.parametrize('text', ["3/4/2012", "01/12/1900"])
def test_issue740(en_tokenizer, text):
    """Regression test for issue #740: slash-separated dates stay a single
    token.

    The behaviour is currently inconsistent, because dates separated by
    hyphens are still split. Preventing that would be hard without causing
    clashes with numeric ranges.
    """
    doc = en_tokenizer(text)
    token_count = len(doc)
    assert token_count == 1
|
||
|
||
|
||
def test_issue743():
    """Regression test for issue #743: a Token can be stored in a set and
    comes back out as the very same object."""
    doc = Doc(Vocab(), ['hello', 'world'])
    token = doc[0]
    token_set = {token}
    members = list(token_set)
    assert members[0] is token
|
||
|
||
|
||
@pytest.mark.parametrize('text', ["We were scared", "We Were Scared"])
def test_issue744(en_tokenizer, text):
    """Regression test for issue #744: 'were' and 'Were' are not treated as
    contractions by the English tokenizer exceptions."""
    doc = en_tokenizer(text)
    assert len(doc) == 3
    middle = doc[1]
    assert middle.text.lower() == "were"
|
||
|
||
|
||
@pytest.mark.parametrize('text,is_num', [("one", True), ("ten", True),
                                         ("teneleven", False)])
def test_issue759(en_tokenizer, text, is_num):
    """Regression test for issue #759: like_num is true for number words but
    false for a concatenation of number words."""
    doc = en_tokenizer(text)
    first_token = doc[0]
    assert first_token.like_num == is_num
|
||
|
||
|
||
@pytest.mark.parametrize('text', ["Shell", "shell", "Shed", "shed"])
def test_issue775(en_tokenizer, text):
    """Regression test for issue #775: words like 'Shell' and 'shed' are not
    treated as contractions by the English tokenizer exceptions."""
    doc = en_tokenizer(text)
    assert len(doc) == 1
    only_token = doc[0]
    assert only_token.text == text
|
||
|
||
|
||
@pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"])
def test_issue792(en_tokenizer, text):
    """Regression test for issue #792: trailing whitespace must not be lost
    when the text is rebuilt from the tokens."""
    doc = en_tokenizer(text)
    rebuilt = ''.join(token.text_with_ws for token in doc)
    assert rebuilt == text
|
||
|
||
|
||
@pytest.mark.parametrize('text', ["This is a string", "This is a string\n"])
def test_control_issue792(en_tokenizer, text):
    """Control case for issue #792: texts without a trailing space are
    rebuilt unchanged from the tokens."""
    doc = en_tokenizer(text)
    rebuilt = ''.join(token.text_with_ws for token in doc)
    assert rebuilt == text
|
||
|
||
|
||
@pytest.mark.parametrize('text,tokens', [
    ('"deserve,"--and', ['"', "deserve", ',"--', "and"]),
    ("exception;--exclusive", ["exception", ";--", "exclusive"]),
    ("day.--Is", ["day", ".--", "Is"]),
    ("refinement:--just", ["refinement", ":--", "just"]),
    ("memories?--To", ["memories", "?--", "To"]),
    ("Useful.=--Therefore", ["Useful", ".=--", "Therefore"]),
    ("=Hope.=--Pandora", ["=", "Hope", ".=--", "Pandora"])])
def test_issue801(en_tokenizer, text, tokens):
    """Regression test for issue #801: special characters followed by
    hyphens are split off as expected."""
    doc = en_tokenizer(text)
    assert len(doc) == len(tokens)
    actual_texts = [token.text for token in doc]
    assert actual_texts == tokens
|
||
|
||
|
||
@pytest.mark.parametrize('text,expected_tokens', [
    ('Smörsåsen används bl.a. till fisk', ['Smörsåsen', 'används', 'bl.a.', 'till', 'fisk']),
    ('Jag kommer först kl. 13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar'])
])
def test_issue805(sv_tokenizer, text, expected_tokens):
    """Regression test for issue #805: Swedish abbreviations containing
    periods are kept as single tokens."""
    doc = sv_tokenizer(text)
    # Whitespace tokens are left out of the comparison.
    non_space_texts = [token.text for token in doc if not token.is_space]
    assert expected_tokens == non_space_texts
|
||
|
||
|
||
def test_issue850():
    """Regression test for issue #850: a variable-length pattern element
    could also match the token that follows it. Check that this ambiguity
    is resolved and a single match covering the whole text is returned."""
    vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
    matcher = Matcher(vocab)
    IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
    pattern = [
        {'LOWER': "bob"},
        {'OP': '*', 'IS_ANY_TOKEN': True},
        {'LOWER': 'frank'},
    ]
    matcher.add('FarAway', None, pattern)
    doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'frank'])
    found = matcher(doc)
    assert len(found) == 1
    _, begin, stop = found[0]
    assert begin == 0
    assert stop == 4
|
||
|
||
|
||
def test_issue850_basic():
    """Basic case for issue #850: the '*' operator on a plain LOWER attribute
    yields a single match covering the whole text."""
    vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
    matcher = Matcher(vocab)
    # The flag is registered on the vocab but not referenced by the pattern.
    IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
    pattern = [
        {'LOWER': "bob"},
        {'OP': '*', 'LOWER': 'and'},
        {'LOWER': 'frank'},
    ]
    matcher.add('FarAway', None, pattern)
    doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'frank'])
    found = matcher(doc)
    assert len(found) == 1
    _, begin, stop = found[0]
    assert begin == 0
    assert stop == 4
|
||
|
||
|
||
@pytest.mark.parametrize('text', ["au-delàs", "pair-programmâmes",
                                  "terra-formées", "σ-compacts"])
def test_issue852(fr_tokenizer, text):
    """Regression test for issue #852: hyphenated words covered by the
    French tokenizer exceptions stay one token."""
    doc = fr_tokenizer(text)
    token_count = len(doc)
    assert token_count == 1
|
||
|
||
|
||
@pytest.mark.parametrize('text', ["aaabbb@ccc.com\nThank you!",
                                  "aaabbb@ccc.com \nThank you!"])
def test_issue859(en_tokenizer, text):
    """Regression test for issue #859: doc.text reproduces the input
    exactly, without an additional space."""
    doc = en_tokenizer(text)
    reproduced = doc.text
    assert reproduced == text
|
||
|
||
|
||
@pytest.mark.parametrize('text', ["Datum:2014-06-02\nDokument:76467"])
def test_issue886(en_tokenizer, text):
    """Regression test for issue #886: token.idx points at the token's first
    character in the original text, also for texts containing newlines."""
    for token in en_tokenizer(text):
        # No token in this text may carry trailing whitespace.
        assert len(token.text) == len(token.text_with_ws)
        first_char = token.text[0]
        assert text[token.idx] == first_char
|
||
|
||
|
||
@pytest.mark.parametrize('text', ["want/need"])
def test_issue891(en_tokenizer, text):
    """Regression test for issue #891: a '/' infix is split into its own
    token."""
    doc = en_tokenizer(text)
    assert len(doc) == 3
    infix = doc[1]
    assert infix.text == "/"
|
||
|
||
|
||
@pytest.mark.parametrize('text,tag,lemma', [
    ("anus", "NN", "anus"),
    ("princess", "NN", "princess"),
    ("inner", "JJ", "inner")
])
def test_issue912(en_vocab, text, tag, lemma):
    """Regression test for issue #912: words that are already base forms
    keep that form as their lemma once tagged."""
    doc = Doc(en_vocab, words=[text])
    token = doc[0]
    token.tag_ = tag
    assert doc[0].lemma_ == lemma
|
||
|
||
|
||
def test_issue957(en_tokenizer):
    """Regression test for issue #957: the tokenizer must not hang on a text
    containing many periods.

    The test is skipped when the pytest-timeout plugin is not installed.
    """
    # importorskip() needs the importable module name. The distribution is
    # called "pytest-timeout", but a hyphenated name can never be imported,
    # so using it here would skip the test unconditionally.
    pytest.importorskip('pytest_timeout')
    # Builds "0.1.2. ... .99".
    string = '.'.join('%d' % i for i in range(100))
    en_tokenizer(string)
|
||
|
||
|
||
@pytest.mark.xfail
def test_issue999(train_data):
    """Regression test for issue #999: adding entity labels, training,
    saving and reloading the pipeline should work passably OK.

    Two known problems are involved:
    1) The labels have to be re-added, which isn't very nice.
    2) The learning rate of the weight update cannot be set, so the model
       ends up out-of-scale and learns too fast.
    """
    examples = [
        ["hey", []],
        ["howdy", []],
        ["hey there", []],
        ["hello", []],
        ["hi", []],
        ["i'm looking for a place to eat", []],
        ["i'm looking for a place in the north of town", [[31,36,"LOCATION"]]],
        ["show me chinese restaurants", [[8,15,"CUISINE"]]],
        ["show me chines restaurants", [[8,14,"CUISINE"]]],
    ]

    nlp = Language()
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
    # Register every label that occurs in the annotations.
    for _, annotations in examples:
        for _begin, _stop, label in annotations:
            ner.add_label(label)
    nlp.begin_training()
    ner.model.learn_rate = 0.001
    for _epoch in range(100):
        random.shuffle(examples)
        for raw_text, entity_offsets in examples:
            nlp.update([raw_text], [{'entities': entity_offsets}])

    # Round-trip the trained pipeline through disk.
    with make_tempdir() as model_dir:
        nlp.to_disk(model_dir)
        nlp2 = Language().from_disk(model_dir)

    for raw_text, entity_offsets in examples:
        doc = nlp2(raw_text)
        predicted = {(ent.start_char, ent.end_char): ent.label_
                     for ent in doc.ents}
        # Find the first annotated span that was also predicted.
        hit = next(((begin, stop, label)
                    for begin, stop, label in entity_offsets
                    if (begin, stop) in predicted), None)
        if hit is not None:
            begin, stop, label = hit
            assert predicted[(begin, stop)] == label
        elif entity_offsets:
            # Annotations exist, but none of the spans was predicted.
            raise Exception(predicted)
|