Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-25 09:26:27 +03:00)
Test suite clean up (#5781)
* step_through tests: skip instead of xfail
* test_empty_doc should be fixed with new Thinc version
* remove outdated test (there are other misaligned tests now)
* add reason to xfail marker
* fix test according to French exceptions
* clarified some skipped tests
* skip Ukrainian test instead of xfail
* skip instead of xfail
* skip + reason instead of xfail
* removed obsolete tests referring to removed "set_frozen" functionality
* fix test 999
* remove unused AlignmentError
* remove xfail where possible, skip otherwise
* increment thinc release for empty_doc test
This commit is contained in:
parent 1b2ec94382
commit c9da9605f7
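Most of the changes below replace pytest's xfail marker with an explicit skip plus a reason. The practical difference: an xfail'd test is still executed and reported as XFAIL (or XPASS if it unexpectedly succeeds), while a skipped test never runs and its reason appears directly in the report. A minimal, illustrative sketch of the two markers (not taken from the diff):

import pytest


@pytest.mark.xfail(reason="known bug: the test still runs and is reported as xfail/xpass")
def test_known_bug():
    assert 1 + 1 == 3


@pytest.mark.skip(reason="relies on removed functionality: the test is not executed at all")
def test_removed_feature():
    assert False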
@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a17,<8.0.0a20",
+    "thinc>=8.0.0a18,<8.0.0a20",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations"
 ]
@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a17,<8.0.0a20
+thinc>=8.0.0a18,<8.0.0a20
 blis>=0.4.0,<0.5.0
 ml_datasets>=0.1.1
 murmurhash>=0.28.0,<1.1.0
@@ -34,13 +34,13 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a17,<8.0.0a20
+    thinc>=8.0.0a18,<8.0.0a20
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a17,<8.0.0a20
+    thinc>=8.0.0a18,<8.0.0a20
     blis>=0.4.0,<0.5.0
     wasabi>=0.7.0,<1.1.0
     srsly>=2.1.0,<3.0.0
@@ -568,7 +568,3 @@ class MatchPatternError(ValueError):
             pattern_errors = "\n".join([f"- {e}" for e in error_msgs])
             msg += f"\nPattern {pattern_idx}:\n{pattern_errors}\n"
         ValueError.__init__(self, msg)
-
-
-class AlignmentError(ValueError):
-    pass
@@ -119,9 +119,8 @@ def test_en_tokenizer_splits_period_abbr(en_tokenizer):
     assert tokens[4].text == "Mr."


-@pytest.mark.xfail
+@pytest.mark.xfail(reason="Issue #225 - not yet implemented")
 def test_en_tokenizer_splits_em_dash_infix(en_tokenizer):
-    # Re Issue #225
     tokens = en_tokenizer(
         """Will this road take me to Puddleton?\u2014No, """
         """you'll have to walk there.\u2014Ariel."""
@@ -14,7 +14,7 @@ def test_en_sbd_single_punct(en_tokenizer, text, punct):
     assert sum(len(sent) for sent in doc.sents) == len(doc)


-@pytest.mark.xfail
+@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
 def test_en_sentence_breaks(en_tokenizer, en_parser):
     # fmt: off
     text = "This is a sentence . This is another one ."
@@ -81,13 +81,14 @@ def test_fr_tokenizer_handles_title(fr_tokenizer):
     assert tokens[2].lemma_ == "ce"


-@pytest.mark.xfail
 def test_fr_tokenizer_handles_title_2(fr_tokenizer):
     text = "Est-ce pas génial?"
     tokens = fr_tokenizer(text)
-    assert len(tokens) == 6
+    assert len(tokens) == 5
     assert tokens[0].text == "Est"
     assert tokens[0].lemma_ == "être"
+    assert tokens[1].text == "-ce"
+    assert tokens[1].lemma_ == "ce"


 def test_fr_tokenizer_handles_title_3(fr_tokenizer):
@@ -89,7 +89,7 @@ def test_uk_tokenizer_splits_open_appostrophe(uk_tokenizer, text):
     assert tokens[0].text == "'"


-@pytest.mark.xfail(reason="See #3327")
+@pytest.mark.skip(reason="See Issue #3327 and PR #3329")
 @pytest.mark.parametrize("text", ["Тест''"])
 def test_uk_tokenizer_splits_double_end_quote(uk_tokenizer, text):
     tokens = uk_tokenizer(text)
@@ -83,7 +83,7 @@ def test_get_oracle_moves_negative_entities2(tsys, vocab):
     assert names


-@pytest.mark.xfail(reason="Maybe outdated? Unsure")
+@pytest.mark.skip(reason="Maybe outdated? Unsure")
 def test_get_oracle_moves_negative_O(tsys, vocab):
     doc = Doc(vocab, words=["A", "B", "C", "D"])
     entity_annots = ["O", "!O", "O", "!O"]
@@ -95,7 +95,7 @@ def test_get_oracle_moves_negative_O(tsys, vocab):

 # We can't easily represent this on a Doc object. Not sure what the best solution
 # would be, but I don't think it's an important use case?
-@pytest.mark.xfail(reason="No longer supported")
+@pytest.mark.skip(reason="No longer supported")
 def test_oracle_moves_missing_B(en_vocab):
     words = ["B", "52", "Bomber"]
     biluo_tags = [None, None, "L-PRODUCT"]
@@ -121,7 +121,7 @@ def test_oracle_moves_missing_B(en_vocab):

 # We can't easily represent this on a Doc object. Not sure what the best solution
 # would be, but I don't think it's an important use case?
-@pytest.mark.xfail(reason="No longer supported")
+@pytest.mark.skip(reason="No longer supported")
 def test_oracle_moves_whitespace(en_vocab):
     words = ["production", "\n", "of", "Northrop", "\n", "Corp.", "\n", "'s", "radar"]
     biluo_tags = ["O", "O", "O", "B-ORG", None, "I-ORG", "L-ORG", "O", "O"]
@@ -82,13 +82,13 @@ def test_update_doc(parser, model, doc, gold):
     parser.update([example], sgd=optimize)


-@pytest.mark.xfail
+@pytest.mark.skip(reason="No longer supported")
 def test_predict_doc_beam(parser, model, doc):
     parser.model = model
     parser(doc, beam_width=32, beam_density=0.001)


-@pytest.mark.xfail
+@pytest.mark.skip(reason="No longer supported")
 def test_update_doc_beam(parser, model, doc, gold):
     parser.model = model
@@ -33,8 +33,8 @@ def test_parser_root(en_tokenizer):
     assert t.dep != 0, t.text


-@pytest.mark.xfail
-# @pytest.mark.parametrize("text", ["Hello"])
+@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
+@pytest.mark.parametrize("text", ["Hello"])
 def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text):
     tokens = en_tokenizer(text)
     doc = get_doc(
@@ -47,8 +47,7 @@ def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text):
     assert doc[0].dep != 0


-# We removed the step_through API a while ago. we should bring it back though
-@pytest.mark.xfail(reason="Unsupported")
+@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
 def test_parser_initial(en_tokenizer, en_parser):
     text = "I ate the pizza with anchovies."
     # heads = [1, 0, 1, -2, -3, -1, -5]
@@ -93,8 +92,7 @@ def test_parser_merge_pp(en_tokenizer):
     assert doc[3].text == "occurs"


-# We removed the step_through API a while ago. we should bring it back though
-@pytest.mark.xfail(reason="Unsupported")
+@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
 def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser):
     text = "a b c d e"
@@ -28,7 +28,7 @@ def test_parser_sentence_space(en_tokenizer):
     assert len(list(doc.sents)) == 2


-@pytest.mark.xfail
+@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
 def test_parser_space_attachment_leading(en_tokenizer, en_parser):
     text = "\t \n This is a sentence ."
     heads = [1, 1, 0, 1, -2, -3]
@@ -44,7 +44,7 @@ def test_parser_space_attachment_leading(en_tokenizer, en_parser):
     assert stepwise.stack == set([2])


-@pytest.mark.xfail
+@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
 def test_parser_space_attachment_intermediate_trailing(en_tokenizer, en_parser):
     text = "This is \t a \t\n \n sentence . \n\n \n"
     heads = [1, 0, -1, 2, -1, -4, -5, -1]
@@ -64,7 +64,7 @@ def test_parser_space_attachment_intermediate_trailing(en_tokenizer, en_parser):


 @pytest.mark.parametrize("text,length", [(["\n"], 1), (["\n", "\t", "\n\n", "\t"], 4)])
-@pytest.mark.xfail
+@pytest.mark.skip(reason="The step_through API was removed (but should be brought back)")
 def test_parser_space_attachment_space(en_tokenizer, en_parser, text, length):
     doc = Doc(en_parser.vocab, words=text)
     assert len(doc) == length
@@ -1,10 +1,13 @@
 import pytest
 import random

+from spacy import util
+from spacy.gold import Example
 from spacy.matcher import Matcher
 from spacy.attrs import IS_PUNCT, ORTH, LOWER
 from spacy.symbols import POS, VERB
 from spacy.vocab import Vocab
-from spacy.language import Language
+from spacy.lang.en import English
 from spacy.lemmatizer import Lemmatizer
 from spacy.lookups import Lookups
 from spacy.tokens import Doc, Span
|
@ -141,14 +144,6 @@ def test_issue588(en_vocab):
|
||||||
matcher.add("TEST", [[]])
|
matcher.add("TEST", [[]])
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
|
||||||
def test_issue589():
|
|
||||||
vocab = Vocab()
|
|
||||||
vocab.strings.set_frozen(True)
|
|
||||||
doc = Doc(vocab, words=["whata"])
|
|
||||||
assert doc
|
|
||||||
|
|
||||||
|
|
||||||
def test_issue590(en_vocab):
|
def test_issue590(en_vocab):
|
||||||
"""Test overlapping matches"""
|
"""Test overlapping matches"""
|
||||||
doc = Doc(en_vocab, words=["n", "=", "1", ";", "a", ":", "5", "%"])
|
doc = Doc(en_vocab, words=["n", "=", "1", ";", "a", ":", "5", "%"])
|
||||||
|
@@ -285,7 +280,7 @@ def test_control_issue792(en_tokenizer, text):
     assert "".join([token.text_with_ws for token in doc]) == text


-@pytest.mark.xfail
+@pytest.mark.skip(reason="Can not be fixed unless with variable-width lookbehinds, cf. PR #3218")
 @pytest.mark.parametrize(
     "text,tokens",
     [
|
@ -417,8 +412,7 @@ def test_issue957(en_tokenizer):
|
||||||
assert doc
|
assert doc
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
def test_issue999():
|
||||||
def test_issue999(train_data):
|
|
||||||
"""Test that adding entities and resuming training works passably OK.
|
"""Test that adding entities and resuming training works passably OK.
|
||||||
There are two issues here:
|
There are two issues here:
|
||||||
1) We have to re-add labels. This isn't very nice.
|
1) We have to re-add labels. This isn't very nice.
|
||||||
|
@@ -432,27 +426,27 @@ def test_issue999(train_data):
         ["hello", []],
         ["hi", []],
         ["i'm looking for a place to eat", []],
-        ["i'm looking for a place in the north of town", [[31, 36, "LOCATION"]]],
-        ["show me chinese restaurants", [[8, 15, "CUISINE"]]],
-        ["show me chines restaurants", [[8, 14, "CUISINE"]]],
+        ["i'm looking for a place in the north of town", [(31, 36, "LOCATION")]],
+        ["show me chinese restaurants", [(8, 15, "CUISINE")]],
+        ["show me chines restaurants", [(8, 14, "CUISINE")]],
     ]

-    nlp = Language()
-    ner = nlp.create_pipe("ner")
+    nlp = English()
+    ner = nlp.create_pipe("ner", {"learn_rate": 0.001})  # will need to be {"model": ...} in upcoming PR
     nlp.add_pipe(ner)
     for _, offsets in TRAIN_DATA:
         for start, end, label in offsets:
             ner.add_label(label)
     nlp.begin_training()
-    ner.model.learn_rate = 0.001
-    for itn in range(100):
+    for itn in range(20):
         random.shuffle(TRAIN_DATA)
         for raw_text, entity_offsets in TRAIN_DATA:
-            nlp.update((raw_text, {"entities": entity_offsets}))
+            example = Example.from_dict(nlp.make_doc(raw_text), {"entities": entity_offsets})
+            nlp.update([example])

     with make_tempdir() as model_dir:
         nlp.to_disk(model_dir)
-        nlp2 = Language().from_disk(model_dir)
+        nlp2 = util.load_model_from_path(model_dir)

     for raw_text, entity_offsets in TRAIN_DATA:
         doc = nlp2(raw_text)
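For context, the rewritten test_issue999 above follows the v3-style training pattern used on this branch: wrap the raw text and its gold annotations in an Example, then pass a list of Examples to nlp.update. A self-contained sketch of that pattern, assuming the spacy.gold.Example API from this development branch (the text and offsets are taken from the test's TRAIN_DATA):

from spacy.gold import Example
from spacy.lang.en import English

nlp = English()
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
ner.add_label("LOCATION")
nlp.begin_training()

text = "i'm looking for a place in the north of town"
annots = {"entities": [(31, 36, "LOCATION")]}
# The predicted Doc and the gold-standard dict travel together in one Example.
example = Example.from_dict(nlp.make_doc(text), annots)
nlp.update([example])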
@@ -32,8 +32,8 @@ def test_issue1061():
     assert "MATH" not in [w.text for w in doc]


-@pytest.mark.xfail(
-    reason="g is split of as a unit, as the suffix regular expression can not look back further (variable-width)"
+@pytest.mark.skip(
+    reason="Can not be fixed without variable-width look-behind (which we don't want)"
 )
 def test_issue1235():
     """Test that g is not split of if preceded by a number and a letter"""
@@ -10,7 +10,9 @@ from spacy.lang.en import English
 from ..util import add_vecs_to_vocab, get_doc


-@pytest.mark.xfail
+@pytest.mark.skip(
+    reason="Can not be fixed without iterative looping between prefix/suffix and infix"
+)
 def test_issue2070():
     """Test that checks that a dot followed by a quote is handled
     appropriately.
@@ -226,7 +226,7 @@ def test_issue3412():
     assert best_rows[0] == 2


-@pytest.mark.xfail(reason="default suffix rules avoid one upper-case letter before dot")
+@pytest.mark.skip(reason="default suffix rules avoid one upper-case letter before dot")
 def test_issue3449():
     nlp = English()
     nlp.add_pipe(nlp.create_pipe("sentencizer"))
@@ -178,12 +178,12 @@ def test_issue3549(en_vocab):
     matcher.add("BAD", [[{"X": "Y"}]])


-@pytest.mark.xfail
+@pytest.mark.skip("Matching currently only works on strings and integers")
 def test_issue3555(en_vocab):
     """Test that custom extensions with default None don't break matcher."""
     Token.set_extension("issue3555", default=None)
     matcher = Matcher(en_vocab)
-    pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
+    pattern = [{"ORTH": "have"}, {"_": {"issue3555": True}}]
     matcher.add("TEST", [pattern])
     doc = Doc(en_vocab, words=["have", "apple"])
     matcher(doc)
@@ -11,7 +11,6 @@ test_strings_attrs = [(["rats", "are", "cute"], "Hello")]
 default_strings = ("_SP", "POS=SPACE")


-@pytest.mark.xfail
 @pytest.mark.parametrize("text", ["rat"])
 def test_serialize_vocab(en_vocab, text):
     text_hash = en_vocab.strings.add(text)
@@ -1,5 +1,4 @@
 import numpy
-from spacy.errors import AlignmentError
 from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
 from spacy.gold import spans_from_biluo_tags, iob_to_biluo
 from spacy.gold import Corpus, docs_to_json
@@ -544,42 +543,6 @@ def test_roundtrip_docs_to_docbin(doc):
     assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"]


-# Hm, not sure where misalignment check would be handled? In the components too?
-# I guess that does make sense. A text categorizer doesn't care if it's
-# misaligned...
-@pytest.mark.xfail(reason="Outdated")
-def test_ignore_misaligned(doc):
-    nlp = English()
-    text = doc.text
-    with make_tempdir() as tmpdir:
-        json_file = tmpdir / "test.json"
-        data = [docs_to_json(doc)]
-        data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
-        # write to JSON train dicts
-        srsly.write_json(json_file, data)
-        goldcorpus = Corpus(str(json_file), str(json_file))
-
-        with pytest.raises(AlignmentError):
-            train_reloaded_example = next(goldcorpus.train_dataset(nlp))
-
-    with make_tempdir() as tmpdir:
-        json_file = tmpdir / "test.json"
-        data = [docs_to_json(doc)]
-        data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
-        # write to JSON train dicts
-        srsly.write_json(json_file, data)
-        goldcorpus = Corpus(str(json_file), str(json_file))
-
-        # doesn't raise an AlignmentError, but there is nothing to iterate over
-        # because the only example can't be aligned
-        train_reloaded_example = list(
-            goldcorpus.train_dataset(nlp, ignore_misaligned=True)
-        )
-        assert len(train_reloaded_example) == 0
-
-
-# We probably want the orth variant logic back, but this test won't be quite
-# right -- we need to go from DocBin.
 def test_make_orth_variants(doc):
     nlp = English()
     with make_tempdir() as tmpdir:
@@ -7,15 +7,26 @@ from spacy.tokens import Doc
 from .util import get_batch


-# This fails in Thinc v7.3.1. Need to push patch
-@pytest.mark.xfail
 def test_empty_doc():
     width = 128
     embed_size = 2000
     vocab = Vocab()
     doc = Doc(vocab, words=[])
-    # TODO: fix tok2vec arguments
-    tok2vec = build_Tok2Vec_model(width, embed_size, dropout=None)
+    tok2vec = build_Tok2Vec_model(
+        width,
+        embed_size,
+        pretrained_vectors=None,
+        conv_depth=4,
+        bilstm_depth=0,
+        window_size=1,
+        maxout_pieces=3,
+        subword_features=True,
+        char_embed=False,
+        nM=64,
+        nC=8,
+        dropout=None,
+    )
+    tok2vec.initialize()
     vectors, backprop = tok2vec.begin_update([doc])
     assert len(vectors) == 1
     assert vectors[0].shape == (0, width)
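One detail in the rewritten test_empty_doc above: Thinc v8 models generally have to be initialized before they can run, hence the added tok2vec.initialize() call ahead of begin_update. A small sketch of the same initialize-then-update pattern on a plain Thinc layer (Linear is only an illustration, not part of the diff; assumes thinc>=8.0.0a18):

import numpy
from thinc.api import Linear

model = Linear(nO=2, nI=4)  # a tiny Thinc v8 layer
model.initialize()  # allocate parameters before running the model
Y, backprop = model.begin_update(numpy.zeros((3, 4), dtype="f"))
assert Y.shape == (3, 2)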
@@ -95,22 +95,3 @@ def test_stringstore_to_bytes(stringstore, text):
     serialized = stringstore.to_bytes()
     new_stringstore = StringStore().from_bytes(serialized)
     assert new_stringstore[store] == text
-
-
-@pytest.mark.xfail
-@pytest.mark.parametrize("text", [["a", "b", "c"]])
-def test_stringstore_freeze_oov(stringstore, text):
-    """Test the possibly temporary workaround of flushing the stringstore of
-    OOV words."""
-    assert stringstore[text[0]] == 1
-    assert stringstore[text[1]] == 2
-
-    stringstore.set_frozen(True)
-    s = stringstore[text[2]]
-    assert s >= 4
-    s_ = stringstore[s]
-    assert s_ == text[2]
-
-    stringstore.flush_oov()
-    with pytest.raises(IndexError):
-        s_ = stringstore[s]