mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
commit
8177f25b6c
|
@ -23,6 +23,7 @@ def test_issue2070():
|
|||
assert len(doc) == 11
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||
def test_issue2179():
|
||||
"""Test that spurious 'extra_labels' aren't created when initializing NER."""
|
||||
nlp = Italian()
|
||||
|
@ -134,6 +135,7 @@ def test_issue2464(en_vocab):
|
|||
assert len(matches) == 3
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||
def test_issue2482():
|
||||
"""Test we can serialize and deserialize a blank NER or parser model."""
|
||||
nlp = Italian()
|
||||
|
|
|
@ -138,13 +138,16 @@ def test_issue2782(text, lang_cls):
|
|||
assert doc[0].like_num
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||
def test_issue2800():
|
||||
"""Test issue that arises when too many labels are added to NER model.
|
||||
Used to cause segfault.
|
||||
"""
|
||||
nlp = English()
|
||||
train_data = []
|
||||
train_data.extend([Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})])
|
||||
train_data.extend(
|
||||
[Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})]
|
||||
)
|
||||
entity_types = [str(i) for i in range(1000)]
|
||||
ner = nlp.create_pipe("ner")
|
||||
nlp.add_pipe(ner)
|
||||
|
|
|
@ -88,6 +88,7 @@ def test_issue3199():
|
|||
assert list(doc[0:3].noun_chunks) == []
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||
def test_issue3209():
|
||||
"""Test issue that occurred in spaCy nightly where NER labels were being
|
||||
mapped to classes incorrectly after loading the model, when the labels
|
||||
|
|
472
spacy/tests/regression/test_issue3501-4000.py
Normal file
472
spacy/tests/regression/test_issue3501-4000.py
Normal file
|
@ -0,0 +1,472 @@
|
|||
import pytest
|
||||
from spacy.language import Language
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.pipeline import EntityRuler, DependencyParser
|
||||
from spacy.pipeline.defaults import default_parser
|
||||
from spacy import displacy, load
|
||||
from spacy.displacy import parse_deps
|
||||
from spacy.tokens import Doc, Token
|
||||
from spacy.matcher import Matcher, PhraseMatcher
|
||||
from spacy.errors import MatchPatternError
|
||||
from spacy.util import minibatch
|
||||
from spacy.gold import Example
|
||||
from spacy.lang.hi import Hindi
|
||||
from spacy.lang.es import Spanish
|
||||
from spacy.lang.en import English
|
||||
from spacy.attrs import IS_ALPHA
|
||||
from thinc.api import compounding
|
||||
import spacy
|
||||
import srsly
|
||||
import numpy
|
||||
|
||||
from ..util import make_tempdir, get_doc
|
||||
|
||||
|
||||
@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"])
|
||||
def test_issue3521(en_tokenizer, word):
|
||||
tok = en_tokenizer(word)[1]
|
||||
# 'not' and 'would' should be stopwords, also in their abbreviated forms
|
||||
assert tok.is_stop
|
||||
|
||||
|
||||
def test_issue_3526_1(en_vocab):
|
||||
patterns = [
|
||||
{"label": "HELLO", "pattern": "hello world"},
|
||||
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
|
||||
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
|
||||
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
|
||||
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
|
||||
]
|
||||
nlp = Language(vocab=en_vocab)
|
||||
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
|
||||
ruler_bytes = ruler.to_bytes()
|
||||
assert len(ruler) == len(patterns)
|
||||
assert len(ruler.labels) == 4
|
||||
assert ruler.overwrite
|
||||
new_ruler = EntityRuler(nlp)
|
||||
new_ruler = new_ruler.from_bytes(ruler_bytes)
|
||||
assert len(new_ruler) == len(ruler)
|
||||
assert len(new_ruler.labels) == 4
|
||||
assert new_ruler.overwrite == ruler.overwrite
|
||||
assert new_ruler.ent_id_sep == ruler.ent_id_sep
|
||||
|
||||
|
||||
def test_issue_3526_2(en_vocab):
|
||||
patterns = [
|
||||
{"label": "HELLO", "pattern": "hello world"},
|
||||
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
|
||||
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
|
||||
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
|
||||
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
|
||||
]
|
||||
nlp = Language(vocab=en_vocab)
|
||||
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
|
||||
bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
|
||||
new_ruler = EntityRuler(nlp)
|
||||
new_ruler = new_ruler.from_bytes(bytes_old_style)
|
||||
assert len(new_ruler) == len(ruler)
|
||||
for pattern in ruler.patterns:
|
||||
assert pattern in new_ruler.patterns
|
||||
assert new_ruler.overwrite is not ruler.overwrite
|
||||
|
||||
|
||||
def test_issue_3526_3(en_vocab):
|
||||
patterns = [
|
||||
{"label": "HELLO", "pattern": "hello world"},
|
||||
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
|
||||
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
|
||||
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
|
||||
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
|
||||
]
|
||||
nlp = Language(vocab=en_vocab)
|
||||
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
|
||||
with make_tempdir() as tmpdir:
|
||||
out_file = tmpdir / "entity_ruler"
|
||||
srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
|
||||
new_ruler = EntityRuler(nlp).from_disk(out_file)
|
||||
for pattern in ruler.patterns:
|
||||
assert pattern in new_ruler.patterns
|
||||
assert len(new_ruler) == len(ruler)
|
||||
assert new_ruler.overwrite is not ruler.overwrite
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||
def test_issue_3526_4(en_vocab):
|
||||
nlp = Language(vocab=en_vocab)
|
||||
ruler = EntityRuler(nlp, overwrite_ents=True)
|
||||
ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
|
||||
nlp.add_pipe(ruler)
|
||||
with make_tempdir() as tmpdir:
|
||||
nlp.to_disk(tmpdir)
|
||||
ruler = nlp.get_pipe("entity_ruler")
|
||||
assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
|
||||
assert ruler.overwrite is True
|
||||
nlp2 = load(tmpdir)
|
||||
new_ruler = nlp2.get_pipe("entity_ruler")
|
||||
assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
|
||||
assert new_ruler.overwrite is True
|
||||
|
||||
|
||||
def test_issue3531():
|
||||
"""Test that displaCy renderer doesn't require "settings" key."""
|
||||
example_dep = {
|
||||
"words": [
|
||||
{"text": "But", "tag": "CCONJ"},
|
||||
{"text": "Google", "tag": "PROPN"},
|
||||
{"text": "is", "tag": "VERB"},
|
||||
{"text": "starting", "tag": "VERB"},
|
||||
{"text": "from", "tag": "ADP"},
|
||||
{"text": "behind.", "tag": "ADV"},
|
||||
],
|
||||
"arcs": [
|
||||
{"start": 0, "end": 3, "label": "cc", "dir": "left"},
|
||||
{"start": 1, "end": 3, "label": "nsubj", "dir": "left"},
|
||||
{"start": 2, "end": 3, "label": "aux", "dir": "left"},
|
||||
{"start": 3, "end": 4, "label": "prep", "dir": "right"},
|
||||
{"start": 4, "end": 5, "label": "pcomp", "dir": "right"},
|
||||
],
|
||||
}
|
||||
example_ent = {
|
||||
"text": "But Google is starting from behind.",
|
||||
"ents": [{"start": 4, "end": 10, "label": "ORG"}],
|
||||
}
|
||||
dep_html = displacy.render(example_dep, style="dep", manual=True)
|
||||
assert dep_html
|
||||
ent_html = displacy.render(example_ent, style="ent", manual=True)
|
||||
assert ent_html
|
||||
|
||||
|
||||
def test_issue3540(en_vocab):
|
||||
words = ["I", "live", "in", "NewYork", "right", "now"]
|
||||
tensor = numpy.asarray(
|
||||
[[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
|
||||
dtype="f",
|
||||
)
|
||||
doc = Doc(en_vocab, words=words)
|
||||
doc.tensor = tensor
|
||||
gold_text = ["I", "live", "in", "NewYork", "right", "now"]
|
||||
assert [token.text for token in doc] == gold_text
|
||||
gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
|
||||
assert [token.lemma_ for token in doc] == gold_lemma
|
||||
vectors_1 = [token.vector for token in doc]
|
||||
assert len(vectors_1) == len(doc)
|
||||
|
||||
with doc.retokenize() as retokenizer:
|
||||
heads = [(doc[3], 1), doc[2]]
|
||||
attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
|
||||
retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
|
||||
|
||||
gold_text = ["I", "live", "in", "New", "York", "right", "now"]
|
||||
assert [token.text for token in doc] == gold_text
|
||||
gold_lemma = ["I", "live", "in", "New", "York", "right", "now"]
|
||||
assert [token.lemma_ for token in doc] == gold_lemma
|
||||
vectors_2 = [token.vector for token in doc]
|
||||
assert len(vectors_2) == len(doc)
|
||||
assert vectors_1[0].tolist() == vectors_2[0].tolist()
|
||||
assert vectors_1[1].tolist() == vectors_2[1].tolist()
|
||||
assert vectors_1[2].tolist() == vectors_2[2].tolist()
|
||||
assert vectors_1[4].tolist() == vectors_2[5].tolist()
|
||||
assert vectors_1[5].tolist() == vectors_2[6].tolist()
|
||||
|
||||
|
||||
def test_issue3549(en_vocab):
|
||||
"""Test that match pattern validation doesn't raise on empty errors."""
|
||||
matcher = Matcher(en_vocab, validate=True)
|
||||
pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
|
||||
matcher.add("GOOD", [pattern])
|
||||
with pytest.raises(MatchPatternError):
|
||||
matcher.add("BAD", [[{"X": "Y"}]])
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_issue3555(en_vocab):
|
||||
"""Test that custom extensions with default None don't break matcher."""
|
||||
Token.set_extension("issue3555", default=None)
|
||||
matcher = Matcher(en_vocab)
|
||||
pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
|
||||
matcher.add("TEST", [pattern])
|
||||
doc = Doc(en_vocab, words=["have", "apple"])
|
||||
matcher(doc)
|
||||
|
||||
|
||||
def test_issue3611():
|
||||
""" Test whether adding n-grams in the textcat works even when n > token length of some docs """
|
||||
unique_classes = ["offensive", "inoffensive"]
|
||||
x_train = [
|
||||
"This is an offensive text",
|
||||
"This is the second offensive text",
|
||||
"inoff",
|
||||
]
|
||||
y_train = ["offensive", "offensive", "inoffensive"]
|
||||
nlp = spacy.blank("en")
|
||||
# preparing the data
|
||||
train_data = []
|
||||
for text, train_instance in zip(x_train, y_train):
|
||||
cat_dict = {label: label == train_instance for label in unique_classes}
|
||||
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
|
||||
# add a text categorizer component
|
||||
textcat = nlp.create_pipe(
|
||||
"textcat",
|
||||
config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
|
||||
)
|
||||
for label in unique_classes:
|
||||
textcat.add_label(label)
|
||||
nlp.add_pipe(textcat, last=True)
|
||||
# training the network
|
||||
with nlp.select_pipes(enable="textcat"):
|
||||
optimizer = nlp.begin_training(X=x_train, Y=y_train)
|
||||
for i in range(3):
|
||||
losses = {}
|
||||
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
|
||||
|
||||
for batch in batches:
|
||||
nlp.update(
|
||||
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
|
||||
)
|
||||
|
||||
|
||||
def test_issue3625():
|
||||
"""Test that default punctuation rules applies to hindi unicode characters"""
|
||||
nlp = Hindi()
|
||||
doc = nlp("hi. how हुए. होटल, होटल")
|
||||
expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
|
||||
assert [token.text for token in doc] == expected
|
||||
|
||||
|
||||
def test_issue3803():
|
||||
"""Test that spanish num-like tokens have True for like_num attribute."""
|
||||
nlp = Spanish()
|
||||
text = "2 dos 1000 mil 12 doce"
|
||||
doc = nlp(text)
|
||||
|
||||
assert [t.like_num for t in doc] == [True, True, True, True, True, True]
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||
def test_issue3830_no_subtok():
|
||||
"""Test that the parser doesn't have subtok label if not learn_tokens"""
|
||||
config = {
|
||||
"learn_tokens": False,
|
||||
"min_action_freq": 30,
|
||||
"beam_width": 1,
|
||||
"beam_update_prob": 1.0,
|
||||
}
|
||||
parser = DependencyParser(Vocab(), default_parser(), **config)
|
||||
parser.add_label("nsubj")
|
||||
assert "subtok" not in parser.labels
|
||||
parser.begin_training(lambda: [])
|
||||
assert "subtok" not in parser.labels
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||
def test_issue3830_with_subtok():
|
||||
"""Test that the parser does have subtok label if learn_tokens=True."""
|
||||
config = {
|
||||
"learn_tokens": True,
|
||||
"min_action_freq": 30,
|
||||
"beam_width": 1,
|
||||
"beam_update_prob": 1.0,
|
||||
}
|
||||
parser = DependencyParser(Vocab(), default_parser(), **config)
|
||||
parser.add_label("nsubj")
|
||||
assert "subtok" not in parser.labels
|
||||
parser.begin_training(lambda: [])
|
||||
assert "subtok" in parser.labels
|
||||
|
||||
|
||||
def test_issue3839(en_vocab):
|
||||
"""Test that match IDs returned by the matcher are correct, are in the string """
|
||||
doc = Doc(en_vocab, words=["terrific", "group", "of", "people"])
|
||||
matcher = Matcher(en_vocab)
|
||||
match_id = "PATTERN"
|
||||
pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}]
|
||||
pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}]
|
||||
matcher.add(match_id, [pattern1])
|
||||
matches = matcher(doc)
|
||||
assert matches[0][0] == en_vocab.strings[match_id]
|
||||
matcher = Matcher(en_vocab)
|
||||
matcher.add(match_id, [pattern2])
|
||||
matches = matcher(doc)
|
||||
assert matches[0][0] == en_vocab.strings[match_id]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"sentence",
|
||||
[
|
||||
"The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.",
|
||||
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.",
|
||||
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one",
|
||||
"Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.",
|
||||
"It was a missed assignment, but it shouldn't have resulted in a turnover ...",
|
||||
],
|
||||
)
|
||||
def test_issue3869(sentence):
|
||||
"""Test that the Doc's count_by function works consistently"""
|
||||
nlp = English()
|
||||
doc = nlp(sentence)
|
||||
count = 0
|
||||
for token in doc:
|
||||
count += token.is_alpha
|
||||
assert count == doc.count_by(IS_ALPHA).get(1, 0)
|
||||
|
||||
|
||||
def test_issue3879(en_vocab):
|
||||
doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
|
||||
assert len(doc) == 5
|
||||
pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}]
|
||||
matcher = Matcher(en_vocab)
|
||||
matcher.add("TEST", [pattern])
|
||||
assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test'
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||
def test_issue3880():
|
||||
"""Test that `nlp.pipe()` works when an empty string ends the batch.
|
||||
|
||||
Fixed in v7.0.5 of Thinc.
|
||||
"""
|
||||
texts = ["hello", "world", "", ""]
|
||||
nlp = English()
|
||||
nlp.add_pipe(nlp.create_pipe("parser"))
|
||||
nlp.add_pipe(nlp.create_pipe("ner"))
|
||||
nlp.add_pipe(nlp.create_pipe("tagger"))
|
||||
nlp.get_pipe("parser").add_label("dep")
|
||||
nlp.get_pipe("ner").add_label("PERSON")
|
||||
nlp.get_pipe("tagger").add_label("NN")
|
||||
nlp.begin_training()
|
||||
for doc in nlp.pipe(texts):
|
||||
pass
|
||||
|
||||
|
||||
def test_issue3882(en_vocab):
|
||||
"""Test that displaCy doesn't serialize the doc.user_data when making a
|
||||
copy of the Doc.
|
||||
"""
|
||||
doc = Doc(en_vocab, words=["Hello", "world"])
|
||||
doc.is_parsed = True
|
||||
doc.user_data["test"] = set()
|
||||
parse_deps(doc)
|
||||
|
||||
|
||||
def test_issue3951(en_vocab):
|
||||
"""Test that combinations of optional rules are matched correctly."""
|
||||
matcher = Matcher(en_vocab)
|
||||
pattern = [
|
||||
{"LOWER": "hello"},
|
||||
{"LOWER": "this", "OP": "?"},
|
||||
{"OP": "?"},
|
||||
{"LOWER": "world"},
|
||||
]
|
||||
matcher.add("TEST", [pattern])
|
||||
doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
|
||||
matches = matcher(doc)
|
||||
assert len(matches) == 0
|
||||
|
||||
|
||||
def test_issue3959():
|
||||
""" Ensure that a modified pos attribute is serialized correctly."""
|
||||
nlp = English()
|
||||
doc = nlp(
|
||||
"displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
|
||||
)
|
||||
assert doc[0].pos_ == ""
|
||||
doc[0].pos_ = "NOUN"
|
||||
assert doc[0].pos_ == "NOUN"
|
||||
# usually this is already True when starting from proper models instead of blank English
|
||||
doc.is_tagged = True
|
||||
with make_tempdir() as tmp_dir:
|
||||
file_path = tmp_dir / "my_doc"
|
||||
doc.to_disk(file_path)
|
||||
doc2 = nlp("")
|
||||
doc2.from_disk(file_path)
|
||||
assert doc2[0].pos_ == "NOUN"
|
||||
|
||||
|
||||
def test_issue3962(en_vocab):
|
||||
""" Ensure that as_doc does not result in out-of-bound access of tokens.
|
||||
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
|
||||
# fmt: off
|
||||
words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."]
|
||||
heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
|
||||
deps = ["nsubj", "ccomp", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
|
||||
# fmt: on
|
||||
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
|
||||
span2 = doc[1:5] # "jests at scars ,"
|
||||
doc2 = span2.as_doc()
|
||||
doc2_json = doc2.to_json()
|
||||
assert doc2_json
|
||||
# head set to itself, being the new artificial root
|
||||
assert doc2[0].head.text == "jests"
|
||||
assert doc2[0].dep_ == "dep"
|
||||
assert doc2[1].head.text == "jests"
|
||||
assert doc2[1].dep_ == "prep"
|
||||
assert doc2[2].head.text == "at"
|
||||
assert doc2[2].dep_ == "pobj"
|
||||
assert doc2[3].head.text == "jests" # head set to the new artificial root
|
||||
assert doc2[3].dep_ == "dep"
|
||||
# We should still have 1 sentence
|
||||
assert len(list(doc2.sents)) == 1
|
||||
span3 = doc[6:9] # "never felt a"
|
||||
doc3 = span3.as_doc()
|
||||
doc3_json = doc3.to_json()
|
||||
assert doc3_json
|
||||
assert doc3[0].head.text == "felt"
|
||||
assert doc3[0].dep_ == "neg"
|
||||
assert doc3[1].head.text == "felt"
|
||||
assert doc3[1].dep_ == "ROOT"
|
||||
assert doc3[2].head.text == "felt" # head set to ancestor
|
||||
assert doc3[2].dep_ == "dep"
|
||||
# We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
|
||||
assert len(list(doc3.sents)) == 1
|
||||
|
||||
|
||||
def test_issue3962_long(en_vocab):
|
||||
""" Ensure that as_doc does not result in out-of-bound access of tokens.
|
||||
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
|
||||
# fmt: off
|
||||
words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."]
|
||||
heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
|
||||
deps = ["nsubj", "ROOT", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
|
||||
# fmt: on
|
||||
two_sent_doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
|
||||
span2 = two_sent_doc[1:7] # "jests at scars. They never"
|
||||
doc2 = span2.as_doc()
|
||||
doc2_json = doc2.to_json()
|
||||
assert doc2_json
|
||||
# head set to itself, being the new artificial root (in sentence 1)
|
||||
assert doc2[0].head.text == "jests"
|
||||
assert doc2[0].dep_ == "ROOT"
|
||||
assert doc2[1].head.text == "jests"
|
||||
assert doc2[1].dep_ == "prep"
|
||||
assert doc2[2].head.text == "at"
|
||||
assert doc2[2].dep_ == "pobj"
|
||||
assert doc2[3].head.text == "jests"
|
||||
assert doc2[3].dep_ == "punct"
|
||||
# head set to itself, being the new artificial root (in sentence 2)
|
||||
assert doc2[4].head.text == "They"
|
||||
assert doc2[4].dep_ == "dep"
|
||||
# head set to the new artificial head (in sentence 2)
|
||||
assert doc2[4].head.text == "They"
|
||||
assert doc2[4].dep_ == "dep"
|
||||
# We should still have 2 sentences
|
||||
sents = list(doc2.sents)
|
||||
assert len(sents) == 2
|
||||
assert sents[0].text == "jests at scars ."
|
||||
assert sents[1].text == "They never"
|
||||
|
||||
|
||||
def test_issue3972(en_vocab):
|
||||
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs.
|
||||
"""
|
||||
matcher = PhraseMatcher(en_vocab)
|
||||
matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
|
||||
matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
|
||||
doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
|
||||
matches = matcher(doc)
|
||||
|
||||
assert len(matches) == 2
|
||||
|
||||
# We should have a match for each of the two rules
|
||||
found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches]
|
||||
assert "A" in found_ids
|
||||
assert "B" in found_ids
|
|
@ -1,8 +0,0 @@
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"])
|
||||
def test_issue3521(en_tokenizer, word):
|
||||
tok = en_tokenizer(word)[1]
|
||||
# 'not' and 'would' should be stopwords, also in their abbreviated forms
|
||||
assert tok.is_stop
|
|
@ -1,85 +0,0 @@
|
|||
import pytest
|
||||
from spacy.tokens import Span
|
||||
from spacy.language import Language
|
||||
from spacy.pipeline import EntityRuler
|
||||
from spacy import load
|
||||
import srsly
|
||||
|
||||
from ..util import make_tempdir
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def patterns():
|
||||
return [
|
||||
{"label": "HELLO", "pattern": "hello world"},
|
||||
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
|
||||
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
|
||||
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
|
||||
{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def add_ent():
|
||||
def add_ent_component(doc):
|
||||
doc.ents = [Span(doc, 0, 3, label=doc.vocab.strings["ORG"])]
|
||||
return doc
|
||||
|
||||
return add_ent_component
|
||||
|
||||
|
||||
def test_entity_ruler_existing_overwrite_serialize_bytes(patterns, en_vocab):
|
||||
nlp = Language(vocab=en_vocab)
|
||||
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
|
||||
ruler_bytes = ruler.to_bytes()
|
||||
assert len(ruler) == len(patterns)
|
||||
assert len(ruler.labels) == 4
|
||||
assert ruler.overwrite
|
||||
new_ruler = EntityRuler(nlp)
|
||||
new_ruler = new_ruler.from_bytes(ruler_bytes)
|
||||
assert len(new_ruler) == len(ruler)
|
||||
assert len(new_ruler.labels) == 4
|
||||
assert new_ruler.overwrite == ruler.overwrite
|
||||
assert new_ruler.ent_id_sep == ruler.ent_id_sep
|
||||
|
||||
|
||||
def test_entity_ruler_existing_bytes_old_format_safe(patterns, en_vocab):
|
||||
nlp = Language(vocab=en_vocab)
|
||||
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
|
||||
bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
|
||||
new_ruler = EntityRuler(nlp)
|
||||
new_ruler = new_ruler.from_bytes(bytes_old_style)
|
||||
assert len(new_ruler) == len(ruler)
|
||||
for pattern in ruler.patterns:
|
||||
assert pattern in new_ruler.patterns
|
||||
assert new_ruler.overwrite is not ruler.overwrite
|
||||
|
||||
|
||||
def test_entity_ruler_from_disk_old_format_safe(patterns, en_vocab):
|
||||
nlp = Language(vocab=en_vocab)
|
||||
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
|
||||
with make_tempdir() as tmpdir:
|
||||
out_file = tmpdir / "entity_ruler"
|
||||
srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
|
||||
new_ruler = EntityRuler(nlp).from_disk(out_file)
|
||||
for pattern in ruler.patterns:
|
||||
assert pattern in new_ruler.patterns
|
||||
assert len(new_ruler) == len(ruler)
|
||||
assert new_ruler.overwrite is not ruler.overwrite
|
||||
|
||||
|
||||
def test_entity_ruler_in_pipeline_from_issue(patterns, en_vocab):
|
||||
nlp = Language(vocab=en_vocab)
|
||||
ruler = EntityRuler(nlp, overwrite_ents=True)
|
||||
|
||||
ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
|
||||
nlp.add_pipe(ruler)
|
||||
with make_tempdir() as tmpdir:
|
||||
nlp.to_disk(tmpdir)
|
||||
ruler = nlp.get_pipe("entity_ruler")
|
||||
assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
|
||||
assert ruler.overwrite is True
|
||||
nlp2 = load(tmpdir)
|
||||
new_ruler = nlp2.get_pipe("entity_ruler")
|
||||
assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
|
||||
assert new_ruler.overwrite is True
|
|
@ -1,30 +0,0 @@
|
|||
from spacy import displacy
|
||||
|
||||
|
||||
def test_issue3531():
|
||||
"""Test that displaCy renderer doesn't require "settings" key."""
|
||||
example_dep = {
|
||||
"words": [
|
||||
{"text": "But", "tag": "CCONJ"},
|
||||
{"text": "Google", "tag": "PROPN"},
|
||||
{"text": "is", "tag": "VERB"},
|
||||
{"text": "starting", "tag": "VERB"},
|
||||
{"text": "from", "tag": "ADP"},
|
||||
{"text": "behind.", "tag": "ADV"},
|
||||
],
|
||||
"arcs": [
|
||||
{"start": 0, "end": 3, "label": "cc", "dir": "left"},
|
||||
{"start": 1, "end": 3, "label": "nsubj", "dir": "left"},
|
||||
{"start": 2, "end": 3, "label": "aux", "dir": "left"},
|
||||
{"start": 3, "end": 4, "label": "prep", "dir": "right"},
|
||||
{"start": 4, "end": 5, "label": "pcomp", "dir": "right"},
|
||||
],
|
||||
}
|
||||
example_ent = {
|
||||
"text": "But Google is starting from behind.",
|
||||
"ents": [{"start": 4, "end": 10, "label": "ORG"}],
|
||||
}
|
||||
dep_html = displacy.render(example_dep, style="dep", manual=True)
|
||||
assert dep_html
|
||||
ent_html = displacy.render(example_ent, style="ent", manual=True)
|
||||
assert ent_html
|
|
@ -1,44 +0,0 @@
|
|||
from spacy.tokens import Doc
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
def test_issue3540(en_vocab):
|
||||
|
||||
words = ["I", "live", "in", "NewYork", "right", "now"]
|
||||
tensor = np.asarray(
|
||||
[[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
|
||||
dtype="f",
|
||||
)
|
||||
doc = Doc(en_vocab, words=words)
|
||||
doc.tensor = tensor
|
||||
|
||||
gold_text = ["I", "live", "in", "NewYork", "right", "now"]
|
||||
assert [token.text for token in doc] == gold_text
|
||||
|
||||
gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
|
||||
assert [token.lemma_ for token in doc] == gold_lemma
|
||||
|
||||
vectors_1 = [token.vector for token in doc]
|
||||
assert len(vectors_1) == len(doc)
|
||||
|
||||
with doc.retokenize() as retokenizer:
|
||||
heads = [(doc[3], 1), doc[2]]
|
||||
attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
|
||||
retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
|
||||
|
||||
gold_text = ["I", "live", "in", "New", "York", "right", "now"]
|
||||
assert [token.text for token in doc] == gold_text
|
||||
|
||||
gold_lemma = ["I", "live", "in", "New", "York", "right", "now"]
|
||||
assert [token.lemma_ for token in doc] == gold_lemma
|
||||
|
||||
vectors_2 = [token.vector for token in doc]
|
||||
assert len(vectors_2) == len(doc)
|
||||
|
||||
assert vectors_1[0].tolist() == vectors_2[0].tolist()
|
||||
assert vectors_1[1].tolist() == vectors_2[1].tolist()
|
||||
assert vectors_1[2].tolist() == vectors_2[2].tolist()
|
||||
|
||||
assert vectors_1[4].tolist() == vectors_2[5].tolist()
|
||||
assert vectors_1[5].tolist() == vectors_2[6].tolist()
|
|
@ -1,12 +0,0 @@
|
|||
import pytest
|
||||
from spacy.matcher import Matcher
|
||||
from spacy.errors import MatchPatternError
|
||||
|
||||
|
||||
def test_issue3549(en_vocab):
|
||||
"""Test that match pattern validation doesn't raise on empty errors."""
|
||||
matcher = Matcher(en_vocab, validate=True)
|
||||
pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
|
||||
matcher.add("GOOD", [pattern])
|
||||
with pytest.raises(MatchPatternError):
|
||||
matcher.add("BAD", [[{"X": "Y"}]])
|
|
@ -1,14 +0,0 @@
|
|||
import pytest
|
||||
from spacy.tokens import Doc, Token
|
||||
from spacy.matcher import Matcher
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_issue3555(en_vocab):
|
||||
"""Test that custom extensions with default None don't break matcher."""
|
||||
Token.set_extension("issue3555", default=None)
|
||||
matcher = Matcher(en_vocab)
|
||||
pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
|
||||
matcher.add("TEST", [pattern])
|
||||
doc = Doc(en_vocab, words=["have", "apple"])
|
||||
matcher(doc)
|
|
@ -1,45 +0,0 @@
|
|||
import spacy
|
||||
from spacy.util import minibatch
|
||||
from thinc.api import compounding
|
||||
from spacy.gold import Example
|
||||
|
||||
|
||||
def test_issue3611():
|
||||
""" Test whether adding n-grams in the textcat works even when n > token length of some docs """
|
||||
unique_classes = ["offensive", "inoffensive"]
|
||||
x_train = [
|
||||
"This is an offensive text",
|
||||
"This is the second offensive text",
|
||||
"inoff",
|
||||
]
|
||||
y_train = ["offensive", "offensive", "inoffensive"]
|
||||
|
||||
nlp = spacy.blank("en")
|
||||
|
||||
# preparing the data
|
||||
train_data = []
|
||||
for text, train_instance in zip(x_train, y_train):
|
||||
cat_dict = {label: label == train_instance for label in unique_classes}
|
||||
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
|
||||
|
||||
# add a text categorizer component
|
||||
textcat = nlp.create_pipe(
|
||||
"textcat",
|
||||
config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
|
||||
)
|
||||
|
||||
for label in unique_classes:
|
||||
textcat.add_label(label)
|
||||
nlp.add_pipe(textcat, last=True)
|
||||
|
||||
# training the network
|
||||
with nlp.select_pipes(enable="textcat"):
|
||||
optimizer = nlp.begin_training(X=x_train, Y=y_train)
|
||||
for i in range(3):
|
||||
losses = {}
|
||||
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
|
||||
|
||||
for batch in batches:
|
||||
nlp.update(
|
||||
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
|
||||
)
|
|
@ -1,9 +0,0 @@
|
|||
from spacy.lang.hi import Hindi
|
||||
|
||||
|
||||
def test_issue3625():
|
||||
"""Test that default punctuation rules applies to hindi unicode characters"""
|
||||
nlp = Hindi()
|
||||
doc = nlp("hi. how हुए. होटल, होटल")
|
||||
expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
|
||||
assert [token.text for token in doc] == expected
|
|
@ -1,10 +0,0 @@
|
|||
from spacy.lang.es import Spanish
|
||||
|
||||
|
||||
def test_issue3803():
|
||||
"""Test that spanish num-like tokens have True for like_num attribute."""
|
||||
nlp = Spanish()
|
||||
text = "2 dos 1000 mil 12 doce"
|
||||
doc = nlp(text)
|
||||
|
||||
assert [t.like_num for t in doc] == [True, True, True, True, True, True]
|
|
@ -1,34 +0,0 @@
|
|||
from spacy.pipeline.pipes import DependencyParser
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
from spacy.pipeline.defaults import default_parser
|
||||
|
||||
|
||||
def test_issue3830_no_subtok():
|
||||
"""Test that the parser doesn't have subtok label if not learn_tokens"""
|
||||
config = {
|
||||
"learn_tokens": False,
|
||||
"min_action_freq": 30,
|
||||
"beam_width": 1,
|
||||
"beam_update_prob": 1.0,
|
||||
}
|
||||
parser = DependencyParser(Vocab(), default_parser(), **config)
|
||||
parser.add_label("nsubj")
|
||||
assert "subtok" not in parser.labels
|
||||
parser.begin_training(lambda: [])
|
||||
assert "subtok" not in parser.labels
|
||||
|
||||
|
||||
def test_issue3830_with_subtok():
|
||||
"""Test that the parser does have subtok label if learn_tokens=True."""
|
||||
config = {
|
||||
"learn_tokens": True,
|
||||
"min_action_freq": 30,
|
||||
"beam_width": 1,
|
||||
"beam_update_prob": 1.0,
|
||||
}
|
||||
parser = DependencyParser(Vocab(), default_parser(), **config)
|
||||
parser.add_label("nsubj")
|
||||
assert "subtok" not in parser.labels
|
||||
parser.begin_training(lambda: [])
|
||||
assert "subtok" in parser.labels
|
|
@ -1,18 +0,0 @@
|
|||
from spacy.matcher import Matcher
|
||||
from spacy.tokens import Doc
|
||||
|
||||
|
||||
def test_issue3839(en_vocab):
|
||||
"""Test that match IDs returned by the matcher are correct, are in the string """
|
||||
doc = Doc(en_vocab, words=["terrific", "group", "of", "people"])
|
||||
matcher = Matcher(en_vocab)
|
||||
match_id = "PATTERN"
|
||||
pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}]
|
||||
pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}]
|
||||
matcher.add(match_id, [pattern1])
|
||||
matches = matcher(doc)
|
||||
assert matches[0][0] == en_vocab.strings[match_id]
|
||||
matcher = Matcher(en_vocab)
|
||||
matcher.add(match_id, [pattern2])
|
||||
matches = matcher(doc)
|
||||
assert matches[0][0] == en_vocab.strings[match_id]
|
|
@ -1,25 +0,0 @@
|
|||
import pytest
|
||||
from spacy.attrs import IS_ALPHA
|
||||
from spacy.lang.en import English
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"sentence",
|
||||
[
|
||||
"The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.",
|
||||
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.",
|
||||
"The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one",
|
||||
"Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.",
|
||||
"It was a missed assignment, but it shouldn't have resulted in a turnover ...",
|
||||
],
|
||||
)
|
||||
def test_issue3869(sentence):
|
||||
"""Test that the Doc's count_by function works consistently"""
|
||||
nlp = English()
|
||||
doc = nlp(sentence)
|
||||
|
||||
count = 0
|
||||
for token in doc:
|
||||
count += token.is_alpha
|
||||
|
||||
assert count == doc.count_by(IS_ALPHA).get(1, 0)
|
|
@ -1,11 +0,0 @@
|
|||
from spacy.matcher import Matcher
|
||||
from spacy.tokens import Doc
|
||||
|
||||
|
||||
def test_issue3879(en_vocab):
|
||||
doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
|
||||
assert len(doc) == 5
|
||||
pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}]
|
||||
matcher = Matcher(en_vocab)
|
||||
matcher.add("TEST", [pattern])
|
||||
assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test'
|
|
@ -1,21 +0,0 @@
|
|||
from spacy.lang.en import English
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||
def test_issue3880():
|
||||
"""Test that `nlp.pipe()` works when an empty string ends the batch.
|
||||
|
||||
Fixed in v7.0.5 of Thinc.
|
||||
"""
|
||||
texts = ["hello", "world", "", ""]
|
||||
nlp = English()
|
||||
nlp.add_pipe(nlp.create_pipe("parser"))
|
||||
nlp.add_pipe(nlp.create_pipe("ner"))
|
||||
nlp.add_pipe(nlp.create_pipe("tagger"))
|
||||
nlp.get_pipe("parser").add_label("dep")
|
||||
nlp.get_pipe("ner").add_label("PERSON")
|
||||
nlp.get_pipe("tagger").add_label("NN")
|
||||
nlp.begin_training()
|
||||
for doc in nlp.pipe(texts):
|
||||
pass
|
|
@ -1,12 +0,0 @@
|
|||
from spacy.displacy import parse_deps
|
||||
from spacy.tokens import Doc
|
||||
|
||||
|
||||
def test_issue3882(en_vocab):
|
||||
"""Test that displaCy doesn't serialize the doc.user_data when making a
|
||||
copy of the Doc.
|
||||
"""
|
||||
doc = Doc(en_vocab, words=["Hello", "world"])
|
||||
doc.is_parsed = True
|
||||
doc.user_data["test"] = set()
|
||||
parse_deps(doc)
|
|
@ -1,17 +0,0 @@
|
|||
from spacy.matcher import Matcher
|
||||
from spacy.tokens import Doc
|
||||
|
||||
|
||||
def test_issue3951(en_vocab):
|
||||
"""Test that combinations of optional rules are matched correctly."""
|
||||
matcher = Matcher(en_vocab)
|
||||
pattern = [
|
||||
{"LOWER": "hello"},
|
||||
{"LOWER": "this", "OP": "?"},
|
||||
{"OP": "?"},
|
||||
{"LOWER": "world"},
|
||||
]
|
||||
matcher.add("TEST", [pattern])
|
||||
doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
|
||||
matches = matcher(doc)
|
||||
assert len(matches) == 0
|
|
@ -1,26 +0,0 @@
|
|||
from spacy.lang.en import English
|
||||
from ..util import make_tempdir
|
||||
|
||||
|
||||
def test_issue3959():
|
||||
""" Ensure that a modified pos attribute is serialized correctly."""
|
||||
nlp = English()
|
||||
doc = nlp(
|
||||
"displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
|
||||
)
|
||||
assert doc[0].pos_ == ""
|
||||
|
||||
doc[0].pos_ = "NOUN"
|
||||
assert doc[0].pos_ == "NOUN"
|
||||
|
||||
# usually this is already True when starting from proper models instead of blank English
|
||||
doc.is_tagged = True
|
||||
|
||||
with make_tempdir() as tmp_dir:
|
||||
file_path = tmp_dir / "my_doc"
|
||||
doc.to_disk(file_path)
|
||||
|
||||
doc2 = nlp("")
|
||||
doc2.from_disk(file_path)
|
||||
|
||||
assert doc2[0].pos_ == "NOUN"
|
|
@ -1,117 +0,0 @@
|
|||
import pytest
|
||||
|
||||
from ..util import get_doc
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def doc(en_tokenizer):
|
||||
text = "He jests at scars, that never felt a wound."
|
||||
heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
|
||||
deps = [
|
||||
"nsubj",
|
||||
"ccomp",
|
||||
"prep",
|
||||
"pobj",
|
||||
"punct",
|
||||
"nsubj",
|
||||
"neg",
|
||||
"ROOT",
|
||||
"det",
|
||||
"dobj",
|
||||
"punct",
|
||||
]
|
||||
tokens = en_tokenizer(text)
|
||||
return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
||||
|
||||
|
||||
def test_issue3962(doc):
|
||||
""" Ensure that as_doc does not result in out-of-bound access of tokens.
|
||||
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
|
||||
span2 = doc[1:5] # "jests at scars ,"
|
||||
doc2 = span2.as_doc()
|
||||
doc2_json = doc2.to_json()
|
||||
assert doc2_json
|
||||
|
||||
assert (
|
||||
doc2[0].head.text == "jests"
|
||||
) # head set to itself, being the new artificial root
|
||||
assert doc2[0].dep_ == "dep"
|
||||
assert doc2[1].head.text == "jests"
|
||||
assert doc2[1].dep_ == "prep"
|
||||
assert doc2[2].head.text == "at"
|
||||
assert doc2[2].dep_ == "pobj"
|
||||
assert doc2[3].head.text == "jests" # head set to the new artificial root
|
||||
assert doc2[3].dep_ == "dep"
|
||||
|
||||
# We should still have 1 sentence
|
||||
assert len(list(doc2.sents)) == 1
|
||||
|
||||
span3 = doc[6:9] # "never felt a"
|
||||
doc3 = span3.as_doc()
|
||||
doc3_json = doc3.to_json()
|
||||
assert doc3_json
|
||||
|
||||
assert doc3[0].head.text == "felt"
|
||||
assert doc3[0].dep_ == "neg"
|
||||
assert doc3[1].head.text == "felt"
|
||||
assert doc3[1].dep_ == "ROOT"
|
||||
assert doc3[2].head.text == "felt" # head set to ancestor
|
||||
assert doc3[2].dep_ == "dep"
|
||||
|
||||
# We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
|
||||
assert len(list(doc3.sents)) == 1
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def two_sent_doc(en_tokenizer):
|
||||
text = "He jests at scars. They never felt a wound."
|
||||
heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
|
||||
deps = [
|
||||
"nsubj",
|
||||
"ROOT",
|
||||
"prep",
|
||||
"pobj",
|
||||
"punct",
|
||||
"nsubj",
|
||||
"neg",
|
||||
"ROOT",
|
||||
"det",
|
||||
"dobj",
|
||||
"punct",
|
||||
]
|
||||
tokens = en_tokenizer(text)
|
||||
return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
||||
|
||||
|
||||
def test_issue3962_long(two_sent_doc):
|
||||
""" Ensure that as_doc does not result in out-of-bound access of tokens.
|
||||
This is achieved by setting the head to itself if it would lie out of the span otherwise."""
|
||||
span2 = two_sent_doc[1:7] # "jests at scars. They never"
|
||||
doc2 = span2.as_doc()
|
||||
doc2_json = doc2.to_json()
|
||||
assert doc2_json
|
||||
|
||||
assert (
|
||||
doc2[0].head.text == "jests"
|
||||
) # head set to itself, being the new artificial root (in sentence 1)
|
||||
assert doc2[0].dep_ == "ROOT"
|
||||
assert doc2[1].head.text == "jests"
|
||||
assert doc2[1].dep_ == "prep"
|
||||
assert doc2[2].head.text == "at"
|
||||
assert doc2[2].dep_ == "pobj"
|
||||
assert doc2[3].head.text == "jests"
|
||||
assert doc2[3].dep_ == "punct"
|
||||
assert (
|
||||
doc2[4].head.text == "They"
|
||||
) # head set to itself, being the new artificial root (in sentence 2)
|
||||
assert doc2[4].dep_ == "dep"
|
||||
assert (
|
||||
doc2[4].head.text == "They"
|
||||
) # head set to the new artificial head (in sentence 2)
|
||||
assert doc2[4].dep_ == "dep"
|
||||
|
||||
# We should still have 2 sentences
|
||||
sents = list(doc2.sents)
|
||||
assert len(sents) == 2
|
||||
assert sents[0].text == "jests at scars ."
|
||||
assert sents[1].text == "They never"
|
|
@ -1,19 +0,0 @@
|
|||
from spacy.matcher import PhraseMatcher
|
||||
from spacy.tokens import Doc
|
||||
|
||||
|
||||
def test_issue3972(en_vocab):
|
||||
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs.
|
||||
"""
|
||||
matcher = PhraseMatcher(en_vocab)
|
||||
matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
|
||||
matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
|
||||
doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
|
||||
matches = matcher(doc)
|
||||
|
||||
assert len(matches) == 2
|
||||
|
||||
# We should have a match for each of the two rules
|
||||
found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches]
|
||||
assert "A" in found_ids
|
||||
assert "B" in found_ids
|
469
spacy/tests/regression/test_issue4001-4500.py
Normal file
469
spacy/tests/regression/test_issue4001-4500.py
Normal file
|
@ -0,0 +1,469 @@
|
|||
import pytest
|
||||
from spacy.pipeline import EntityRuler, EntityRecognizer, Pipe
|
||||
from spacy.pipeline.defaults import default_ner
|
||||
from spacy.matcher import PhraseMatcher, Matcher
|
||||
from spacy.tokens import Doc, Span, DocBin
|
||||
from spacy.gold import Example, Corpus
|
||||
from spacy.gold.converters import json2docs
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.lang.en import English
|
||||
from spacy.util import minibatch, ensure_path, load_model
|
||||
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex
|
||||
from spacy.tokenizer import Tokenizer
|
||||
from spacy.lang.el import Greek
|
||||
from spacy.language import Language
|
||||
import spacy
|
||||
from thinc.api import compounding
|
||||
from collections import defaultdict
|
||||
|
||||
from ..util import make_tempdir
|
||||
|
||||
|
||||
def test_issue4002(en_vocab):
|
||||
"""Test that the PhraseMatcher can match on overwritten NORM attributes.
|
||||
"""
|
||||
matcher = PhraseMatcher(en_vocab, attr="NORM")
|
||||
pattern1 = Doc(en_vocab, words=["c", "d"])
|
||||
assert [t.norm_ for t in pattern1] == ["c", "d"]
|
||||
matcher.add("TEST", [pattern1])
|
||||
doc = Doc(en_vocab, words=["a", "b", "c", "d"])
|
||||
assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
|
||||
matches = matcher(doc)
|
||||
assert len(matches) == 1
|
||||
matcher = PhraseMatcher(en_vocab, attr="NORM")
|
||||
pattern2 = Doc(en_vocab, words=["1", "2"])
|
||||
pattern2[0].norm_ = "c"
|
||||
pattern2[1].norm_ = "d"
|
||||
assert [t.norm_ for t in pattern2] == ["c", "d"]
|
||||
matcher.add("TEST", [pattern2])
|
||||
matches = matcher(doc)
|
||||
assert len(matches) == 1
|
||||
|
||||
|
||||
def test_issue4030():
|
||||
""" Test whether textcat works fine with empty doc """
|
||||
unique_classes = ["offensive", "inoffensive"]
|
||||
x_train = [
|
||||
"This is an offensive text",
|
||||
"This is the second offensive text",
|
||||
"inoff",
|
||||
]
|
||||
y_train = ["offensive", "offensive", "inoffensive"]
|
||||
nlp = spacy.blank("en")
|
||||
# preparing the data
|
||||
train_data = []
|
||||
for text, train_instance in zip(x_train, y_train):
|
||||
cat_dict = {label: label == train_instance for label in unique_classes}
|
||||
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
|
||||
# add a text categorizer component
|
||||
textcat = nlp.create_pipe(
|
||||
"textcat",
|
||||
config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
|
||||
)
|
||||
for label in unique_classes:
|
||||
textcat.add_label(label)
|
||||
nlp.add_pipe(textcat, last=True)
|
||||
# training the network
|
||||
with nlp.select_pipes(enable="textcat"):
|
||||
optimizer = nlp.begin_training()
|
||||
for i in range(3):
|
||||
losses = {}
|
||||
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
|
||||
|
||||
for batch in batches:
|
||||
nlp.update(
|
||||
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
|
||||
)
|
||||
# processing of an empty doc should result in 0.0 for all categories
|
||||
doc = nlp("")
|
||||
assert doc.cats["offensive"] == 0.0
|
||||
assert doc.cats["inoffensive"] == 0.0
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||
def test_issue4042():
|
||||
"""Test that serialization of an EntityRuler before NER works fine."""
|
||||
nlp = English()
|
||||
|
||||
# add ner pipe
|
||||
ner = nlp.create_pipe("ner")
|
||||
ner.add_label("SOME_LABEL")
|
||||
nlp.add_pipe(ner)
|
||||
nlp.begin_training()
|
||||
|
||||
# Add entity ruler
|
||||
ruler = EntityRuler(nlp)
|
||||
patterns = [
|
||||
{"label": "MY_ORG", "pattern": "Apple"},
|
||||
{"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
|
||||
]
|
||||
ruler.add_patterns(patterns)
|
||||
nlp.add_pipe(ruler, before="ner") # works fine with "after"
|
||||
doc1 = nlp("What do you think about Apple ?")
|
||||
assert doc1.ents[0].label_ == "MY_ORG"
|
||||
|
||||
with make_tempdir() as d:
|
||||
output_dir = ensure_path(d)
|
||||
if not output_dir.exists():
|
||||
output_dir.mkdir()
|
||||
nlp.to_disk(output_dir)
|
||||
|
||||
nlp2 = load_model(output_dir)
|
||||
doc2 = nlp2("What do you think about Apple ?")
|
||||
assert doc2.ents[0].label_ == "MY_ORG"
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||
def test_issue4042_bug2():
|
||||
"""
|
||||
Test that serialization of an NER works fine when new labels were added.
|
||||
This is the second bug of two bugs underlying the issue 4042.
|
||||
"""
|
||||
nlp1 = English()
|
||||
vocab = nlp1.vocab
|
||||
|
||||
# add ner pipe
|
||||
ner1 = nlp1.create_pipe("ner")
|
||||
ner1.add_label("SOME_LABEL")
|
||||
nlp1.add_pipe(ner1)
|
||||
nlp1.begin_training()
|
||||
|
||||
# add a new label to the doc
|
||||
doc1 = nlp1("What do you think about Apple ?")
|
||||
assert len(ner1.labels) == 1
|
||||
assert "SOME_LABEL" in ner1.labels
|
||||
apple_ent = Span(doc1, 5, 6, label="MY_ORG")
|
||||
doc1.ents = list(doc1.ents) + [apple_ent]
|
||||
|
||||
# reapply the NER - at this point it should resize itself
|
||||
ner1(doc1)
|
||||
assert len(ner1.labels) == 2
|
||||
assert "SOME_LABEL" in ner1.labels
|
||||
assert "MY_ORG" in ner1.labels
|
||||
|
||||
with make_tempdir() as d:
|
||||
# assert IO goes fine
|
||||
output_dir = ensure_path(d)
|
||||
if not output_dir.exists():
|
||||
output_dir.mkdir()
|
||||
ner1.to_disk(output_dir)
|
||||
|
||||
config = {
|
||||
"learn_tokens": False,
|
||||
"min_action_freq": 30,
|
||||
"beam_width": 1,
|
||||
"beam_update_prob": 1.0,
|
||||
}
|
||||
ner2 = EntityRecognizer(vocab, default_ner(), **config)
|
||||
ner2.from_disk(output_dir)
|
||||
assert len(ner2.labels) == 2
|
||||
|
||||
|
||||
def test_issue4054(en_vocab):
|
||||
"""Test that a new blank model can be made with a vocab from file,
|
||||
and that serialization does not drop the language at any point."""
|
||||
nlp1 = English()
|
||||
vocab1 = nlp1.vocab
|
||||
with make_tempdir() as d:
|
||||
vocab_dir = ensure_path(d / "vocab")
|
||||
if not vocab_dir.exists():
|
||||
vocab_dir.mkdir()
|
||||
vocab1.to_disk(vocab_dir)
|
||||
vocab2 = Vocab().from_disk(vocab_dir)
|
||||
print("lang", vocab2.lang)
|
||||
nlp2 = spacy.blank("en", vocab=vocab2)
|
||||
nlp_dir = ensure_path(d / "nlp")
|
||||
if not nlp_dir.exists():
|
||||
nlp_dir.mkdir()
|
||||
nlp2.to_disk(nlp_dir)
|
||||
nlp3 = load_model(nlp_dir)
|
||||
assert nlp3.lang == "en"
|
||||
|
||||
|
||||
def test_issue4120(en_vocab):
|
||||
"""Test that matches without a final {OP: ?} token are returned."""
|
||||
matcher = Matcher(en_vocab)
|
||||
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]])
|
||||
doc1 = Doc(en_vocab, words=["a"])
|
||||
assert len(matcher(doc1)) == 1 # works
|
||||
doc2 = Doc(en_vocab, words=["a", "b", "c"])
|
||||
assert len(matcher(doc2)) == 2 # fixed
|
||||
matcher = Matcher(en_vocab)
|
||||
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]])
|
||||
doc3 = Doc(en_vocab, words=["a", "b", "b", "c"])
|
||||
assert len(matcher(doc3)) == 2 # works
|
||||
matcher = Matcher(en_vocab)
|
||||
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]])
|
||||
doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
|
||||
assert len(matcher(doc4)) == 3 # fixed
|
||||
|
||||
|
||||
def test_issue4133(en_vocab):
|
||||
nlp = English()
|
||||
vocab_bytes = nlp.vocab.to_bytes()
|
||||
words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
|
||||
pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
|
||||
doc = Doc(en_vocab, words=words)
|
||||
for i, token in enumerate(doc):
|
||||
token.pos_ = pos[i]
|
||||
# usually this is already True when starting from proper models instead of blank English
|
||||
doc.is_tagged = True
|
||||
doc_bytes = doc.to_bytes()
|
||||
vocab = Vocab()
|
||||
vocab = vocab.from_bytes(vocab_bytes)
|
||||
doc = Doc(vocab).from_bytes(doc_bytes)
|
||||
actual = []
|
||||
for token in doc:
|
||||
actual.append(token.pos_)
|
||||
assert actual == pos
|
||||
|
||||
|
||||
def test_issue4190():
|
||||
def customize_tokenizer(nlp):
|
||||
prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
|
||||
suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
|
||||
infix_re = compile_infix_regex(nlp.Defaults.infixes)
|
||||
# Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
|
||||
exceptions = {
|
||||
k: v
|
||||
for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
|
||||
if not (len(k) == 2 and k[1] == ".")
|
||||
}
|
||||
new_tokenizer = Tokenizer(
|
||||
nlp.vocab,
|
||||
exceptions,
|
||||
prefix_search=prefix_re.search,
|
||||
suffix_search=suffix_re.search,
|
||||
infix_finditer=infix_re.finditer,
|
||||
token_match=nlp.tokenizer.token_match,
|
||||
)
|
||||
nlp.tokenizer = new_tokenizer
|
||||
|
||||
test_string = "Test c."
|
||||
# Load default language
|
||||
nlp_1 = English()
|
||||
doc_1a = nlp_1(test_string)
|
||||
result_1a = [token.text for token in doc_1a] # noqa: F841
|
||||
# Modify tokenizer
|
||||
customize_tokenizer(nlp_1)
|
||||
doc_1b = nlp_1(test_string)
|
||||
result_1b = [token.text for token in doc_1b]
|
||||
# Save and Reload
|
||||
with make_tempdir() as model_dir:
|
||||
nlp_1.to_disk(model_dir)
|
||||
nlp_2 = load_model(model_dir)
|
||||
# This should be the modified tokenizer
|
||||
doc_2 = nlp_2(test_string)
|
||||
result_2 = [token.text for token in doc_2]
|
||||
assert result_1b == result_2
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||
def test_issue4267():
|
||||
""" Test that running an entity_ruler after ner gives consistent results"""
|
||||
nlp = English()
|
||||
ner = nlp.create_pipe("ner")
|
||||
ner.add_label("PEOPLE")
|
||||
nlp.add_pipe(ner)
|
||||
nlp.begin_training()
|
||||
assert "ner" in nlp.pipe_names
|
||||
# assert that we have correct IOB annotations
|
||||
doc1 = nlp("hi")
|
||||
assert doc1.is_nered
|
||||
for token in doc1:
|
||||
assert token.ent_iob == 2
|
||||
# add entity ruler and run again
|
||||
ruler = EntityRuler(nlp)
|
||||
patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
|
||||
ruler.add_patterns(patterns)
|
||||
nlp.add_pipe(ruler)
|
||||
assert "entity_ruler" in nlp.pipe_names
|
||||
assert "ner" in nlp.pipe_names
|
||||
# assert that we still have correct IOB annotations
|
||||
doc2 = nlp("hi")
|
||||
assert doc2.is_nered
|
||||
for token in doc2:
|
||||
assert token.ent_iob == 2
|
||||
|
||||
|
||||
def test_issue4272():
|
||||
"""Test that lookup table can be accessed from Token.lemma if no POS tags
|
||||
are available."""
|
||||
nlp = Greek()
|
||||
doc = nlp("Χθες")
|
||||
assert doc[0].lemma_
|
||||
|
||||
|
||||
def test_multiple_predictions():
|
||||
class DummyPipe(Pipe):
|
||||
def __init__(self):
|
||||
self.model = "dummy_model"
|
||||
|
||||
def predict(self, docs):
|
||||
return ([1, 2, 3], [4, 5, 6])
|
||||
|
||||
def set_annotations(self, docs, scores, tensors=None):
|
||||
return docs
|
||||
|
||||
nlp = Language()
|
||||
doc = nlp.make_doc("foo")
|
||||
dummy_pipe = DummyPipe()
|
||||
dummy_pipe(doc)
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="removed Beam stuff during the Example/GoldParse refactor")
|
||||
def test_issue4313():
|
||||
""" This should not crash or exit with some strange error code """
|
||||
beam_width = 16
|
||||
beam_density = 0.0001
|
||||
nlp = English()
|
||||
config = {
|
||||
"learn_tokens": False,
|
||||
"min_action_freq": 30,
|
||||
"beam_width": 1,
|
||||
"beam_update_prob": 1.0,
|
||||
}
|
||||
ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
|
||||
ner.add_label("SOME_LABEL")
|
||||
ner.begin_training([])
|
||||
nlp.add_pipe(ner)
|
||||
|
||||
# add a new label to the doc
|
||||
doc = nlp("What do you think about Apple ?")
|
||||
assert len(ner.labels) == 1
|
||||
assert "SOME_LABEL" in ner.labels
|
||||
apple_ent = Span(doc, 5, 6, label="MY_ORG")
|
||||
doc.ents = list(doc.ents) + [apple_ent]
|
||||
|
||||
# ensure the beam_parse still works with the new label
|
||||
docs = [doc]
|
||||
beams = nlp.entity.beam_parse(
|
||||
docs, beam_width=beam_width, beam_density=beam_density
|
||||
)
|
||||
|
||||
for doc, beam in zip(docs, beams):
|
||||
entity_scores = defaultdict(float)
|
||||
for score, ents in nlp.entity.moves.get_beam_parses(beam):
|
||||
for start, end, label in ents:
|
||||
entity_scores[(start, end, label)] += score
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||
def test_issue4348():
|
||||
"""Test that training the tagger with empty data, doesn't throw errors"""
|
||||
nlp = English()
|
||||
example = Example.from_dict(nlp.make_doc(""), {"tags": []})
|
||||
TRAIN_DATA = [example, example]
|
||||
tagger = nlp.create_pipe("tagger")
|
||||
nlp.add_pipe(tagger)
|
||||
optimizer = nlp.begin_training()
|
||||
for i in range(5):
|
||||
losses = {}
|
||||
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
|
||||
for batch in batches:
|
||||
nlp.update(batch, sgd=optimizer, losses=losses)
|
||||
|
||||
|
||||
def test_issue4367():
|
||||
"""Test that docbin init goes well"""
|
||||
DocBin()
|
||||
DocBin(attrs=["LEMMA"])
|
||||
DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
|
||||
|
||||
|
||||
def test_issue4373():
|
||||
"""Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
|
||||
matcher = Matcher(Vocab())
|
||||
assert isinstance(matcher.vocab, Vocab)
|
||||
matcher = PhraseMatcher(Vocab())
|
||||
assert isinstance(matcher.vocab, Vocab)
|
||||
|
||||
|
||||
def test_issue4402():
|
||||
json_data = {
|
||||
"id": 0,
|
||||
"paragraphs": [
|
||||
{
|
||||
"raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
|
||||
"sentences": [
|
||||
{
|
||||
"tokens": [
|
||||
{"id": 0, "orth": "How", "ner": "O"},
|
||||
{"id": 1, "orth": "should", "ner": "O"},
|
||||
{"id": 2, "orth": "I", "ner": "O"},
|
||||
{"id": 3, "orth": "cook", "ner": "O"},
|
||||
{"id": 4, "orth": "bacon", "ner": "O"},
|
||||
{"id": 5, "orth": "in", "ner": "O"},
|
||||
{"id": 6, "orth": "an", "ner": "O"},
|
||||
{"id": 7, "orth": "oven", "ner": "O"},
|
||||
{"id": 8, "orth": "?", "ner": "O"},
|
||||
],
|
||||
"brackets": [],
|
||||
},
|
||||
{
|
||||
"tokens": [
|
||||
{"id": 9, "orth": "\n", "ner": "O"},
|
||||
{"id": 10, "orth": "I", "ner": "O"},
|
||||
{"id": 11, "orth": "'ve", "ner": "O"},
|
||||
{"id": 12, "orth": "heard", "ner": "O"},
|
||||
{"id": 13, "orth": "of", "ner": "O"},
|
||||
{"id": 14, "orth": "people", "ner": "O"},
|
||||
{"id": 15, "orth": "cooking", "ner": "O"},
|
||||
{"id": 16, "orth": "bacon", "ner": "O"},
|
||||
{"id": 17, "orth": "in", "ner": "O"},
|
||||
{"id": 18, "orth": "an", "ner": "O"},
|
||||
{"id": 19, "orth": "oven", "ner": "O"},
|
||||
{"id": 20, "orth": ".", "ner": "O"},
|
||||
],
|
||||
"brackets": [],
|
||||
},
|
||||
],
|
||||
"cats": [
|
||||
{"label": "baking", "value": 1.0},
|
||||
{"label": "not_baking", "value": 0.0},
|
||||
],
|
||||
},
|
||||
{
|
||||
"raw": "What is the difference between white and brown eggs?\n",
|
||||
"sentences": [
|
||||
{
|
||||
"tokens": [
|
||||
{"id": 0, "orth": "What", "ner": "O"},
|
||||
{"id": 1, "orth": "is", "ner": "O"},
|
||||
{"id": 2, "orth": "the", "ner": "O"},
|
||||
{"id": 3, "orth": "difference", "ner": "O"},
|
||||
{"id": 4, "orth": "between", "ner": "O"},
|
||||
{"id": 5, "orth": "white", "ner": "O"},
|
||||
{"id": 6, "orth": "and", "ner": "O"},
|
||||
{"id": 7, "orth": "brown", "ner": "O"},
|
||||
{"id": 8, "orth": "eggs", "ner": "O"},
|
||||
{"id": 9, "orth": "?", "ner": "O"},
|
||||
],
|
||||
"brackets": [],
|
||||
},
|
||||
{"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
|
||||
],
|
||||
"cats": [
|
||||
{"label": "baking", "value": 0.0},
|
||||
{"label": "not_baking", "value": 1.0},
|
||||
],
|
||||
},
|
||||
],
|
||||
}
|
||||
nlp = English()
|
||||
attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
|
||||
with make_tempdir() as tmpdir:
|
||||
output_file = tmpdir / "test4402.spacy"
|
||||
docs = json2docs([json_data])
|
||||
data = DocBin(docs=docs, attrs=attrs).to_bytes()
|
||||
with output_file.open("wb") as file_:
|
||||
file_.write(data)
|
||||
corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
|
||||
|
||||
train_data = list(corpus.train_dataset(nlp))
|
||||
assert len(train_data) == 2
|
||||
|
||||
split_train_data = []
|
||||
for eg in train_data:
|
||||
split_train_data.extend(eg.split_sents())
|
||||
assert len(split_train_data) == 4
|
|
@ -1,23 +0,0 @@
|
|||
from spacy.matcher import PhraseMatcher
|
||||
from spacy.tokens import Doc
|
||||
|
||||
|
||||
def test_issue4002(en_vocab):
|
||||
"""Test that the PhraseMatcher can match on overwritten NORM attributes.
|
||||
"""
|
||||
matcher = PhraseMatcher(en_vocab, attr="NORM")
|
||||
pattern1 = Doc(en_vocab, words=["c", "d"])
|
||||
assert [t.norm_ for t in pattern1] == ["c", "d"]
|
||||
matcher.add("TEST", [pattern1])
|
||||
doc = Doc(en_vocab, words=["a", "b", "c", "d"])
|
||||
assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
|
||||
matches = matcher(doc)
|
||||
assert len(matches) == 1
|
||||
matcher = PhraseMatcher(en_vocab, attr="NORM")
|
||||
pattern2 = Doc(en_vocab, words=["1", "2"])
|
||||
pattern2[0].norm_ = "c"
|
||||
pattern2[1].norm_ = "d"
|
||||
assert [t.norm_ for t in pattern2] == ["c", "d"]
|
||||
matcher.add("TEST", [pattern2])
|
||||
matches = matcher(doc)
|
||||
assert len(matches) == 1
|
|
@ -1,50 +0,0 @@
|
|||
import spacy
|
||||
from spacy.util import minibatch
|
||||
from thinc.api import compounding
|
||||
from spacy.gold import Example
|
||||
|
||||
|
||||
def test_issue4030():
|
||||
""" Test whether textcat works fine with empty doc """
|
||||
unique_classes = ["offensive", "inoffensive"]
|
||||
x_train = [
|
||||
"This is an offensive text",
|
||||
"This is the second offensive text",
|
||||
"inoff",
|
||||
]
|
||||
y_train = ["offensive", "offensive", "inoffensive"]
|
||||
|
||||
nlp = spacy.blank("en")
|
||||
|
||||
# preparing the data
|
||||
train_data = []
|
||||
for text, train_instance in zip(x_train, y_train):
|
||||
cat_dict = {label: label == train_instance for label in unique_classes}
|
||||
train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
|
||||
|
||||
# add a text categorizer component
|
||||
textcat = nlp.create_pipe(
|
||||
"textcat",
|
||||
config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
|
||||
)
|
||||
|
||||
for label in unique_classes:
|
||||
textcat.add_label(label)
|
||||
nlp.add_pipe(textcat, last=True)
|
||||
|
||||
# training the network
|
||||
with nlp.select_pipes(enable="textcat"):
|
||||
optimizer = nlp.begin_training()
|
||||
for i in range(3):
|
||||
losses = {}
|
||||
batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
|
||||
|
||||
for batch in batches:
|
||||
nlp.update(
|
||||
examples=batch, sgd=optimizer, drop=0.1, losses=losses,
|
||||
)
|
||||
|
||||
# processing of an empty doc should result in 0.0 for all categories
|
||||
doc = nlp("")
|
||||
assert doc.cats["offensive"] == 0.0
|
||||
assert doc.cats["inoffensive"] == 0.0
|
|
@ -1,85 +0,0 @@
|
|||
import spacy
|
||||
from spacy.pipeline import EntityRecognizer, EntityRuler
|
||||
from spacy.lang.en import English
|
||||
from spacy.tokens import Span
|
||||
from spacy.util import ensure_path
|
||||
from spacy.pipeline.defaults import default_ner
|
||||
|
||||
from ..util import make_tempdir
|
||||
|
||||
|
||||
def test_issue4042():
|
||||
"""Test that serialization of an EntityRuler before NER works fine."""
|
||||
nlp = English()
|
||||
|
||||
# add ner pipe
|
||||
ner = nlp.create_pipe("ner")
|
||||
ner.add_label("SOME_LABEL")
|
||||
nlp.add_pipe(ner)
|
||||
nlp.begin_training()
|
||||
|
||||
# Add entity ruler
|
||||
ruler = EntityRuler(nlp)
|
||||
patterns = [
|
||||
{"label": "MY_ORG", "pattern": "Apple"},
|
||||
{"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
|
||||
]
|
||||
ruler.add_patterns(patterns)
|
||||
nlp.add_pipe(ruler, before="ner") # works fine with "after"
|
||||
doc1 = nlp("What do you think about Apple ?")
|
||||
assert doc1.ents[0].label_ == "MY_ORG"
|
||||
|
||||
with make_tempdir() as d:
|
||||
output_dir = ensure_path(d)
|
||||
if not output_dir.exists():
|
||||
output_dir.mkdir()
|
||||
nlp.to_disk(output_dir)
|
||||
|
||||
nlp2 = spacy.load(output_dir)
|
||||
doc2 = nlp2("What do you think about Apple ?")
|
||||
assert doc2.ents[0].label_ == "MY_ORG"
|
||||
|
||||
|
||||
def test_issue4042_bug2():
|
||||
"""
|
||||
Test that serialization of an NER works fine when new labels were added.
|
||||
This is the second bug of two bugs underlying the issue 4042.
|
||||
"""
|
||||
nlp1 = English()
|
||||
vocab = nlp1.vocab
|
||||
|
||||
# add ner pipe
|
||||
ner1 = nlp1.create_pipe("ner")
|
||||
ner1.add_label("SOME_LABEL")
|
||||
nlp1.add_pipe(ner1)
|
||||
nlp1.begin_training()
|
||||
|
||||
# add a new label to the doc
|
||||
doc1 = nlp1("What do you think about Apple ?")
|
||||
assert len(ner1.labels) == 1
|
||||
assert "SOME_LABEL" in ner1.labels
|
||||
apple_ent = Span(doc1, 5, 6, label="MY_ORG")
|
||||
doc1.ents = list(doc1.ents) + [apple_ent]
|
||||
|
||||
# reapply the NER - at this point it should resize itself
|
||||
ner1(doc1)
|
||||
assert len(ner1.labels) == 2
|
||||
assert "SOME_LABEL" in ner1.labels
|
||||
assert "MY_ORG" in ner1.labels
|
||||
|
||||
with make_tempdir() as d:
|
||||
# assert IO goes fine
|
||||
output_dir = ensure_path(d)
|
||||
if not output_dir.exists():
|
||||
output_dir.mkdir()
|
||||
ner1.to_disk(output_dir)
|
||||
|
||||
config = {
|
||||
"learn_tokens": False,
|
||||
"min_action_freq": 30,
|
||||
"beam_width": 1,
|
||||
"beam_update_prob": 1.0,
|
||||
}
|
||||
ner2 = EntityRecognizer(vocab, default_ner(), **config)
|
||||
ner2.from_disk(output_dir)
|
||||
assert len(ner2.labels) == 2
|
|
@ -1,30 +0,0 @@
|
|||
from spacy.vocab import Vocab
|
||||
import spacy
|
||||
from spacy.lang.en import English
|
||||
from spacy.util import ensure_path
|
||||
|
||||
from ..util import make_tempdir
|
||||
|
||||
|
||||
def test_issue4054(en_vocab):
|
||||
"""Test that a new blank model can be made with a vocab from file,
|
||||
and that serialization does not drop the language at any point."""
|
||||
nlp1 = English()
|
||||
vocab1 = nlp1.vocab
|
||||
|
||||
with make_tempdir() as d:
|
||||
vocab_dir = ensure_path(d / "vocab")
|
||||
if not vocab_dir.exists():
|
||||
vocab_dir.mkdir()
|
||||
vocab1.to_disk(vocab_dir)
|
||||
|
||||
vocab2 = Vocab().from_disk(vocab_dir)
|
||||
print("lang", vocab2.lang)
|
||||
nlp2 = spacy.blank("en", vocab=vocab2)
|
||||
|
||||
nlp_dir = ensure_path(d / "nlp")
|
||||
if not nlp_dir.exists():
|
||||
nlp_dir.mkdir()
|
||||
nlp2.to_disk(nlp_dir)
|
||||
nlp3 = spacy.load(nlp_dir)
|
||||
assert nlp3.lang == "en"
|
|
@ -1,23 +0,0 @@
|
|||
from spacy.matcher import Matcher
|
||||
from spacy.tokens import Doc
|
||||
|
||||
|
||||
def test_issue4120(en_vocab):
|
||||
"""Test that matches without a final {OP: ?} token are returned."""
|
||||
matcher = Matcher(en_vocab)
|
||||
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]])
|
||||
doc1 = Doc(en_vocab, words=["a"])
|
||||
assert len(matcher(doc1)) == 1 # works
|
||||
|
||||
doc2 = Doc(en_vocab, words=["a", "b", "c"])
|
||||
assert len(matcher(doc2)) == 2 # fixed
|
||||
|
||||
matcher = Matcher(en_vocab)
|
||||
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]])
|
||||
doc3 = Doc(en_vocab, words=["a", "b", "b", "c"])
|
||||
assert len(matcher(doc3)) == 2 # works
|
||||
|
||||
matcher = Matcher(en_vocab)
|
||||
matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]])
|
||||
doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
|
||||
assert len(matcher(doc4)) == 3 # fixed
|
|
@ -1,28 +0,0 @@
|
|||
from spacy.lang.en import English
|
||||
from spacy.tokens import Doc
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
|
||||
def test_issue4133(en_vocab):
|
||||
nlp = English()
|
||||
vocab_bytes = nlp.vocab.to_bytes()
|
||||
words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
|
||||
pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
|
||||
doc = Doc(en_vocab, words=words)
|
||||
for i, token in enumerate(doc):
|
||||
token.pos_ = pos[i]
|
||||
|
||||
# usually this is already True when starting from proper models instead of blank English
|
||||
doc.is_tagged = True
|
||||
|
||||
doc_bytes = doc.to_bytes()
|
||||
|
||||
vocab = Vocab()
|
||||
vocab = vocab.from_bytes(vocab_bytes)
|
||||
doc = Doc(vocab).from_bytes(doc_bytes)
|
||||
|
||||
actual = []
|
||||
for token in doc:
|
||||
actual.append(token.pos_)
|
||||
|
||||
assert actual == pos
|
|
@ -1,46 +0,0 @@
|
|||
from spacy.lang.en import English
|
||||
from spacy.tokenizer import Tokenizer
|
||||
from spacy import util
|
||||
|
||||
from ..util import make_tempdir
|
||||
|
||||
|
||||
def test_issue4190():
|
||||
test_string = "Test c."
|
||||
# Load default language
|
||||
nlp_1 = English()
|
||||
doc_1a = nlp_1(test_string)
|
||||
result_1a = [token.text for token in doc_1a] # noqa: F841
|
||||
# Modify tokenizer
|
||||
customize_tokenizer(nlp_1)
|
||||
doc_1b = nlp_1(test_string)
|
||||
result_1b = [token.text for token in doc_1b]
|
||||
# Save and Reload
|
||||
with make_tempdir() as model_dir:
|
||||
nlp_1.to_disk(model_dir)
|
||||
nlp_2 = util.load_model(model_dir)
|
||||
# This should be the modified tokenizer
|
||||
doc_2 = nlp_2(test_string)
|
||||
result_2 = [token.text for token in doc_2]
|
||||
assert result_1b == result_2
|
||||
|
||||
|
||||
def customize_tokenizer(nlp):
|
||||
prefix_re = util.compile_prefix_regex(nlp.Defaults.prefixes)
|
||||
suffix_re = util.compile_suffix_regex(nlp.Defaults.suffixes)
|
||||
infix_re = util.compile_infix_regex(nlp.Defaults.infixes)
|
||||
# Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
|
||||
exceptions = {
|
||||
k: v
|
||||
for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
|
||||
if not (len(k) == 2 and k[1] == ".")
|
||||
}
|
||||
new_tokenizer = Tokenizer(
|
||||
nlp.vocab,
|
||||
exceptions,
|
||||
prefix_search=prefix_re.search,
|
||||
suffix_search=suffix_re.search,
|
||||
infix_finditer=infix_re.finditer,
|
||||
token_match=nlp.tokenizer.token_match,
|
||||
)
|
||||
nlp.tokenizer = new_tokenizer
|
|
@ -1,34 +0,0 @@
|
|||
from spacy.lang.en import English
|
||||
from spacy.pipeline import EntityRuler
|
||||
|
||||
|
||||
def test_issue4267():
|
||||
""" Test that running an entity_ruler after ner gives consistent results"""
|
||||
nlp = English()
|
||||
ner = nlp.create_pipe("ner")
|
||||
ner.add_label("PEOPLE")
|
||||
nlp.add_pipe(ner)
|
||||
nlp.begin_training()
|
||||
|
||||
assert "ner" in nlp.pipe_names
|
||||
|
||||
# assert that we have correct IOB annotations
|
||||
doc1 = nlp("hi")
|
||||
assert doc1.is_nered
|
||||
for token in doc1:
|
||||
assert token.ent_iob == 2
|
||||
|
||||
# add entity ruler and run again
|
||||
ruler = EntityRuler(nlp)
|
||||
patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
|
||||
|
||||
ruler.add_patterns(patterns)
|
||||
nlp.add_pipe(ruler)
|
||||
assert "entity_ruler" in nlp.pipe_names
|
||||
assert "ner" in nlp.pipe_names
|
||||
|
||||
# assert that we still have correct IOB annotations
|
||||
doc2 = nlp("hi")
|
||||
assert doc2.is_nered
|
||||
for token in doc2:
|
||||
assert token.ent_iob == 2
|
|
@ -1,9 +0,0 @@
|
|||
from spacy.lang.el import Greek
|
||||
|
||||
|
||||
def test_issue4272():
|
||||
"""Test that lookup table can be accessed from Token.lemma if no POS tags
|
||||
are available."""
|
||||
nlp = Greek()
|
||||
doc = nlp("Χθες")
|
||||
assert doc[0].lemma_
|
|
@ -1,25 +0,0 @@
|
|||
import pytest
|
||||
from spacy.language import Language
|
||||
from spacy.pipeline import Pipe
|
||||
|
||||
|
||||
class DummyPipe(Pipe):
|
||||
def __init__(self):
|
||||
self.model = "dummy_model"
|
||||
|
||||
def predict(self, docs):
|
||||
return ([1, 2, 3], [4, 5, 6])
|
||||
|
||||
def set_annotations(self, docs, scores, tensors=None):
|
||||
return docs
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def nlp():
|
||||
return Language()
|
||||
|
||||
|
||||
def test_multiple_predictions(nlp):
|
||||
doc = nlp.make_doc("foo")
|
||||
dummy_pipe = DummyPipe()
|
||||
dummy_pipe(doc)
|
|
@ -1,47 +0,0 @@
|
|||
from collections import defaultdict
|
||||
|
||||
import pytest
|
||||
|
||||
from spacy.pipeline.defaults import default_ner
|
||||
from spacy.pipeline import EntityRecognizer
|
||||
|
||||
from spacy.lang.en import English
|
||||
from spacy.tokens import Span
|
||||
|
||||
|
||||
# skipped after removing Beam stuff during the Example/GoldParse refactor
|
||||
@pytest.mark.skip
|
||||
def test_issue4313():
|
||||
""" This should not crash or exit with some strange error code """
|
||||
beam_width = 16
|
||||
beam_density = 0.0001
|
||||
nlp = English()
|
||||
config = {
|
||||
"learn_tokens": False,
|
||||
"min_action_freq": 30,
|
||||
"beam_width": 1,
|
||||
"beam_update_prob": 1.0,
|
||||
}
|
||||
ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
|
||||
ner.add_label("SOME_LABEL")
|
||||
ner.begin_training([])
|
||||
nlp.add_pipe(ner)
|
||||
|
||||
# add a new label to the doc
|
||||
doc = nlp("What do you think about Apple ?")
|
||||
assert len(ner.labels) == 1
|
||||
assert "SOME_LABEL" in ner.labels
|
||||
apple_ent = Span(doc, 5, 6, label="MY_ORG")
|
||||
doc.ents = list(doc.ents) + [apple_ent]
|
||||
|
||||
# ensure the beam_parse still works with the new label
|
||||
docs = [doc]
|
||||
beams = nlp.entity.beam_parse(
|
||||
docs, beam_width=beam_width, beam_density=beam_density
|
||||
)
|
||||
|
||||
for doc, beam in zip(docs, beams):
|
||||
entity_scores = defaultdict(float)
|
||||
for score, ents in nlp.entity.moves.get_beam_parses(beam):
|
||||
for start, end, label in ents:
|
||||
entity_scores[(start, end, label)] += score
|
|
@ -1,24 +0,0 @@
|
|||
from spacy.gold import Example
|
||||
from spacy.lang.en import English
|
||||
from spacy.util import minibatch
|
||||
from thinc.api import compounding
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||
def test_issue4348():
|
||||
"""Test that training the tagger with empty data, doesn't throw errors"""
|
||||
|
||||
nlp = English()
|
||||
example = Example.from_dict(nlp.make_doc(""), {"tags": []})
|
||||
TRAIN_DATA = [example, example]
|
||||
|
||||
tagger = nlp.create_pipe("tagger")
|
||||
nlp.add_pipe(tagger)
|
||||
|
||||
optimizer = nlp.begin_training()
|
||||
for i in range(5):
|
||||
losses = {}
|
||||
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
|
||||
for batch in batches:
|
||||
nlp.update(batch, sgd=optimizer, losses=losses)
|
|
@ -1,8 +0,0 @@
|
|||
from spacy.tokens import DocBin
|
||||
|
||||
|
||||
def test_issue4367():
|
||||
"""Test that docbin init goes well"""
|
||||
DocBin()
|
||||
DocBin(attrs=["LEMMA"])
|
||||
DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
|
|
@ -1,10 +0,0 @@
|
|||
from spacy.matcher import Matcher, PhraseMatcher
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
|
||||
def test_issue4373():
|
||||
"""Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
|
||||
matcher = Matcher(Vocab())
|
||||
assert isinstance(matcher.vocab, Vocab)
|
||||
matcher = PhraseMatcher(Vocab())
|
||||
assert isinstance(matcher.vocab, Vocab)
|
|
@ -1,98 +0,0 @@
|
|||
from spacy.gold import Corpus
|
||||
from spacy.lang.en import English
|
||||
|
||||
from ..util import make_tempdir
|
||||
from ...gold.converters import json2docs
|
||||
from ...tokens import DocBin
|
||||
|
||||
|
||||
def test_issue4402():
|
||||
nlp = English()
|
||||
attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
|
||||
with make_tempdir() as tmpdir:
|
||||
output_file = tmpdir / "test4402.spacy"
|
||||
docs = json2docs([json_data])
|
||||
data = DocBin(docs=docs, attrs=attrs).to_bytes()
|
||||
with output_file.open("wb") as file_:
|
||||
file_.write(data)
|
||||
corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
|
||||
|
||||
train_data = list(corpus.train_dataset(nlp))
|
||||
assert len(train_data) == 2
|
||||
|
||||
split_train_data = []
|
||||
for eg in train_data:
|
||||
split_train_data.extend(eg.split_sents())
|
||||
assert len(split_train_data) == 4
|
||||
|
||||
|
||||
json_data = {
|
||||
"id": 0,
|
||||
"paragraphs": [
|
||||
{
|
||||
"raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
|
||||
"sentences": [
|
||||
{
|
||||
"tokens": [
|
||||
{"id": 0, "orth": "How", "ner": "O"},
|
||||
{"id": 1, "orth": "should", "ner": "O"},
|
||||
{"id": 2, "orth": "I", "ner": "O"},
|
||||
{"id": 3, "orth": "cook", "ner": "O"},
|
||||
{"id": 4, "orth": "bacon", "ner": "O"},
|
||||
{"id": 5, "orth": "in", "ner": "O"},
|
||||
{"id": 6, "orth": "an", "ner": "O"},
|
||||
{"id": 7, "orth": "oven", "ner": "O"},
|
||||
{"id": 8, "orth": "?", "ner": "O"},
|
||||
],
|
||||
"brackets": [],
|
||||
},
|
||||
{
|
||||
"tokens": [
|
||||
{"id": 9, "orth": "\n", "ner": "O"},
|
||||
{"id": 10, "orth": "I", "ner": "O"},
|
||||
{"id": 11, "orth": "'ve", "ner": "O"},
|
||||
{"id": 12, "orth": "heard", "ner": "O"},
|
||||
{"id": 13, "orth": "of", "ner": "O"},
|
||||
{"id": 14, "orth": "people", "ner": "O"},
|
||||
{"id": 15, "orth": "cooking", "ner": "O"},
|
||||
{"id": 16, "orth": "bacon", "ner": "O"},
|
||||
{"id": 17, "orth": "in", "ner": "O"},
|
||||
{"id": 18, "orth": "an", "ner": "O"},
|
||||
{"id": 19, "orth": "oven", "ner": "O"},
|
||||
{"id": 20, "orth": ".", "ner": "O"},
|
||||
],
|
||||
"brackets": [],
|
||||
},
|
||||
],
|
||||
"cats": [
|
||||
{"label": "baking", "value": 1.0},
|
||||
{"label": "not_baking", "value": 0.0},
|
||||
],
|
||||
},
|
||||
{
|
||||
"raw": "What is the difference between white and brown eggs?\n",
|
||||
"sentences": [
|
||||
{
|
||||
"tokens": [
|
||||
{"id": 0, "orth": "What", "ner": "O"},
|
||||
{"id": 1, "orth": "is", "ner": "O"},
|
||||
{"id": 2, "orth": "the", "ner": "O"},
|
||||
{"id": 3, "orth": "difference", "ner": "O"},
|
||||
{"id": 4, "orth": "between", "ner": "O"},
|
||||
{"id": 5, "orth": "white", "ner": "O"},
|
||||
{"id": 6, "orth": "and", "ner": "O"},
|
||||
{"id": 7, "orth": "brown", "ner": "O"},
|
||||
{"id": 8, "orth": "eggs", "ner": "O"},
|
||||
{"id": 9, "orth": "?", "ner": "O"},
|
||||
],
|
||||
"brackets": [],
|
||||
},
|
||||
{"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
|
||||
],
|
||||
"cats": [
|
||||
{"label": "baking", "value": 0.0},
|
||||
{"label": "not_baking", "value": 1.0},
|
||||
],
|
||||
},
|
||||
],
|
||||
}
|
288
spacy/tests/regression/test_issue4501-5000.py
Normal file
288
spacy/tests/regression/test_issue4501-5000.py
Normal file
|
@ -0,0 +1,288 @@
|
|||
import pytest
|
||||
from mock import Mock
|
||||
from spacy.pipeline import EntityRuler
|
||||
from spacy.matcher import DependencyMatcher
|
||||
from spacy.tokens import Doc, Span, DocBin
|
||||
from spacy.gold import Example
|
||||
from spacy.gold.converters.conllu2docs import conllu2docs
|
||||
from spacy.lang.en import English
|
||||
from spacy.kb import KnowledgeBase
|
||||
from spacy.vocab import Vocab
|
||||
from spacy.language import Language
|
||||
from spacy.util import ensure_path, load_model_from_path
|
||||
import numpy
|
||||
import pickle
|
||||
|
||||
from ..util import get_doc, make_tempdir
|
||||
|
||||
|
||||
def test_issue4528(en_vocab):
|
||||
"""Test that user_data is correctly serialized in DocBin."""
|
||||
doc = Doc(en_vocab, words=["hello", "world"])
|
||||
doc.user_data["foo"] = "bar"
|
||||
# This is how extension attribute values are stored in the user data
|
||||
doc.user_data[("._.", "foo", None, None)] = "bar"
|
||||
doc_bin = DocBin(store_user_data=True)
|
||||
doc_bin.add(doc)
|
||||
doc_bin_bytes = doc_bin.to_bytes()
|
||||
new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes)
|
||||
new_doc = list(new_doc_bin.get_docs(en_vocab))[0]
|
||||
assert new_doc.user_data["foo"] == "bar"
|
||||
assert new_doc.user_data[("._.", "foo", None, None)] == "bar"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])]
|
||||
)
|
||||
def test_gold_misaligned(en_tokenizer, text, words):
|
||||
doc = en_tokenizer(text)
|
||||
Example.from_dict(doc, {"words": words})
|
||||
|
||||
|
||||
def test_issue4590(en_vocab):
|
||||
"""Test that matches param in on_match method are the same as matches run with no on_match method"""
|
||||
pattern = [
|
||||
{"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
|
||||
{
|
||||
"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
|
||||
"PATTERN": {"ORTH": "fox"},
|
||||
},
|
||||
{
|
||||
"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
|
||||
"PATTERN": {"ORTH": "fox"},
|
||||
},
|
||||
]
|
||||
|
||||
on_match = Mock()
|
||||
matcher = DependencyMatcher(en_vocab)
|
||||
matcher.add("pattern", on_match, pattern)
|
||||
text = "The quick brown fox jumped over the lazy fox"
|
||||
heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
|
||||
deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]
|
||||
doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
|
||||
matches = matcher(doc)
|
||||
on_match_args = on_match.call_args
|
||||
assert on_match_args[0][3] == matches
|
||||
|
||||
|
||||
def test_issue4651_with_phrase_matcher_attr():
|
||||
"""Test that the EntityRuler PhraseMatcher is deserialize correctly using
|
||||
the method from_disk when the EntityRuler argument phrase_matcher_attr is
|
||||
specified.
|
||||
"""
|
||||
text = "Spacy is a python library for nlp"
|
||||
nlp = English()
|
||||
ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER")
|
||||
patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
|
||||
ruler.add_patterns(patterns)
|
||||
nlp.add_pipe(ruler)
|
||||
doc = nlp(text)
|
||||
res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
|
||||
nlp_reloaded = English()
|
||||
with make_tempdir() as d:
|
||||
file_path = d / "entityruler"
|
||||
ruler.to_disk(file_path)
|
||||
ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
|
||||
nlp_reloaded.add_pipe(ruler_reloaded)
|
||||
doc_reloaded = nlp_reloaded(text)
|
||||
res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
|
||||
assert res == res_reloaded
|
||||
|
||||
|
||||
def test_issue4651_without_phrase_matcher_attr():
|
||||
"""Test that the EntityRuler PhraseMatcher is deserialize correctly using
|
||||
the method from_disk when the EntityRuler argument phrase_matcher_attr is
|
||||
not specified.
|
||||
"""
|
||||
text = "Spacy is a python library for nlp"
|
||||
nlp = English()
|
||||
ruler = EntityRuler(nlp)
|
||||
patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
|
||||
ruler.add_patterns(patterns)
|
||||
nlp.add_pipe(ruler)
|
||||
doc = nlp(text)
|
||||
res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
|
||||
nlp_reloaded = English()
|
||||
with make_tempdir() as d:
|
||||
file_path = d / "entityruler"
|
||||
ruler.to_disk(file_path)
|
||||
ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
|
||||
nlp_reloaded.add_pipe(ruler_reloaded)
|
||||
doc_reloaded = nlp_reloaded(text)
|
||||
res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
|
||||
assert res == res_reloaded
|
||||
|
||||
|
||||
def test_issue4665():
|
||||
"""
|
||||
conllu2json should not raise an exception if the HEAD column contains an
|
||||
underscore
|
||||
"""
|
||||
input_data = """
|
||||
1 [ _ PUNCT -LRB- _ _ punct _ _
|
||||
2 This _ DET DT _ _ det _ _
|
||||
3 killing _ NOUN NN _ _ nsubj _ _
|
||||
4 of _ ADP IN _ _ case _ _
|
||||
5 a _ DET DT _ _ det _ _
|
||||
6 respected _ ADJ JJ _ _ amod _ _
|
||||
7 cleric _ NOUN NN _ _ nmod _ _
|
||||
8 will _ AUX MD _ _ aux _ _
|
||||
9 be _ AUX VB _ _ aux _ _
|
||||
10 causing _ VERB VBG _ _ root _ _
|
||||
11 us _ PRON PRP _ _ iobj _ _
|
||||
12 trouble _ NOUN NN _ _ dobj _ _
|
||||
13 for _ ADP IN _ _ case _ _
|
||||
14 years _ NOUN NNS _ _ nmod _ _
|
||||
15 to _ PART TO _ _ mark _ _
|
||||
16 come _ VERB VB _ _ acl _ _
|
||||
17 . _ PUNCT . _ _ punct _ _
|
||||
18 ] _ PUNCT -RRB- _ _ punct _ _
|
||||
"""
|
||||
conllu2docs(input_data)
|
||||
|
||||
|
||||
def test_issue4674():
|
||||
"""Test that setting entities with overlapping identifiers does not mess up IO"""
|
||||
nlp = English()
|
||||
kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
|
||||
vector1 = [0.9, 1.1, 1.01]
|
||||
vector2 = [1.8, 2.25, 2.01]
|
||||
with pytest.warns(UserWarning):
|
||||
kb.set_entities(
|
||||
entity_list=["Q1", "Q1"],
|
||||
freq_list=[32, 111],
|
||||
vector_list=[vector1, vector2],
|
||||
)
|
||||
assert kb.get_size_entities() == 1
|
||||
# dumping to file & loading back in
|
||||
with make_tempdir() as d:
|
||||
dir_path = ensure_path(d)
|
||||
if not dir_path.exists():
|
||||
dir_path.mkdir()
|
||||
file_path = dir_path / "kb"
|
||||
kb.dump(str(file_path))
|
||||
kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
|
||||
kb2.load_bulk(str(file_path))
|
||||
assert kb2.get_size_entities() == 1
|
||||
|
||||
|
||||
def test_issue4707():
|
||||
"""Tests that disabled component names are also excluded from nlp.from_disk
|
||||
by default when loading a model.
|
||||
"""
|
||||
nlp = English()
|
||||
nlp.add_pipe(nlp.create_pipe("sentencizer"))
|
||||
nlp.add_pipe(nlp.create_pipe("entity_ruler"))
|
||||
assert nlp.pipe_names == ["sentencizer", "entity_ruler"]
|
||||
exclude = ["tokenizer", "sentencizer"]
|
||||
with make_tempdir() as tmpdir:
|
||||
nlp.to_disk(tmpdir, exclude=exclude)
|
||||
new_nlp = load_model_from_path(tmpdir, disable=exclude)
|
||||
assert "sentencizer" not in new_nlp.pipe_names
|
||||
assert "entity_ruler" in new_nlp.pipe_names
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||
def test_issue4725_1():
|
||||
""" Ensure the pickling of the NER goes well"""
|
||||
vocab = Vocab(vectors_name="test_vocab_add_vector")
|
||||
nlp = English(vocab=vocab)
|
||||
ner = nlp.create_pipe("ner", config={"min_action_freq": 342})
|
||||
with make_tempdir() as tmp_path:
|
||||
with (tmp_path / "ner.pkl").open("wb") as file_:
|
||||
pickle.dump(ner, file_)
|
||||
assert ner.cfg["min_action_freq"] == 342
|
||||
|
||||
with (tmp_path / "ner.pkl").open("rb") as file_:
|
||||
ner2 = pickle.load(file_)
|
||||
assert ner2.cfg["min_action_freq"] == 342
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||
def test_issue4725_2():
|
||||
# ensures that this runs correctly and doesn't hang or crash because of the global vectors
|
||||
# if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows)
|
||||
vocab = Vocab(vectors_name="test_vocab_add_vector")
|
||||
data = numpy.ndarray((5, 3), dtype="f")
|
||||
data[0] = 1.0
|
||||
data[1] = 2.0
|
||||
vocab.set_vector("cat", data[0])
|
||||
vocab.set_vector("dog", data[1])
|
||||
nlp = English(vocab=vocab)
|
||||
ner = nlp.create_pipe("ner")
|
||||
nlp.add_pipe(ner)
|
||||
nlp.begin_training()
|
||||
docs = ["Kurt is in London."] * 10
|
||||
for _ in nlp.pipe(docs, batch_size=2, n_process=2):
|
||||
pass
|
||||
|
||||
|
||||
def test_issue4849():
|
||||
nlp = English()
|
||||
ruler = EntityRuler(
|
||||
nlp,
|
||||
patterns=[
|
||||
{"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
|
||||
{"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
|
||||
],
|
||||
phrase_matcher_attr="LOWER",
|
||||
)
|
||||
nlp.add_pipe(ruler)
|
||||
text = """
|
||||
The left is starting to take aim at Democratic front-runner Joe Biden.
|
||||
Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy."
|
||||
"""
|
||||
# USING 1 PROCESS
|
||||
count_ents = 0
|
||||
for doc in nlp.pipe([text], n_process=1):
|
||||
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
|
||||
assert count_ents == 2
|
||||
# USING 2 PROCESSES
|
||||
count_ents = 0
|
||||
for doc in nlp.pipe([text], n_process=2):
|
||||
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
|
||||
assert count_ents == 2
|
||||
|
||||
|
||||
class CustomPipe:
|
||||
name = "my_pipe"
|
||||
|
||||
def __init__(self):
|
||||
Span.set_extension("my_ext", getter=self._get_my_ext)
|
||||
Doc.set_extension("my_ext", default=None)
|
||||
|
||||
def __call__(self, doc):
|
||||
gathered_ext = []
|
||||
for sent in doc.sents:
|
||||
sent_ext = self._get_my_ext(sent)
|
||||
sent._.set("my_ext", sent_ext)
|
||||
gathered_ext.append(sent_ext)
|
||||
|
||||
doc._.set("my_ext", "\n".join(gathered_ext))
|
||||
|
||||
return doc
|
||||
|
||||
@staticmethod
|
||||
def _get_my_ext(span):
|
||||
return str(span.end)
|
||||
|
||||
|
||||
def test_issue4903():
|
||||
"""Ensure that this runs correctly and doesn't hang or crash on Windows /
|
||||
macOS."""
|
||||
nlp = English()
|
||||
custom_component = CustomPipe()
|
||||
nlp.add_pipe(nlp.create_pipe("sentencizer"))
|
||||
nlp.add_pipe(custom_component, after="sentencizer")
|
||||
|
||||
text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
|
||||
docs = list(nlp.pipe(text, n_process=2))
|
||||
assert docs[0].text == "I like bananas."
|
||||
assert docs[1].text == "Do you like them?"
|
||||
assert docs[2].text == "No, I prefer wasabi."
|
||||
|
||||
|
||||
def test_issue4924():
|
||||
nlp = Language()
|
||||
example = Example.from_dict(nlp.make_doc(""), {})
|
||||
nlp.evaluate([example])
|
|
@ -1,16 +0,0 @@
|
|||
from spacy.tokens import Doc, DocBin
|
||||
|
||||
|
||||
def test_issue4528(en_vocab):
|
||||
"""Test that user_data is correctly serialized in DocBin."""
|
||||
doc = Doc(en_vocab, words=["hello", "world"])
|
||||
doc.user_data["foo"] = "bar"
|
||||
# This is how extension attribute values are stored in the user data
|
||||
doc.user_data[("._.", "foo", None, None)] = "bar"
|
||||
doc_bin = DocBin(store_user_data=True)
|
||||
doc_bin.add(doc)
|
||||
doc_bin_bytes = doc_bin.to_bytes()
|
||||
new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes)
|
||||
new_doc = list(new_doc_bin.get_docs(en_vocab))[0]
|
||||
assert new_doc.user_data["foo"] == "bar"
|
||||
assert new_doc.user_data[("._.", "foo", None, None)] == "bar"
|
|
@ -1,11 +0,0 @@
|
|||
import pytest
|
||||
|
||||
from spacy.gold import Example
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])]
|
||||
)
|
||||
def test_gold_misaligned(en_tokenizer, text, words):
|
||||
doc = en_tokenizer(text)
|
||||
Example.from_dict(doc, {"words": words})
|
|
@ -1,35 +0,0 @@
|
|||
from mock import Mock
|
||||
from spacy.matcher import DependencyMatcher
|
||||
from ..util import get_doc
|
||||
|
||||
|
||||
def test_issue4590(en_vocab):
|
||||
"""Test that matches param in on_match method are the same as matches run with no on_match method"""
|
||||
pattern = [
|
||||
{"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
|
||||
{
|
||||
"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
|
||||
"PATTERN": {"ORTH": "fox"},
|
||||
},
|
||||
{
|
||||
"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
|
||||
"PATTERN": {"ORTH": "fox"},
|
||||
},
|
||||
]
|
||||
|
||||
on_match = Mock()
|
||||
|
||||
matcher = DependencyMatcher(en_vocab)
|
||||
matcher.add("pattern", on_match, pattern)
|
||||
|
||||
text = "The quick brown fox jumped over the lazy fox"
|
||||
heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
|
||||
deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]
|
||||
|
||||
doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
|
||||
|
||||
matches = matcher(doc)
|
||||
|
||||
on_match_args = on_match.call_args
|
||||
|
||||
assert on_match_args[0][3] == matches
|
|
@ -1,62 +0,0 @@
|
|||
from spacy.lang.en import English
|
||||
from spacy.pipeline import EntityRuler
|
||||
|
||||
from ..util import make_tempdir
|
||||
|
||||
|
||||
def test_issue4651_with_phrase_matcher_attr():
|
||||
"""Test that the EntityRuler PhraseMatcher is deserialize correctly using
|
||||
the method from_disk when the EntityRuler argument phrase_matcher_attr is
|
||||
specified.
|
||||
"""
|
||||
text = "Spacy is a python library for nlp"
|
||||
|
||||
nlp = English()
|
||||
ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER")
|
||||
patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
|
||||
ruler.add_patterns(patterns)
|
||||
nlp.add_pipe(ruler)
|
||||
|
||||
doc = nlp(text)
|
||||
res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
|
||||
|
||||
nlp_reloaded = English()
|
||||
with make_tempdir() as d:
|
||||
file_path = d / "entityruler"
|
||||
ruler.to_disk(file_path)
|
||||
ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
|
||||
|
||||
nlp_reloaded.add_pipe(ruler_reloaded)
|
||||
doc_reloaded = nlp_reloaded(text)
|
||||
res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
|
||||
|
||||
assert res == res_reloaded
|
||||
|
||||
|
||||
def test_issue4651_without_phrase_matcher_attr():
|
||||
"""Test that the EntityRuler PhraseMatcher is deserialize correctly using
|
||||
the method from_disk when the EntityRuler argument phrase_matcher_attr is
|
||||
not specified.
|
||||
"""
|
||||
text = "Spacy is a python library for nlp"
|
||||
|
||||
nlp = English()
|
||||
ruler = EntityRuler(nlp)
|
||||
patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
|
||||
ruler.add_patterns(patterns)
|
||||
nlp.add_pipe(ruler)
|
||||
|
||||
doc = nlp(text)
|
||||
res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
|
||||
|
||||
nlp_reloaded = English()
|
||||
with make_tempdir() as d:
|
||||
file_path = d / "entityruler"
|
||||
ruler.to_disk(file_path)
|
||||
ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
|
||||
|
||||
nlp_reloaded.add_pipe(ruler_reloaded)
|
||||
doc_reloaded = nlp_reloaded(text)
|
||||
res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
|
||||
|
||||
assert res == res_reloaded
|
|
@ -1,35 +0,0 @@
|
|||
import pytest
|
||||
|
||||
# TODO
|
||||
# from spacy.gold.converters.conllu2docs import conllu2docs
|
||||
|
||||
input_data = """
|
||||
1 [ _ PUNCT -LRB- _ _ punct _ _
|
||||
2 This _ DET DT _ _ det _ _
|
||||
3 killing _ NOUN NN _ _ nsubj _ _
|
||||
4 of _ ADP IN _ _ case _ _
|
||||
5 a _ DET DT _ _ det _ _
|
||||
6 respected _ ADJ JJ _ _ amod _ _
|
||||
7 cleric _ NOUN NN _ _ nmod _ _
|
||||
8 will _ AUX MD _ _ aux _ _
|
||||
9 be _ AUX VB _ _ aux _ _
|
||||
10 causing _ VERB VBG _ _ root _ _
|
||||
11 us _ PRON PRP _ _ iobj _ _
|
||||
12 trouble _ NOUN NN _ _ dobj _ _
|
||||
13 for _ ADP IN _ _ case _ _
|
||||
14 years _ NOUN NNS _ _ nmod _ _
|
||||
15 to _ PART TO _ _ mark _ _
|
||||
16 come _ VERB VB _ _ acl _ _
|
||||
17 . _ PUNCT . _ _ punct _ _
|
||||
18 ] _ PUNCT -RRB- _ _ punct _ _
|
||||
"""
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_issue4665():
|
||||
"""
|
||||
conllu2json should not raise an exception if the HEAD column contains an
|
||||
underscore
|
||||
"""
|
||||
pass
|
||||
# conllu2json(input_data)
|
|
@ -1,36 +0,0 @@
|
|||
import pytest
|
||||
from spacy.kb import KnowledgeBase
|
||||
from spacy.util import ensure_path
|
||||
from spacy.lang.en import English
|
||||
|
||||
from ..util import make_tempdir
|
||||
|
||||
|
||||
def test_issue4674():
|
||||
"""Test that setting entities with overlapping identifiers does not mess up IO"""
|
||||
nlp = English()
|
||||
kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
|
||||
|
||||
vector1 = [0.9, 1.1, 1.01]
|
||||
vector2 = [1.8, 2.25, 2.01]
|
||||
with pytest.warns(UserWarning):
|
||||
kb.set_entities(
|
||||
entity_list=["Q1", "Q1"],
|
||||
freq_list=[32, 111],
|
||||
vector_list=[vector1, vector2],
|
||||
)
|
||||
|
||||
assert kb.get_size_entities() == 1
|
||||
|
||||
# dumping to file & loading back in
|
||||
with make_tempdir() as d:
|
||||
dir_path = ensure_path(d)
|
||||
if not dir_path.exists():
|
||||
dir_path.mkdir()
|
||||
file_path = dir_path / "kb"
|
||||
kb.dump(str(file_path))
|
||||
|
||||
kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
|
||||
kb2.load_bulk(str(file_path))
|
||||
|
||||
assert kb2.get_size_entities() == 1
|
|
@ -1,20 +0,0 @@
|
|||
from spacy.util import load_model_from_path
|
||||
from spacy.lang.en import English
|
||||
|
||||
from ..util import make_tempdir
|
||||
|
||||
|
||||
def test_issue4707():
|
||||
"""Tests that disabled component names are also excluded from nlp.from_disk
|
||||
by default when loading a model.
|
||||
"""
|
||||
nlp = English()
|
||||
nlp.add_pipe(nlp.create_pipe("sentencizer"))
|
||||
nlp.add_pipe(nlp.create_pipe("entity_ruler"))
|
||||
assert nlp.pipe_names == ["sentencizer", "entity_ruler"]
|
||||
exclude = ["tokenizer", "sentencizer"]
|
||||
with make_tempdir() as tmpdir:
|
||||
nlp.to_disk(tmpdir, exclude=exclude)
|
||||
new_nlp = load_model_from_path(tmpdir, disable=exclude)
|
||||
assert "sentencizer" not in new_nlp.pipe_names
|
||||
assert "entity_ruler" in new_nlp.pipe_names
|
|
@ -1,41 +0,0 @@
|
|||
import pickle
|
||||
import numpy
|
||||
|
||||
from spacy.lang.en import English
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
from spacy.tests.util import make_tempdir
|
||||
|
||||
|
||||
def test_pickle_ner():
|
||||
""" Ensure the pickling of the NER goes well"""
|
||||
vocab = Vocab(vectors_name="test_vocab_add_vector")
|
||||
nlp = English(vocab=vocab)
|
||||
ner = nlp.create_pipe("ner", config={"min_action_freq": 342})
|
||||
with make_tempdir() as tmp_path:
|
||||
with (tmp_path / "ner.pkl").open("wb") as file_:
|
||||
pickle.dump(ner, file_)
|
||||
assert ner.cfg["min_action_freq"] == 342
|
||||
|
||||
with (tmp_path / "ner.pkl").open("rb") as file_:
|
||||
ner2 = pickle.load(file_)
|
||||
assert ner2.cfg["min_action_freq"] == 342
|
||||
|
||||
|
||||
def test_issue4725():
|
||||
# ensures that this runs correctly and doesn't hang or crash because of the global vectors
|
||||
# if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows)
|
||||
vocab = Vocab(vectors_name="test_vocab_add_vector")
|
||||
data = numpy.ndarray((5, 3), dtype="f")
|
||||
data[0] = 1.0
|
||||
data[1] = 2.0
|
||||
vocab.set_vector("cat", data[0])
|
||||
vocab.set_vector("dog", data[1])
|
||||
|
||||
nlp = English(vocab=vocab)
|
||||
ner = nlp.create_pipe("ner")
|
||||
nlp.add_pipe(ner)
|
||||
nlp.begin_training()
|
||||
docs = ["Kurt is in London."] * 10
|
||||
for _ in nlp.pipe(docs, batch_size=2, n_process=2):
|
||||
pass
|
|
@ -1,34 +0,0 @@
|
|||
from spacy.lang.en import English
|
||||
from spacy.pipeline import EntityRuler
|
||||
|
||||
|
||||
def test_issue4849():
|
||||
nlp = English()
|
||||
|
||||
ruler = EntityRuler(
|
||||
nlp,
|
||||
patterns=[
|
||||
{"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
|
||||
{"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
|
||||
],
|
||||
phrase_matcher_attr="LOWER",
|
||||
)
|
||||
|
||||
nlp.add_pipe(ruler)
|
||||
|
||||
text = """
|
||||
The left is starting to take aim at Democratic front-runner Joe Biden.
|
||||
Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy."
|
||||
"""
|
||||
|
||||
# USING 1 PROCESS
|
||||
count_ents = 0
|
||||
for doc in nlp.pipe([text], n_process=1):
|
||||
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
|
||||
assert count_ents == 2
|
||||
|
||||
# USING 2 PROCESSES
|
||||
count_ents = 0
|
||||
for doc in nlp.pipe([text], n_process=2):
|
||||
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
|
||||
assert count_ents == 2
|
|
@ -1,40 +0,0 @@
|
|||
from spacy.lang.en import English
|
||||
from spacy.tokens import Span, Doc
|
||||
|
||||
|
||||
class CustomPipe:
|
||||
name = "my_pipe"
|
||||
|
||||
def __init__(self):
|
||||
Span.set_extension("my_ext", getter=self._get_my_ext)
|
||||
Doc.set_extension("my_ext", default=None)
|
||||
|
||||
def __call__(self, doc):
|
||||
gathered_ext = []
|
||||
for sent in doc.sents:
|
||||
sent_ext = self._get_my_ext(sent)
|
||||
sent._.set("my_ext", sent_ext)
|
||||
gathered_ext.append(sent_ext)
|
||||
|
||||
doc._.set("my_ext", "\n".join(gathered_ext))
|
||||
|
||||
return doc
|
||||
|
||||
@staticmethod
|
||||
def _get_my_ext(span):
|
||||
return str(span.end)
|
||||
|
||||
|
||||
def test_issue4903():
|
||||
# ensures that this runs correctly and doesn't hang or crash on Windows / macOS
|
||||
|
||||
nlp = English()
|
||||
custom_component = CustomPipe()
|
||||
nlp.add_pipe(nlp.create_pipe("sentencizer"))
|
||||
nlp.add_pipe(custom_component, after="sentencizer")
|
||||
|
||||
text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
|
||||
docs = list(nlp.pipe(text, n_process=2))
|
||||
assert docs[0].text == "I like bananas."
|
||||
assert docs[1].text == "Do you like them?"
|
||||
assert docs[2].text == "No, I prefer wasabi."
|
|
@ -1,8 +0,0 @@
|
|||
from spacy.gold import Example
|
||||
from spacy.language import Language
|
||||
|
||||
|
||||
def test_issue4924():
|
||||
nlp = Language()
|
||||
example = Example.from_dict(nlp.make_doc(""), {})
|
||||
nlp.evaluate([example])
|
|
@ -1,6 +1,8 @@
|
|||
import pytest
|
||||
from spacy.lang.en import English
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||
def test_issue5152():
|
||||
# Test that the comparison between a Span and a Token, goes well
|
||||
# There was a bug when the number of tokens in the span equaled the number of characters in the token (!)
|
||||
|
@ -8,7 +10,6 @@ def test_issue5152():
|
|||
text = nlp("Talk about being boring!")
|
||||
text_var = nlp("Talk of being boring!")
|
||||
y = nlp("Let")
|
||||
|
||||
span = text[0:3] # Talk about being
|
||||
span_2 = text[0:3] # Talk about being
|
||||
span_3 = text_var[0:3] # Talk of being
|
||||
|
|
|
@ -63,7 +63,8 @@ def tagger():
|
|||
# need to add model for two reasons:
|
||||
# 1. no model leads to error in serialization,
|
||||
# 2. the affected line is the one for model serialization
|
||||
tagger.begin_training(pipeline=nlp.pipeline)
|
||||
with pytest.warns(UserWarning):
|
||||
tagger.begin_training(pipeline=nlp.pipeline)
|
||||
return tagger
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user