Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-13 18:56:36 +03:00)

Commit 8177f25b6c: Merge branch 'develop' of https://github.com/explosion/spaCy into develop
@@ -23,6 +23,7 @@ def test_issue2070():
     assert len(doc) == 11


+@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_issue2179():
     """Test that spurious 'extra_labels' aren't created when initializing NER."""
     nlp = Italian()
@@ -134,6 +135,7 @@ def test_issue2464(en_vocab):
     assert len(matches) == 3


+@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_issue2482():
     """Test we can serialize and deserialize a blank NER or parser model."""
     nlp = Italian()
@@ -138,13 +138,16 @@ def test_issue2782(text, lang_cls):
     assert doc[0].like_num


+@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_issue2800():
     """Test issue that arises when too many labels are added to NER model.
     Used to cause segfault.
     """
     nlp = English()
     train_data = []
-    train_data.extend([Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})])
+    train_data.extend(
+        [Example.from_dict(nlp.make_doc("One sentence"), {"entities": []})]
+    )
     entity_types = [str(i) for i in range(1000)]
     ner = nlp.create_pipe("ner")
     nlp.add_pipe(ner)
@@ -88,6 +88,7 @@ def test_issue3199():
     assert list(doc[0:3].noun_chunks) == []


+@pytest.mark.filterwarnings("ignore::UserWarning")
 def test_issue3209():
     """Test issue that occurred in spaCy nightly where NER labels were being
     mapped to classes incorrectly after loading the model, when the labels
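An aside, not part of the commit: each hunk above adds the same pytest marker, @pytest.mark.filterwarnings("ignore::UserWarning"), which silences UserWarning for that single test instead of filtering warnings globally. A minimal, self-contained sketch of the marker's behaviour (the test name here is invented for illustration):

import warnings

import pytest


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_userwarning_is_ignored():
    # The filter applies only inside this test; other tests still see UserWarning.
    warnings.warn("expected but noisy", UserWarning)
    assert True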
spacy/tests/regression/test_issue3501-4000.py (new file, 472 lines)

@@ -0,0 +1,472 @@
import pytest
from spacy.language import Language
from spacy.vocab import Vocab
from spacy.pipeline import EntityRuler, DependencyParser
from spacy.pipeline.defaults import default_parser
from spacy import displacy, load
from spacy.displacy import parse_deps
from spacy.tokens import Doc, Token
from spacy.matcher import Matcher, PhraseMatcher
from spacy.errors import MatchPatternError
from spacy.util import minibatch
from spacy.gold import Example
from spacy.lang.hi import Hindi
from spacy.lang.es import Spanish
from spacy.lang.en import English
from spacy.attrs import IS_ALPHA
from thinc.api import compounding
import spacy
import srsly
import numpy

from ..util import make_tempdir, get_doc


@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"])
def test_issue3521(en_tokenizer, word):
    tok = en_tokenizer(word)[1]
    # 'not' and 'would' should be stopwords, also in their abbreviated forms
    assert tok.is_stop


def test_issue_3526_1(en_vocab):
    patterns = [
        {"label": "HELLO", "pattern": "hello world"},
        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
    ]
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    ruler_bytes = ruler.to_bytes()
    assert len(ruler) == len(patterns)
    assert len(ruler.labels) == 4
    assert ruler.overwrite
    new_ruler = EntityRuler(nlp)
    new_ruler = new_ruler.from_bytes(ruler_bytes)
    assert len(new_ruler) == len(ruler)
    assert len(new_ruler.labels) == 4
    assert new_ruler.overwrite == ruler.overwrite
    assert new_ruler.ent_id_sep == ruler.ent_id_sep


def test_issue_3526_2(en_vocab):
    patterns = [
        {"label": "HELLO", "pattern": "hello world"},
        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
    ]
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
    new_ruler = EntityRuler(nlp)
    new_ruler = new_ruler.from_bytes(bytes_old_style)
    assert len(new_ruler) == len(ruler)
    for pattern in ruler.patterns:
        assert pattern in new_ruler.patterns
    assert new_ruler.overwrite is not ruler.overwrite


def test_issue_3526_3(en_vocab):
    patterns = [
        {"label": "HELLO", "pattern": "hello world"},
        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
    ]
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    with make_tempdir() as tmpdir:
        out_file = tmpdir / "entity_ruler"
        srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
        new_ruler = EntityRuler(nlp).from_disk(out_file)
        for pattern in ruler.patterns:
            assert pattern in new_ruler.patterns
        assert len(new_ruler) == len(ruler)
        assert new_ruler.overwrite is not ruler.overwrite


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue_3526_4(en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, overwrite_ents=True)
    ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
    nlp.add_pipe(ruler)
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir)
        ruler = nlp.get_pipe("entity_ruler")
        assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert ruler.overwrite is True
        nlp2 = load(tmpdir)
        new_ruler = nlp2.get_pipe("entity_ruler")
        assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert new_ruler.overwrite is True
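Not part of the new file: test_issue_3526_3 above leans on the EntityRuler's on-disk pattern format, which is plain JSONL (one pattern dict per line) written with srsly.write_jsonl. A small sketch of that round trip, using a hypothetical temporary path:

import srsly

patterns = [{"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}]
srsly.write_jsonl("/tmp/entity_ruler.jsonl", patterns)  # hypothetical path
assert list(srsly.read_jsonl("/tmp/entity_ruler.jsonl")) == patterns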
def test_issue3531():
    """Test that displaCy renderer doesn't require "settings" key."""
    example_dep = {
        "words": [
            {"text": "But", "tag": "CCONJ"},
            {"text": "Google", "tag": "PROPN"},
            {"text": "is", "tag": "VERB"},
            {"text": "starting", "tag": "VERB"},
            {"text": "from", "tag": "ADP"},
            {"text": "behind.", "tag": "ADV"},
        ],
        "arcs": [
            {"start": 0, "end": 3, "label": "cc", "dir": "left"},
            {"start": 1, "end": 3, "label": "nsubj", "dir": "left"},
            {"start": 2, "end": 3, "label": "aux", "dir": "left"},
            {"start": 3, "end": 4, "label": "prep", "dir": "right"},
            {"start": 4, "end": 5, "label": "pcomp", "dir": "right"},
        ],
    }
    example_ent = {
        "text": "But Google is starting from behind.",
        "ents": [{"start": 4, "end": 10, "label": "ORG"}],
    }
    dep_html = displacy.render(example_dep, style="dep", manual=True)
    assert dep_html
    ent_html = displacy.render(example_ent, style="ent", manual=True)
    assert ent_html


def test_issue3540(en_vocab):
    words = ["I", "live", "in", "NewYork", "right", "now"]
    tensor = numpy.asarray(
        [[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
        dtype="f",
    )
    doc = Doc(en_vocab, words=words)
    doc.tensor = tensor
    gold_text = ["I", "live", "in", "NewYork", "right", "now"]
    assert [token.text for token in doc] == gold_text
    gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
    assert [token.lemma_ for token in doc] == gold_lemma
    vectors_1 = [token.vector for token in doc]
    assert len(vectors_1) == len(doc)

    with doc.retokenize() as retokenizer:
        heads = [(doc[3], 1), doc[2]]
        attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
        retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)

    gold_text = ["I", "live", "in", "New", "York", "right", "now"]
    assert [token.text for token in doc] == gold_text
    gold_lemma = ["I", "live", "in", "New", "York", "right", "now"]
    assert [token.lemma_ for token in doc] == gold_lemma
    vectors_2 = [token.vector for token in doc]
    assert len(vectors_2) == len(doc)
    assert vectors_1[0].tolist() == vectors_2[0].tolist()
    assert vectors_1[1].tolist() == vectors_2[1].tolist()
    assert vectors_1[2].tolist() == vectors_2[2].tolist()
    assert vectors_1[4].tolist() == vectors_2[5].tolist()
    assert vectors_1[5].tolist() == vectors_2[6].tolist()
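Not part of the new file: the heads passed to retokenizer.split() in test_issue3540 use spaCy's (token, subtoken_index) form, where the integer selects one of the new subtokens as the head. A stripped-down sketch of the same call on a blank English pipeline (assumed setup, not taken from the commit):

from spacy.lang.en import English

nlp = English()
doc = nlp.make_doc("I live in NewYork right now")
with doc.retokenize() as retokenizer:
    # "New" attaches to the second new subtoken ("York"); "York" attaches to "in".
    heads = [(doc[3], 1), doc[2]]
    retokenizer.split(doc[3], ["New", "York"], heads=heads)
assert [t.text for t in doc] == ["I", "live", "in", "New", "York", "right", "now"]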
def test_issue3549(en_vocab):
    """Test that match pattern validation doesn't raise on empty errors."""
    matcher = Matcher(en_vocab, validate=True)
    pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
    matcher.add("GOOD", [pattern])
    with pytest.raises(MatchPatternError):
        matcher.add("BAD", [[{"X": "Y"}]])


@pytest.mark.xfail
def test_issue3555(en_vocab):
    """Test that custom extensions with default None don't break matcher."""
    Token.set_extension("issue3555", default=None)
    matcher = Matcher(en_vocab)
    pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
    matcher.add("TEST", [pattern])
    doc = Doc(en_vocab, words=["have", "apple"])
    matcher(doc)


def test_issue3611():
    """ Test whether adding n-grams in the textcat works even when n > token length of some docs """
    unique_classes = ["offensive", "inoffensive"]
    x_train = [
        "This is an offensive text",
        "This is the second offensive text",
        "inoff",
    ]
    y_train = ["offensive", "offensive", "inoffensive"]
    nlp = spacy.blank("en")
    # preparing the data
    train_data = []
    for text, train_instance in zip(x_train, y_train):
        cat_dict = {label: label == train_instance for label in unique_classes}
        train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
    # add a text categorizer component
    textcat = nlp.create_pipe(
        "textcat",
        config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
    )
    for label in unique_classes:
        textcat.add_label(label)
    nlp.add_pipe(textcat, last=True)
    # training the network
    with nlp.select_pipes(enable="textcat"):
        optimizer = nlp.begin_training(X=x_train, Y=y_train)
        for i in range(3):
            losses = {}
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))

            for batch in batches:
                nlp.update(
                    examples=batch, sgd=optimizer, drop=0.1, losses=losses,
                )
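Not part of the new file: the batch sizes in the loop above come from thinc's compounding(4.0, 32.0, 1.001), a schedule that starts at 4.0 and grows by a factor of 1.001 per step, capped at 32.0; minibatch() reads one value per batch. A rough stand-in re-implementation of that behaviour, for illustration only:

def compounding_schedule(start, stop, compound):
    # Illustrative re-implementation of the schedule's semantics, not thinc's code.
    curr = float(start)
    while True:
        yield min(curr, stop)
        curr *= compound


sizes = compounding_schedule(4.0, 32.0, 1.001)
print([round(next(sizes), 3) for _ in range(3)])  # [4.0, 4.004, 4.008]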
def test_issue3625():
    """Test that default punctuation rules applies to hindi unicode characters"""
    nlp = Hindi()
    doc = nlp("hi. how हुए. होटल, होटल")
    expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
    assert [token.text for token in doc] == expected


def test_issue3803():
    """Test that spanish num-like tokens have True for like_num attribute."""
    nlp = Spanish()
    text = "2 dos 1000 mil 12 doce"
    doc = nlp(text)

    assert [t.like_num for t in doc] == [True, True, True, True, True, True]


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3830_no_subtok():
    """Test that the parser doesn't have subtok label if not learn_tokens"""
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "beam_width": 1,
        "beam_update_prob": 1.0,
    }
    parser = DependencyParser(Vocab(), default_parser(), **config)
    parser.add_label("nsubj")
    assert "subtok" not in parser.labels
    parser.begin_training(lambda: [])
    assert "subtok" not in parser.labels


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3830_with_subtok():
    """Test that the parser does have subtok label if learn_tokens=True."""
    config = {
        "learn_tokens": True,
        "min_action_freq": 30,
        "beam_width": 1,
        "beam_update_prob": 1.0,
    }
    parser = DependencyParser(Vocab(), default_parser(), **config)
    parser.add_label("nsubj")
    assert "subtok" not in parser.labels
    parser.begin_training(lambda: [])
    assert "subtok" in parser.labels


def test_issue3839(en_vocab):
    """Test that match IDs returned by the matcher are correct, are in the string """
    doc = Doc(en_vocab, words=["terrific", "group", "of", "people"])
    matcher = Matcher(en_vocab)
    match_id = "PATTERN"
    pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}]
    pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}]
    matcher.add(match_id, [pattern1])
    matches = matcher(doc)
    assert matches[0][0] == en_vocab.strings[match_id]
    matcher = Matcher(en_vocab)
    matcher.add(match_id, [pattern2])
    matches = matcher(doc)
    assert matches[0][0] == en_vocab.strings[match_id]


@pytest.mark.parametrize(
    "sentence",
    [
        "The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.",
        "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.",
        "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one",
        "Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.",
        "It was a missed assignment, but it shouldn't have resulted in a turnover ...",
    ],
)
def test_issue3869(sentence):
    """Test that the Doc's count_by function works consistently"""
    nlp = English()
    doc = nlp(sentence)
    count = 0
    for token in doc:
        count += token.is_alpha
    assert count == doc.count_by(IS_ALPHA).get(1, 0)
def test_issue3879(en_vocab):
    doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
    assert len(doc) == 5
    pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}]
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [pattern])
    assert len(matcher(doc)) == 2  # fails because of a FP match 'is a test'


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3880():
    """Test that `nlp.pipe()` works when an empty string ends the batch.

    Fixed in v7.0.5 of Thinc.
    """
    texts = ["hello", "world", "", ""]
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("parser"))
    nlp.add_pipe(nlp.create_pipe("ner"))
    nlp.add_pipe(nlp.create_pipe("tagger"))
    nlp.get_pipe("parser").add_label("dep")
    nlp.get_pipe("ner").add_label("PERSON")
    nlp.get_pipe("tagger").add_label("NN")
    nlp.begin_training()
    for doc in nlp.pipe(texts):
        pass


def test_issue3882(en_vocab):
    """Test that displaCy doesn't serialize the doc.user_data when making a
    copy of the Doc.
    """
    doc = Doc(en_vocab, words=["Hello", "world"])
    doc.is_parsed = True
    doc.user_data["test"] = set()
    parse_deps(doc)


def test_issue3951(en_vocab):
    """Test that combinations of optional rules are matched correctly."""
    matcher = Matcher(en_vocab)
    pattern = [
        {"LOWER": "hello"},
        {"LOWER": "this", "OP": "?"},
        {"OP": "?"},
        {"LOWER": "world"},
    ]
    matcher.add("TEST", [pattern])
    doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
    matches = matcher(doc)
    assert len(matches) == 0


def test_issue3959():
    """ Ensure that a modified pos attribute is serialized correctly."""
    nlp = English()
    doc = nlp(
        "displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
    )
    assert doc[0].pos_ == ""
    doc[0].pos_ = "NOUN"
    assert doc[0].pos_ == "NOUN"
    # usually this is already True when starting from proper models instead of blank English
    doc.is_tagged = True
    with make_tempdir() as tmp_dir:
        file_path = tmp_dir / "my_doc"
        doc.to_disk(file_path)
        doc2 = nlp("")
        doc2.from_disk(file_path)
        assert doc2[0].pos_ == "NOUN"


def test_issue3962(en_vocab):
    """ Ensure that as_doc does not result in out-of-bound access of tokens.
    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
    # fmt: off
    words = ["He", "jests", "at", "scars", ",", "that", "never", "felt", "a", "wound", "."]
    heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
    deps = ["nsubj", "ccomp", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
    # fmt: on
    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
    span2 = doc[1:5]  # "jests at scars ,"
    doc2 = span2.as_doc()
    doc2_json = doc2.to_json()
    assert doc2_json
    # head set to itself, being the new artificial root
    assert doc2[0].head.text == "jests"
    assert doc2[0].dep_ == "dep"
    assert doc2[1].head.text == "jests"
    assert doc2[1].dep_ == "prep"
    assert doc2[2].head.text == "at"
    assert doc2[2].dep_ == "pobj"
    assert doc2[3].head.text == "jests"  # head set to the new artificial root
    assert doc2[3].dep_ == "dep"
    # We should still have 1 sentence
    assert len(list(doc2.sents)) == 1
    span3 = doc[6:9]  # "never felt a"
    doc3 = span3.as_doc()
    doc3_json = doc3.to_json()
    assert doc3_json
    assert doc3[0].head.text == "felt"
    assert doc3[0].dep_ == "neg"
    assert doc3[1].head.text == "felt"
    assert doc3[1].dep_ == "ROOT"
    assert doc3[2].head.text == "felt"  # head set to ancestor
    assert doc3[2].dep_ == "dep"
    # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
    assert len(list(doc3.sents)) == 1


def test_issue3962_long(en_vocab):
    """ Ensure that as_doc does not result in out-of-bound access of tokens.
    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
    # fmt: off
    words = ["He", "jests", "at", "scars", ".", "They", "never", "felt", "a", "wound", "."]
    heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
    deps = ["nsubj", "ROOT", "prep", "pobj", "punct", "nsubj", "neg", "ROOT", "det", "dobj", "punct"]
    # fmt: on
    two_sent_doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
    span2 = two_sent_doc[1:7]  # "jests at scars. They never"
    doc2 = span2.as_doc()
    doc2_json = doc2.to_json()
    assert doc2_json
    # head set to itself, being the new artificial root (in sentence 1)
    assert doc2[0].head.text == "jests"
    assert doc2[0].dep_ == "ROOT"
    assert doc2[1].head.text == "jests"
    assert doc2[1].dep_ == "prep"
    assert doc2[2].head.text == "at"
    assert doc2[2].dep_ == "pobj"
    assert doc2[3].head.text == "jests"
    assert doc2[3].dep_ == "punct"
    # head set to itself, being the new artificial root (in sentence 2)
    assert doc2[4].head.text == "They"
    assert doc2[4].dep_ == "dep"
    # head set to the new artificial head (in sentence 2)
    assert doc2[4].head.text == "They"
    assert doc2[4].dep_ == "dep"
    # We should still have 2 sentences
    sents = list(doc2.sents)
    assert len(sents) == 2
    assert sents[0].text == "jests at scars ."
    assert sents[1].text == "They never"


def test_issue3972(en_vocab):
    """Test that the PhraseMatcher returns duplicates for duplicate match IDs.
    """
    matcher = PhraseMatcher(en_vocab)
    matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
    matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
    doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
    matches = matcher(doc)

    assert len(matches) == 2

    # We should have a match for each of the two rules
    found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches]
    assert "A" in found_ids
    assert "B" in found_ids
The remaining hunks delete the old per-issue test files whose contents were consolidated into the new ranged files in this commit.

@@ -1,8 +0,0 @@
import pytest


@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"])
def test_issue3521(en_tokenizer, word):
    tok = en_tokenizer(word)[1]
    # 'not' and 'would' should be stopwords, also in their abbreviated forms
    assert tok.is_stop

@@ -1,85 +0,0 @@
import pytest
from spacy.tokens import Span
from spacy.language import Language
from spacy.pipeline import EntityRuler
from spacy import load
import srsly

from ..util import make_tempdir


@pytest.fixture
def patterns():
    return [
        {"label": "HELLO", "pattern": "hello world"},
        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
    ]


@pytest.fixture
def add_ent():
    def add_ent_component(doc):
        doc.ents = [Span(doc, 0, 3, label=doc.vocab.strings["ORG"])]
        return doc

    return add_ent_component


def test_entity_ruler_existing_overwrite_serialize_bytes(patterns, en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    ruler_bytes = ruler.to_bytes()
    assert len(ruler) == len(patterns)
    assert len(ruler.labels) == 4
    assert ruler.overwrite
    new_ruler = EntityRuler(nlp)
    new_ruler = new_ruler.from_bytes(ruler_bytes)
    assert len(new_ruler) == len(ruler)
    assert len(new_ruler.labels) == 4
    assert new_ruler.overwrite == ruler.overwrite
    assert new_ruler.ent_id_sep == ruler.ent_id_sep


def test_entity_ruler_existing_bytes_old_format_safe(patterns, en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    bytes_old_style = srsly.msgpack_dumps(ruler.patterns)
    new_ruler = EntityRuler(nlp)
    new_ruler = new_ruler.from_bytes(bytes_old_style)
    assert len(new_ruler) == len(ruler)
    for pattern in ruler.patterns:
        assert pattern in new_ruler.patterns
    assert new_ruler.overwrite is not ruler.overwrite


def test_entity_ruler_from_disk_old_format_safe(patterns, en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
    with make_tempdir() as tmpdir:
        out_file = tmpdir / "entity_ruler"
        srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns)
        new_ruler = EntityRuler(nlp).from_disk(out_file)
        for pattern in ruler.patterns:
            assert pattern in new_ruler.patterns
        assert len(new_ruler) == len(ruler)
        assert new_ruler.overwrite is not ruler.overwrite


def test_entity_ruler_in_pipeline_from_issue(patterns, en_vocab):
    nlp = Language(vocab=en_vocab)
    ruler = EntityRuler(nlp, overwrite_ents=True)

    ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
    nlp.add_pipe(ruler)
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir)
        ruler = nlp.get_pipe("entity_ruler")
        assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert ruler.overwrite is True
        nlp2 = load(tmpdir)
        new_ruler = nlp2.get_pipe("entity_ruler")
        assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}]
        assert new_ruler.overwrite is True
@@ -1,30 +0,0 @@
from spacy import displacy


def test_issue3531():
    """Test that displaCy renderer doesn't require "settings" key."""
    example_dep = {
        "words": [
            {"text": "But", "tag": "CCONJ"},
            {"text": "Google", "tag": "PROPN"},
            {"text": "is", "tag": "VERB"},
            {"text": "starting", "tag": "VERB"},
            {"text": "from", "tag": "ADP"},
            {"text": "behind.", "tag": "ADV"},
        ],
        "arcs": [
            {"start": 0, "end": 3, "label": "cc", "dir": "left"},
            {"start": 1, "end": 3, "label": "nsubj", "dir": "left"},
            {"start": 2, "end": 3, "label": "aux", "dir": "left"},
            {"start": 3, "end": 4, "label": "prep", "dir": "right"},
            {"start": 4, "end": 5, "label": "pcomp", "dir": "right"},
        ],
    }
    example_ent = {
        "text": "But Google is starting from behind.",
        "ents": [{"start": 4, "end": 10, "label": "ORG"}],
    }
    dep_html = displacy.render(example_dep, style="dep", manual=True)
    assert dep_html
    ent_html = displacy.render(example_ent, style="ent", manual=True)
    assert ent_html

@@ -1,44 +0,0 @@
from spacy.tokens import Doc

import numpy as np


def test_issue3540(en_vocab):

    words = ["I", "live", "in", "NewYork", "right", "now"]
    tensor = np.asarray(
        [[1.0, 1.1], [2.0, 2.1], [3.0, 3.1], [4.0, 4.1], [5.0, 5.1], [6.0, 6.1]],
        dtype="f",
    )
    doc = Doc(en_vocab, words=words)
    doc.tensor = tensor

    gold_text = ["I", "live", "in", "NewYork", "right", "now"]
    assert [token.text for token in doc] == gold_text

    gold_lemma = ["I", "live", "in", "NewYork", "right", "now"]
    assert [token.lemma_ for token in doc] == gold_lemma

    vectors_1 = [token.vector for token in doc]
    assert len(vectors_1) == len(doc)

    with doc.retokenize() as retokenizer:
        heads = [(doc[3], 1), doc[2]]
        attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["pobj", "compound"]}
        retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)

    gold_text = ["I", "live", "in", "New", "York", "right", "now"]
    assert [token.text for token in doc] == gold_text

    gold_lemma = ["I", "live", "in", "New", "York", "right", "now"]
    assert [token.lemma_ for token in doc] == gold_lemma

    vectors_2 = [token.vector for token in doc]
    assert len(vectors_2) == len(doc)

    assert vectors_1[0].tolist() == vectors_2[0].tolist()
    assert vectors_1[1].tolist() == vectors_2[1].tolist()
    assert vectors_1[2].tolist() == vectors_2[2].tolist()

    assert vectors_1[4].tolist() == vectors_2[5].tolist()
    assert vectors_1[5].tolist() == vectors_2[6].tolist()
@@ -1,12 +0,0 @@
import pytest
from spacy.matcher import Matcher
from spacy.errors import MatchPatternError


def test_issue3549(en_vocab):
    """Test that match pattern validation doesn't raise on empty errors."""
    matcher = Matcher(en_vocab, validate=True)
    pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
    matcher.add("GOOD", [pattern])
    with pytest.raises(MatchPatternError):
        matcher.add("BAD", [[{"X": "Y"}]])

@@ -1,14 +0,0 @@
import pytest
from spacy.tokens import Doc, Token
from spacy.matcher import Matcher


@pytest.mark.xfail
def test_issue3555(en_vocab):
    """Test that custom extensions with default None don't break matcher."""
    Token.set_extension("issue3555", default=None)
    matcher = Matcher(en_vocab)
    pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
    matcher.add("TEST", [pattern])
    doc = Doc(en_vocab, words=["have", "apple"])
    matcher(doc)

@@ -1,45 +0,0 @@
import spacy
from spacy.util import minibatch
from thinc.api import compounding
from spacy.gold import Example


def test_issue3611():
    """ Test whether adding n-grams in the textcat works even when n > token length of some docs """
    unique_classes = ["offensive", "inoffensive"]
    x_train = [
        "This is an offensive text",
        "This is the second offensive text",
        "inoff",
    ]
    y_train = ["offensive", "offensive", "inoffensive"]

    nlp = spacy.blank("en")

    # preparing the data
    train_data = []
    for text, train_instance in zip(x_train, y_train):
        cat_dict = {label: label == train_instance for label in unique_classes}
        train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))

    # add a text categorizer component
    textcat = nlp.create_pipe(
        "textcat",
        config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
    )

    for label in unique_classes:
        textcat.add_label(label)
    nlp.add_pipe(textcat, last=True)

    # training the network
    with nlp.select_pipes(enable="textcat"):
        optimizer = nlp.begin_training(X=x_train, Y=y_train)
        for i in range(3):
            losses = {}
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))

            for batch in batches:
                nlp.update(
                    examples=batch, sgd=optimizer, drop=0.1, losses=losses,
                )
@@ -1,9 +0,0 @@
from spacy.lang.hi import Hindi


def test_issue3625():
    """Test that default punctuation rules applies to hindi unicode characters"""
    nlp = Hindi()
    doc = nlp("hi. how हुए. होटल, होटल")
    expected = ["hi", ".", "how", "हुए", ".", "होटल", ",", "होटल"]
    assert [token.text for token in doc] == expected

@@ -1,10 +0,0 @@
from spacy.lang.es import Spanish


def test_issue3803():
    """Test that spanish num-like tokens have True for like_num attribute."""
    nlp = Spanish()
    text = "2 dos 1000 mil 12 doce"
    doc = nlp(text)

    assert [t.like_num for t in doc] == [True, True, True, True, True, True]

@@ -1,34 +0,0 @@
from spacy.pipeline.pipes import DependencyParser
from spacy.vocab import Vocab

from spacy.pipeline.defaults import default_parser


def test_issue3830_no_subtok():
    """Test that the parser doesn't have subtok label if not learn_tokens"""
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "beam_width": 1,
        "beam_update_prob": 1.0,
    }
    parser = DependencyParser(Vocab(), default_parser(), **config)
    parser.add_label("nsubj")
    assert "subtok" not in parser.labels
    parser.begin_training(lambda: [])
    assert "subtok" not in parser.labels


def test_issue3830_with_subtok():
    """Test that the parser does have subtok label if learn_tokens=True."""
    config = {
        "learn_tokens": True,
        "min_action_freq": 30,
        "beam_width": 1,
        "beam_update_prob": 1.0,
    }
    parser = DependencyParser(Vocab(), default_parser(), **config)
    parser.add_label("nsubj")
    assert "subtok" not in parser.labels
    parser.begin_training(lambda: [])
    assert "subtok" in parser.labels

@@ -1,18 +0,0 @@
from spacy.matcher import Matcher
from spacy.tokens import Doc


def test_issue3839(en_vocab):
    """Test that match IDs returned by the matcher are correct, are in the string """
    doc = Doc(en_vocab, words=["terrific", "group", "of", "people"])
    matcher = Matcher(en_vocab)
    match_id = "PATTERN"
    pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}]
    pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}]
    matcher.add(match_id, [pattern1])
    matches = matcher(doc)
    assert matches[0][0] == en_vocab.strings[match_id]
    matcher = Matcher(en_vocab)
    matcher.add(match_id, [pattern2])
    matches = matcher(doc)
    assert matches[0][0] == en_vocab.strings[match_id]

@@ -1,25 +0,0 @@
import pytest
from spacy.attrs import IS_ALPHA
from spacy.lang.en import English


@pytest.mark.parametrize(
    "sentence",
    [
        "The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.",
        "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's #1.",
        "The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale's number one",
        "Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.",
        "It was a missed assignment, but it shouldn't have resulted in a turnover ...",
    ],
)
def test_issue3869(sentence):
    """Test that the Doc's count_by function works consistently"""
    nlp = English()
    doc = nlp(sentence)

    count = 0
    for token in doc:
        count += token.is_alpha

    assert count == doc.count_by(IS_ALPHA).get(1, 0)
@@ -1,11 +0,0 @@
from spacy.matcher import Matcher
from spacy.tokens import Doc


def test_issue3879(en_vocab):
    doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
    assert len(doc) == 5
    pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}]
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [pattern])
    assert len(matcher(doc)) == 2  # fails because of a FP match 'is a test'

@@ -1,21 +0,0 @@
from spacy.lang.en import English
import pytest


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3880():
    """Test that `nlp.pipe()` works when an empty string ends the batch.

    Fixed in v7.0.5 of Thinc.
    """
    texts = ["hello", "world", "", ""]
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("parser"))
    nlp.add_pipe(nlp.create_pipe("ner"))
    nlp.add_pipe(nlp.create_pipe("tagger"))
    nlp.get_pipe("parser").add_label("dep")
    nlp.get_pipe("ner").add_label("PERSON")
    nlp.get_pipe("tagger").add_label("NN")
    nlp.begin_training()
    for doc in nlp.pipe(texts):
        pass

@@ -1,12 +0,0 @@
from spacy.displacy import parse_deps
from spacy.tokens import Doc


def test_issue3882(en_vocab):
    """Test that displaCy doesn't serialize the doc.user_data when making a
    copy of the Doc.
    """
    doc = Doc(en_vocab, words=["Hello", "world"])
    doc.is_parsed = True
    doc.user_data["test"] = set()
    parse_deps(doc)

@@ -1,17 +0,0 @@
from spacy.matcher import Matcher
from spacy.tokens import Doc


def test_issue3951(en_vocab):
    """Test that combinations of optional rules are matched correctly."""
    matcher = Matcher(en_vocab)
    pattern = [
        {"LOWER": "hello"},
        {"LOWER": "this", "OP": "?"},
        {"OP": "?"},
        {"LOWER": "world"},
    ]
    matcher.add("TEST", [pattern])
    doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
    matches = matcher(doc)
    assert len(matches) == 0

@@ -1,26 +0,0 @@
from spacy.lang.en import English
from ..util import make_tempdir


def test_issue3959():
    """ Ensure that a modified pos attribute is serialized correctly."""
    nlp = English()
    doc = nlp(
        "displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
    )
    assert doc[0].pos_ == ""

    doc[0].pos_ = "NOUN"
    assert doc[0].pos_ == "NOUN"

    # usually this is already True when starting from proper models instead of blank English
    doc.is_tagged = True

    with make_tempdir() as tmp_dir:
        file_path = tmp_dir / "my_doc"
        doc.to_disk(file_path)

        doc2 = nlp("")
        doc2.from_disk(file_path)

        assert doc2[0].pos_ == "NOUN"
@@ -1,117 +0,0 @@
import pytest

from ..util import get_doc


@pytest.fixture
def doc(en_tokenizer):
    text = "He jests at scars, that never felt a wound."
    heads = [1, 6, -1, -1, 3, 2, 1, 0, 1, -2, -3]
    deps = [
        "nsubj",
        "ccomp",
        "prep",
        "pobj",
        "punct",
        "nsubj",
        "neg",
        "ROOT",
        "det",
        "dobj",
        "punct",
    ]
    tokens = en_tokenizer(text)
    return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)


def test_issue3962(doc):
    """ Ensure that as_doc does not result in out-of-bound access of tokens.
    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
    span2 = doc[1:5]  # "jests at scars ,"
    doc2 = span2.as_doc()
    doc2_json = doc2.to_json()
    assert doc2_json

    assert (
        doc2[0].head.text == "jests"
    )  # head set to itself, being the new artificial root
    assert doc2[0].dep_ == "dep"
    assert doc2[1].head.text == "jests"
    assert doc2[1].dep_ == "prep"
    assert doc2[2].head.text == "at"
    assert doc2[2].dep_ == "pobj"
    assert doc2[3].head.text == "jests"  # head set to the new artificial root
    assert doc2[3].dep_ == "dep"

    # We should still have 1 sentence
    assert len(list(doc2.sents)) == 1

    span3 = doc[6:9]  # "never felt a"
    doc3 = span3.as_doc()
    doc3_json = doc3.to_json()
    assert doc3_json

    assert doc3[0].head.text == "felt"
    assert doc3[0].dep_ == "neg"
    assert doc3[1].head.text == "felt"
    assert doc3[1].dep_ == "ROOT"
    assert doc3[2].head.text == "felt"  # head set to ancestor
    assert doc3[2].dep_ == "dep"

    # We should still have 1 sentence as "a" can be attached to "felt" instead of "wound"
    assert len(list(doc3.sents)) == 1


@pytest.fixture
def two_sent_doc(en_tokenizer):
    text = "He jests at scars. They never felt a wound."
    heads = [1, 0, -1, -1, -3, 2, 1, 0, 1, -2, -3]
    deps = [
        "nsubj",
        "ROOT",
        "prep",
        "pobj",
        "punct",
        "nsubj",
        "neg",
        "ROOT",
        "det",
        "dobj",
        "punct",
    ]
    tokens = en_tokenizer(text)
    return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)


def test_issue3962_long(two_sent_doc):
    """ Ensure that as_doc does not result in out-of-bound access of tokens.
    This is achieved by setting the head to itself if it would lie out of the span otherwise."""
    span2 = two_sent_doc[1:7]  # "jests at scars. They never"
    doc2 = span2.as_doc()
    doc2_json = doc2.to_json()
    assert doc2_json

    assert (
        doc2[0].head.text == "jests"
    )  # head set to itself, being the new artificial root (in sentence 1)
    assert doc2[0].dep_ == "ROOT"
    assert doc2[1].head.text == "jests"
    assert doc2[1].dep_ == "prep"
    assert doc2[2].head.text == "at"
    assert doc2[2].dep_ == "pobj"
    assert doc2[3].head.text == "jests"
    assert doc2[3].dep_ == "punct"
    assert (
        doc2[4].head.text == "They"
    )  # head set to itself, being the new artificial root (in sentence 2)
    assert doc2[4].dep_ == "dep"
    assert (
        doc2[4].head.text == "They"
    )  # head set to the new artificial head (in sentence 2)
    assert doc2[4].dep_ == "dep"

    # We should still have 2 sentences
    sents = list(doc2.sents)
    assert len(sents) == 2
    assert sents[0].text == "jests at scars ."
    assert sents[1].text == "They never"

@@ -1,19 +0,0 @@
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc


def test_issue3972(en_vocab):
    """Test that the PhraseMatcher returns duplicates for duplicate match IDs.
    """
    matcher = PhraseMatcher(en_vocab)
    matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
    matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
    doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
    matches = matcher(doc)

    assert len(matches) == 2

    # We should have a match for each of the two rules
    found_ids = [en_vocab.strings[ent_id] for (ent_id, _, _) in matches]
    assert "A" in found_ids
    assert "B" in found_ids
spacy/tests/regression/test_issue4001-4500.py (new file, 469 lines)

@@ -0,0 +1,469 @@
import pytest
from spacy.pipeline import EntityRuler, EntityRecognizer, Pipe
from spacy.pipeline.defaults import default_ner
from spacy.matcher import PhraseMatcher, Matcher
from spacy.tokens import Doc, Span, DocBin
from spacy.gold import Example, Corpus
from spacy.gold.converters import json2docs
from spacy.vocab import Vocab
from spacy.lang.en import English
from spacy.util import minibatch, ensure_path, load_model
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex
from spacy.tokenizer import Tokenizer
from spacy.lang.el import Greek
from spacy.language import Language
import spacy
from thinc.api import compounding
from collections import defaultdict

from ..util import make_tempdir


def test_issue4002(en_vocab):
    """Test that the PhraseMatcher can match on overwritten NORM attributes.
    """
    matcher = PhraseMatcher(en_vocab, attr="NORM")
    pattern1 = Doc(en_vocab, words=["c", "d"])
    assert [t.norm_ for t in pattern1] == ["c", "d"]
    matcher.add("TEST", [pattern1])
    doc = Doc(en_vocab, words=["a", "b", "c", "d"])
    assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
    matches = matcher(doc)
    assert len(matches) == 1
    matcher = PhraseMatcher(en_vocab, attr="NORM")
    pattern2 = Doc(en_vocab, words=["1", "2"])
    pattern2[0].norm_ = "c"
    pattern2[1].norm_ = "d"
    assert [t.norm_ for t in pattern2] == ["c", "d"]
    matcher.add("TEST", [pattern2])
    matches = matcher(doc)
    assert len(matches) == 1


def test_issue4030():
    """ Test whether textcat works fine with empty doc """
    unique_classes = ["offensive", "inoffensive"]
    x_train = [
        "This is an offensive text",
        "This is the second offensive text",
        "inoff",
    ]
    y_train = ["offensive", "offensive", "inoffensive"]
    nlp = spacy.blank("en")
    # preparing the data
    train_data = []
    for text, train_instance in zip(x_train, y_train):
        cat_dict = {label: label == train_instance for label in unique_classes}
        train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
    # add a text categorizer component
    textcat = nlp.create_pipe(
        "textcat",
        config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
    )
    for label in unique_classes:
        textcat.add_label(label)
    nlp.add_pipe(textcat, last=True)
    # training the network
    with nlp.select_pipes(enable="textcat"):
        optimizer = nlp.begin_training()
        for i in range(3):
            losses = {}
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))

            for batch in batches:
                nlp.update(
                    examples=batch, sgd=optimizer, drop=0.1, losses=losses,
                )
    # processing of an empty doc should result in 0.0 for all categories
    doc = nlp("")
    assert doc.cats["offensive"] == 0.0
    assert doc.cats["inoffensive"] == 0.0
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4042():
    """Test that serialization of an EntityRuler before NER works fine."""
    nlp = English()

    # add ner pipe
    ner = nlp.create_pipe("ner")
    ner.add_label("SOME_LABEL")
    nlp.add_pipe(ner)
    nlp.begin_training()

    # Add entity ruler
    ruler = EntityRuler(nlp)
    patterns = [
        {"label": "MY_ORG", "pattern": "Apple"},
        {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
    ]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler, before="ner")  # works fine with "after"
    doc1 = nlp("What do you think about Apple ?")
    assert doc1.ents[0].label_ == "MY_ORG"

    with make_tempdir() as d:
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)

        nlp2 = load_model(output_dir)
        doc2 = nlp2("What do you think about Apple ?")
        assert doc2.ents[0].label_ == "MY_ORG"


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4042_bug2():
    """
    Test that serialization of an NER works fine when new labels were added.
    This is the second bug of two bugs underlying the issue 4042.
    """
    nlp1 = English()
    vocab = nlp1.vocab

    # add ner pipe
    ner1 = nlp1.create_pipe("ner")
    ner1.add_label("SOME_LABEL")
    nlp1.add_pipe(ner1)
    nlp1.begin_training()

    # add a new label to the doc
    doc1 = nlp1("What do you think about Apple ?")
    assert len(ner1.labels) == 1
    assert "SOME_LABEL" in ner1.labels
    apple_ent = Span(doc1, 5, 6, label="MY_ORG")
    doc1.ents = list(doc1.ents) + [apple_ent]

    # reapply the NER - at this point it should resize itself
    ner1(doc1)
    assert len(ner1.labels) == 2
    assert "SOME_LABEL" in ner1.labels
    assert "MY_ORG" in ner1.labels

    with make_tempdir() as d:
        # assert IO goes fine
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        ner1.to_disk(output_dir)

        config = {
            "learn_tokens": False,
            "min_action_freq": 30,
            "beam_width": 1,
            "beam_update_prob": 1.0,
        }
        ner2 = EntityRecognizer(vocab, default_ner(), **config)
        ner2.from_disk(output_dir)
        assert len(ner2.labels) == 2


def test_issue4054(en_vocab):
    """Test that a new blank model can be made with a vocab from file,
    and that serialization does not drop the language at any point."""
    nlp1 = English()
    vocab1 = nlp1.vocab
    with make_tempdir() as d:
        vocab_dir = ensure_path(d / "vocab")
        if not vocab_dir.exists():
            vocab_dir.mkdir()
        vocab1.to_disk(vocab_dir)
        vocab2 = Vocab().from_disk(vocab_dir)
        print("lang", vocab2.lang)
        nlp2 = spacy.blank("en", vocab=vocab2)
        nlp_dir = ensure_path(d / "nlp")
        if not nlp_dir.exists():
            nlp_dir.mkdir()
        nlp2.to_disk(nlp_dir)
        nlp3 = load_model(nlp_dir)
        assert nlp3.lang == "en"


def test_issue4120(en_vocab):
    """Test that matches without a final {OP: ?} token are returned."""
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]])
    doc1 = Doc(en_vocab, words=["a"])
    assert len(matcher(doc1)) == 1  # works
    doc2 = Doc(en_vocab, words=["a", "b", "c"])
    assert len(matcher(doc2)) == 2  # fixed
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]])
    doc3 = Doc(en_vocab, words=["a", "b", "b", "c"])
    assert len(matcher(doc3)) == 2  # works
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]])
    doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
    assert len(matcher(doc4)) == 3  # fixed


def test_issue4133(en_vocab):
    nlp = English()
    vocab_bytes = nlp.vocab.to_bytes()
    words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
    pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
    doc = Doc(en_vocab, words=words)
    for i, token in enumerate(doc):
        token.pos_ = pos[i]
    # usually this is already True when starting from proper models instead of blank English
    doc.is_tagged = True
    doc_bytes = doc.to_bytes()
    vocab = Vocab()
    vocab = vocab.from_bytes(vocab_bytes)
    doc = Doc(vocab).from_bytes(doc_bytes)
    actual = []
    for token in doc:
        actual.append(token.pos_)
    assert actual == pos
def test_issue4190():
    def customize_tokenizer(nlp):
        prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
        suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
        infix_re = compile_infix_regex(nlp.Defaults.infixes)
        # Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
        exceptions = {
            k: v
            for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
            if not (len(k) == 2 and k[1] == ".")
        }
        new_tokenizer = Tokenizer(
            nlp.vocab,
            exceptions,
            prefix_search=prefix_re.search,
            suffix_search=suffix_re.search,
            infix_finditer=infix_re.finditer,
            token_match=nlp.tokenizer.token_match,
        )
        nlp.tokenizer = new_tokenizer

    test_string = "Test c."
    # Load default language
    nlp_1 = English()
    doc_1a = nlp_1(test_string)
    result_1a = [token.text for token in doc_1a]  # noqa: F841
    # Modify tokenizer
    customize_tokenizer(nlp_1)
    doc_1b = nlp_1(test_string)
    result_1b = [token.text for token in doc_1b]
    # Save and Reload
    with make_tempdir() as model_dir:
        nlp_1.to_disk(model_dir)
        nlp_2 = load_model(model_dir)
    # This should be the modified tokenizer
    doc_2 = nlp_2(test_string)
    result_2 = [token.text for token in doc_2]
    assert result_1b == result_2


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4267():
    """ Test that running an entity_ruler after ner gives consistent results"""
    nlp = English()
    ner = nlp.create_pipe("ner")
    ner.add_label("PEOPLE")
    nlp.add_pipe(ner)
    nlp.begin_training()
    assert "ner" in nlp.pipe_names
    # assert that we have correct IOB annotations
    doc1 = nlp("hi")
    assert doc1.is_nered
    for token in doc1:
        assert token.ent_iob == 2
    # add entity ruler and run again
    ruler = EntityRuler(nlp)
    patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    assert "entity_ruler" in nlp.pipe_names
    assert "ner" in nlp.pipe_names
    # assert that we still have correct IOB annotations
    doc2 = nlp("hi")
    assert doc2.is_nered
    for token in doc2:
        assert token.ent_iob == 2


def test_issue4272():
    """Test that lookup table can be accessed from Token.lemma if no POS tags
    are available."""
    nlp = Greek()
    doc = nlp("Χθες")
    assert doc[0].lemma_


def test_multiple_predictions():
    class DummyPipe(Pipe):
        def __init__(self):
            self.model = "dummy_model"

        def predict(self, docs):
            return ([1, 2, 3], [4, 5, 6])

        def set_annotations(self, docs, scores, tensors=None):
            return docs

    nlp = Language()
    doc = nlp.make_doc("foo")
    dummy_pipe = DummyPipe()
    dummy_pipe(doc)


@pytest.mark.skip(reason="removed Beam stuff during the Example/GoldParse refactor")
def test_issue4313():
    """ This should not crash or exit with some strange error code """
    beam_width = 16
    beam_density = 0.0001
    nlp = English()
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "beam_width": 1,
        "beam_update_prob": 1.0,
    }
    ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
    ner.add_label("SOME_LABEL")
    ner.begin_training([])
    nlp.add_pipe(ner)

    # add a new label to the doc
    doc = nlp("What do you think about Apple ?")
    assert len(ner.labels) == 1
    assert "SOME_LABEL" in ner.labels
    apple_ent = Span(doc, 5, 6, label="MY_ORG")
|
||||||
|
doc.ents = list(doc.ents) + [apple_ent]
|
||||||
|
|
||||||
|
# ensure the beam_parse still works with the new label
|
||||||
|
docs = [doc]
|
||||||
|
beams = nlp.entity.beam_parse(
|
||||||
|
docs, beam_width=beam_width, beam_density=beam_density
|
||||||
|
)
|
||||||
|
|
||||||
|
for doc, beam in zip(docs, beams):
|
||||||
|
entity_scores = defaultdict(float)
|
||||||
|
for score, ents in nlp.entity.moves.get_beam_parses(beam):
|
||||||
|
for start, end, label in ents:
|
||||||
|
entity_scores[(start, end, label)] += score
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::UserWarning")
|
||||||
|
def test_issue4348():
|
||||||
|
"""Test that training the tagger with empty data, doesn't throw errors"""
|
||||||
|
nlp = English()
|
||||||
|
example = Example.from_dict(nlp.make_doc(""), {"tags": []})
|
||||||
|
TRAIN_DATA = [example, example]
|
||||||
|
tagger = nlp.create_pipe("tagger")
|
||||||
|
nlp.add_pipe(tagger)
|
||||||
|
optimizer = nlp.begin_training()
|
||||||
|
for i in range(5):
|
||||||
|
losses = {}
|
||||||
|
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
|
||||||
|
for batch in batches:
|
||||||
|
nlp.update(batch, sgd=optimizer, losses=losses)
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4367():
|
||||||
|
"""Test that docbin init goes well"""
|
||||||
|
DocBin()
|
||||||
|
DocBin(attrs=["LEMMA"])
|
||||||
|
DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4373():
|
||||||
|
"""Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
|
||||||
|
matcher = Matcher(Vocab())
|
||||||
|
assert isinstance(matcher.vocab, Vocab)
|
||||||
|
matcher = PhraseMatcher(Vocab())
|
||||||
|
assert isinstance(matcher.vocab, Vocab)
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue4402():
|
||||||
|
json_data = {
|
||||||
|
"id": 0,
|
||||||
|
"paragraphs": [
|
||||||
|
{
|
||||||
|
"raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
|
||||||
|
"sentences": [
|
||||||
|
{
|
||||||
|
"tokens": [
|
||||||
|
{"id": 0, "orth": "How", "ner": "O"},
|
||||||
|
{"id": 1, "orth": "should", "ner": "O"},
|
||||||
|
{"id": 2, "orth": "I", "ner": "O"},
|
||||||
|
{"id": 3, "orth": "cook", "ner": "O"},
|
||||||
|
{"id": 4, "orth": "bacon", "ner": "O"},
|
||||||
|
{"id": 5, "orth": "in", "ner": "O"},
|
||||||
|
{"id": 6, "orth": "an", "ner": "O"},
|
||||||
|
{"id": 7, "orth": "oven", "ner": "O"},
|
||||||
|
{"id": 8, "orth": "?", "ner": "O"},
|
||||||
|
],
|
||||||
|
"brackets": [],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"tokens": [
|
||||||
|
{"id": 9, "orth": "\n", "ner": "O"},
|
||||||
|
{"id": 10, "orth": "I", "ner": "O"},
|
||||||
|
{"id": 11, "orth": "'ve", "ner": "O"},
|
||||||
|
{"id": 12, "orth": "heard", "ner": "O"},
|
||||||
|
{"id": 13, "orth": "of", "ner": "O"},
|
||||||
|
{"id": 14, "orth": "people", "ner": "O"},
|
||||||
|
{"id": 15, "orth": "cooking", "ner": "O"},
|
||||||
|
{"id": 16, "orth": "bacon", "ner": "O"},
|
||||||
|
{"id": 17, "orth": "in", "ner": "O"},
|
||||||
|
{"id": 18, "orth": "an", "ner": "O"},
|
||||||
|
{"id": 19, "orth": "oven", "ner": "O"},
|
||||||
|
{"id": 20, "orth": ".", "ner": "O"},
|
||||||
|
],
|
||||||
|
"brackets": [],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
"cats": [
|
||||||
|
{"label": "baking", "value": 1.0},
|
||||||
|
{"label": "not_baking", "value": 0.0},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"raw": "What is the difference between white and brown eggs?\n",
|
||||||
|
"sentences": [
|
||||||
|
{
|
||||||
|
"tokens": [
|
||||||
|
{"id": 0, "orth": "What", "ner": "O"},
|
||||||
|
{"id": 1, "orth": "is", "ner": "O"},
|
||||||
|
{"id": 2, "orth": "the", "ner": "O"},
|
||||||
|
{"id": 3, "orth": "difference", "ner": "O"},
|
||||||
|
{"id": 4, "orth": "between", "ner": "O"},
|
||||||
|
{"id": 5, "orth": "white", "ner": "O"},
|
||||||
|
{"id": 6, "orth": "and", "ner": "O"},
|
||||||
|
{"id": 7, "orth": "brown", "ner": "O"},
|
||||||
|
{"id": 8, "orth": "eggs", "ner": "O"},
|
||||||
|
{"id": 9, "orth": "?", "ner": "O"},
|
||||||
|
],
|
||||||
|
"brackets": [],
|
||||||
|
},
|
||||||
|
{"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
|
||||||
|
],
|
||||||
|
"cats": [
|
||||||
|
{"label": "baking", "value": 0.0},
|
||||||
|
{"label": "not_baking", "value": 1.0},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
nlp = English()
|
||||||
|
attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
|
||||||
|
with make_tempdir() as tmpdir:
|
||||||
|
output_file = tmpdir / "test4402.spacy"
|
||||||
|
docs = json2docs([json_data])
|
||||||
|
data = DocBin(docs=docs, attrs=attrs).to_bytes()
|
||||||
|
with output_file.open("wb") as file_:
|
||||||
|
file_.write(data)
|
||||||
|
corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
|
||||||
|
|
||||||
|
train_data = list(corpus.train_dataset(nlp))
|
||||||
|
assert len(train_data) == 2
|
||||||
|
|
||||||
|
split_train_data = []
|
||||||
|
for eg in train_data:
|
||||||
|
split_train_data.extend(eg.split_sents())
|
||||||
|
assert len(split_train_data) == 4
|
|
@@ -1,23 +0,0 @@
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc


def test_issue4002(en_vocab):
    """Test that the PhraseMatcher can match on overwritten NORM attributes.
    """
    matcher = PhraseMatcher(en_vocab, attr="NORM")
    pattern1 = Doc(en_vocab, words=["c", "d"])
    assert [t.norm_ for t in pattern1] == ["c", "d"]
    matcher.add("TEST", [pattern1])
    doc = Doc(en_vocab, words=["a", "b", "c", "d"])
    assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
    matches = matcher(doc)
    assert len(matches) == 1
    matcher = PhraseMatcher(en_vocab, attr="NORM")
    pattern2 = Doc(en_vocab, words=["1", "2"])
    pattern2[0].norm_ = "c"
    pattern2[1].norm_ = "d"
    assert [t.norm_ for t in pattern2] == ["c", "d"]
    matcher.add("TEST", [pattern2])
    matches = matcher(doc)
    assert len(matches) == 1
@@ -1,50 +0,0 @@
import spacy
from spacy.util import minibatch
from thinc.api import compounding
from spacy.gold import Example


def test_issue4030():
    """ Test whether textcat works fine with empty doc """
    unique_classes = ["offensive", "inoffensive"]
    x_train = [
        "This is an offensive text",
        "This is the second offensive text",
        "inoff",
    ]
    y_train = ["offensive", "offensive", "inoffensive"]

    nlp = spacy.blank("en")

    # preparing the data
    train_data = []
    for text, train_instance in zip(x_train, y_train):
        cat_dict = {label: label == train_instance for label in unique_classes}
        train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))

    # add a text categorizer component
    textcat = nlp.create_pipe(
        "textcat",
        config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
    )

    for label in unique_classes:
        textcat.add_label(label)
    nlp.add_pipe(textcat, last=True)

    # training the network
    with nlp.select_pipes(enable="textcat"):
        optimizer = nlp.begin_training()
        for i in range(3):
            losses = {}
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))

            for batch in batches:
                nlp.update(
                    examples=batch, sgd=optimizer, drop=0.1, losses=losses,
                )

    # processing of an empty doc should result in 0.0 for all categories
    doc = nlp("")
    assert doc.cats["offensive"] == 0.0
    assert doc.cats["inoffensive"] == 0.0
@@ -1,85 +0,0 @@
import spacy
from spacy.pipeline import EntityRecognizer, EntityRuler
from spacy.lang.en import English
from spacy.tokens import Span
from spacy.util import ensure_path
from spacy.pipeline.defaults import default_ner

from ..util import make_tempdir


def test_issue4042():
    """Test that serialization of an EntityRuler before NER works fine."""
    nlp = English()

    # add ner pipe
    ner = nlp.create_pipe("ner")
    ner.add_label("SOME_LABEL")
    nlp.add_pipe(ner)
    nlp.begin_training()

    # Add entity ruler
    ruler = EntityRuler(nlp)
    patterns = [
        {"label": "MY_ORG", "pattern": "Apple"},
        {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
    ]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler, before="ner")  # works fine with "after"
    doc1 = nlp("What do you think about Apple ?")
    assert doc1.ents[0].label_ == "MY_ORG"

    with make_tempdir() as d:
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)

        nlp2 = spacy.load(output_dir)
        doc2 = nlp2("What do you think about Apple ?")
        assert doc2.ents[0].label_ == "MY_ORG"


def test_issue4042_bug2():
    """
    Test that serialization of an NER works fine when new labels were added.
    This is the second bug of two bugs underlying the issue 4042.
    """
    nlp1 = English()
    vocab = nlp1.vocab

    # add ner pipe
    ner1 = nlp1.create_pipe("ner")
    ner1.add_label("SOME_LABEL")
    nlp1.add_pipe(ner1)
    nlp1.begin_training()

    # add a new label to the doc
    doc1 = nlp1("What do you think about Apple ?")
    assert len(ner1.labels) == 1
    assert "SOME_LABEL" in ner1.labels
    apple_ent = Span(doc1, 5, 6, label="MY_ORG")
    doc1.ents = list(doc1.ents) + [apple_ent]

    # reapply the NER - at this point it should resize itself
    ner1(doc1)
    assert len(ner1.labels) == 2
    assert "SOME_LABEL" in ner1.labels
    assert "MY_ORG" in ner1.labels

    with make_tempdir() as d:
        # assert IO goes fine
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        ner1.to_disk(output_dir)

        config = {
            "learn_tokens": False,
            "min_action_freq": 30,
            "beam_width": 1,
            "beam_update_prob": 1.0,
        }
        ner2 = EntityRecognizer(vocab, default_ner(), **config)
        ner2.from_disk(output_dir)
        assert len(ner2.labels) == 2
@@ -1,30 +0,0 @@
from spacy.vocab import Vocab
import spacy
from spacy.lang.en import English
from spacy.util import ensure_path

from ..util import make_tempdir


def test_issue4054(en_vocab):
    """Test that a new blank model can be made with a vocab from file,
    and that serialization does not drop the language at any point."""
    nlp1 = English()
    vocab1 = nlp1.vocab

    with make_tempdir() as d:
        vocab_dir = ensure_path(d / "vocab")
        if not vocab_dir.exists():
            vocab_dir.mkdir()
        vocab1.to_disk(vocab_dir)

        vocab2 = Vocab().from_disk(vocab_dir)
        print("lang", vocab2.lang)
        nlp2 = spacy.blank("en", vocab=vocab2)

        nlp_dir = ensure_path(d / "nlp")
        if not nlp_dir.exists():
            nlp_dir.mkdir()
        nlp2.to_disk(nlp_dir)
        nlp3 = spacy.load(nlp_dir)
        assert nlp3.lang == "en"
@@ -1,23 +0,0 @@
from spacy.matcher import Matcher
from spacy.tokens import Doc


def test_issue4120(en_vocab):
    """Test that matches without a final {OP: ?} token are returned."""
    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]])
    doc1 = Doc(en_vocab, words=["a"])
    assert len(matcher(doc1)) == 1  # works

    doc2 = Doc(en_vocab, words=["a", "b", "c"])
    assert len(matcher(doc2)) == 2  # fixed

    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]])
    doc3 = Doc(en_vocab, words=["a", "b", "b", "c"])
    assert len(matcher(doc3)) == 2  # works

    matcher = Matcher(en_vocab)
    matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]])
    doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
    assert len(matcher(doc4)) == 3  # fixed
@@ -1,28 +0,0 @@
from spacy.lang.en import English
from spacy.tokens import Doc
from spacy.vocab import Vocab


def test_issue4133(en_vocab):
    nlp = English()
    vocab_bytes = nlp.vocab.to_bytes()
    words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
    pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
    doc = Doc(en_vocab, words=words)
    for i, token in enumerate(doc):
        token.pos_ = pos[i]

    # usually this is already True when starting from proper models instead of blank English
    doc.is_tagged = True

    doc_bytes = doc.to_bytes()

    vocab = Vocab()
    vocab = vocab.from_bytes(vocab_bytes)
    doc = Doc(vocab).from_bytes(doc_bytes)

    actual = []
    for token in doc:
        actual.append(token.pos_)

    assert actual == pos
@@ -1,46 +0,0 @@
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
from spacy import util

from ..util import make_tempdir


def test_issue4190():
    test_string = "Test c."
    # Load default language
    nlp_1 = English()
    doc_1a = nlp_1(test_string)
    result_1a = [token.text for token in doc_1a]  # noqa: F841
    # Modify tokenizer
    customize_tokenizer(nlp_1)
    doc_1b = nlp_1(test_string)
    result_1b = [token.text for token in doc_1b]
    # Save and Reload
    with make_tempdir() as model_dir:
        nlp_1.to_disk(model_dir)
        nlp_2 = util.load_model(model_dir)
    # This should be the modified tokenizer
    doc_2 = nlp_2(test_string)
    result_2 = [token.text for token in doc_2]
    assert result_1b == result_2


def customize_tokenizer(nlp):
    prefix_re = util.compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = util.compile_suffix_regex(nlp.Defaults.suffixes)
    infix_re = util.compile_infix_regex(nlp.Defaults.infixes)
    # Remove all exceptions where a single letter is followed by a period (e.g. 'h.')
    exceptions = {
        k: v
        for k, v in dict(nlp.Defaults.tokenizer_exceptions).items()
        if not (len(k) == 2 and k[1] == ".")
    }
    new_tokenizer = Tokenizer(
        nlp.vocab,
        exceptions,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
        token_match=nlp.tokenizer.token_match,
    )
    nlp.tokenizer = new_tokenizer
@@ -1,34 +0,0 @@
from spacy.lang.en import English
from spacy.pipeline import EntityRuler


def test_issue4267():
    """ Test that running an entity_ruler after ner gives consistent results"""
    nlp = English()
    ner = nlp.create_pipe("ner")
    ner.add_label("PEOPLE")
    nlp.add_pipe(ner)
    nlp.begin_training()

    assert "ner" in nlp.pipe_names

    # assert that we have correct IOB annotations
    doc1 = nlp("hi")
    assert doc1.is_nered
    for token in doc1:
        assert token.ent_iob == 2

    # add entity ruler and run again
    ruler = EntityRuler(nlp)
    patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]

    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    assert "entity_ruler" in nlp.pipe_names
    assert "ner" in nlp.pipe_names

    # assert that we still have correct IOB annotations
    doc2 = nlp("hi")
    assert doc2.is_nered
    for token in doc2:
        assert token.ent_iob == 2
@@ -1,9 +0,0 @@
from spacy.lang.el import Greek


def test_issue4272():
    """Test that lookup table can be accessed from Token.lemma if no POS tags
    are available."""
    nlp = Greek()
    doc = nlp("Χθες")
    assert doc[0].lemma_
@@ -1,25 +0,0 @@
import pytest
from spacy.language import Language
from spacy.pipeline import Pipe


class DummyPipe(Pipe):
    def __init__(self):
        self.model = "dummy_model"

    def predict(self, docs):
        return ([1, 2, 3], [4, 5, 6])

    def set_annotations(self, docs, scores, tensors=None):
        return docs


@pytest.fixture
def nlp():
    return Language()


def test_multiple_predictions(nlp):
    doc = nlp.make_doc("foo")
    dummy_pipe = DummyPipe()
    dummy_pipe(doc)
@@ -1,47 +0,0 @@
from collections import defaultdict

import pytest

from spacy.pipeline.defaults import default_ner
from spacy.pipeline import EntityRecognizer

from spacy.lang.en import English
from spacy.tokens import Span


# skipped after removing Beam stuff during the Example/GoldParse refactor
@pytest.mark.skip
def test_issue4313():
    """ This should not crash or exit with some strange error code """
    beam_width = 16
    beam_density = 0.0001
    nlp = English()
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "beam_width": 1,
        "beam_update_prob": 1.0,
    }
    ner = EntityRecognizer(nlp.vocab, default_ner(), **config)
    ner.add_label("SOME_LABEL")
    ner.begin_training([])
    nlp.add_pipe(ner)

    # add a new label to the doc
    doc = nlp("What do you think about Apple ?")
    assert len(ner.labels) == 1
    assert "SOME_LABEL" in ner.labels
    apple_ent = Span(doc, 5, 6, label="MY_ORG")
    doc.ents = list(doc.ents) + [apple_ent]

    # ensure the beam_parse still works with the new label
    docs = [doc]
    beams = nlp.entity.beam_parse(
        docs, beam_width=beam_width, beam_density=beam_density
    )

    for doc, beam in zip(docs, beams):
        entity_scores = defaultdict(float)
        for score, ents in nlp.entity.moves.get_beam_parses(beam):
            for start, end, label in ents:
                entity_scores[(start, end, label)] += score
@@ -1,24 +0,0 @@
from spacy.gold import Example
from spacy.lang.en import English
from spacy.util import minibatch
from thinc.api import compounding
import pytest


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4348():
    """Test that training the tagger with empty data, doesn't throw errors"""

    nlp = English()
    example = Example.from_dict(nlp.make_doc(""), {"tags": []})
    TRAIN_DATA = [example, example]

    tagger = nlp.create_pipe("tagger")
    nlp.add_pipe(tagger)

    optimizer = nlp.begin_training()
    for i in range(5):
        losses = {}
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            nlp.update(batch, sgd=optimizer, losses=losses)
@@ -1,8 +0,0 @@
from spacy.tokens import DocBin


def test_issue4367():
    """Test that docbin init goes well"""
    DocBin()
    DocBin(attrs=["LEMMA"])
    DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
@@ -1,10 +0,0 @@
from spacy.matcher import Matcher, PhraseMatcher
from spacy.vocab import Vocab


def test_issue4373():
    """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
    matcher = Matcher(Vocab())
    assert isinstance(matcher.vocab, Vocab)
    matcher = PhraseMatcher(Vocab())
    assert isinstance(matcher.vocab, Vocab)
@@ -1,98 +0,0 @@
from spacy.gold import Corpus
from spacy.lang.en import English

from ..util import make_tempdir
from ...gold.converters import json2docs
from ...tokens import DocBin


def test_issue4402():
    nlp = English()
    attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
    with make_tempdir() as tmpdir:
        output_file = tmpdir / "test4402.spacy"
        docs = json2docs([json_data])
        data = DocBin(docs=docs, attrs=attrs).to_bytes()
        with output_file.open("wb") as file_:
            file_.write(data)
        corpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))

        train_data = list(corpus.train_dataset(nlp))
        assert len(train_data) == 2

        split_train_data = []
        for eg in train_data:
            split_train_data.extend(eg.split_sents())
        assert len(split_train_data) == 4


json_data = {
    "id": 0,
    "paragraphs": [
        {
            "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
            "sentences": [
                {
                    "tokens": [
                        {"id": 0, "orth": "How", "ner": "O"},
                        {"id": 1, "orth": "should", "ner": "O"},
                        {"id": 2, "orth": "I", "ner": "O"},
                        {"id": 3, "orth": "cook", "ner": "O"},
                        {"id": 4, "orth": "bacon", "ner": "O"},
                        {"id": 5, "orth": "in", "ner": "O"},
                        {"id": 6, "orth": "an", "ner": "O"},
                        {"id": 7, "orth": "oven", "ner": "O"},
                        {"id": 8, "orth": "?", "ner": "O"},
                    ],
                    "brackets": [],
                },
                {
                    "tokens": [
                        {"id": 9, "orth": "\n", "ner": "O"},
                        {"id": 10, "orth": "I", "ner": "O"},
                        {"id": 11, "orth": "'ve", "ner": "O"},
                        {"id": 12, "orth": "heard", "ner": "O"},
                        {"id": 13, "orth": "of", "ner": "O"},
                        {"id": 14, "orth": "people", "ner": "O"},
                        {"id": 15, "orth": "cooking", "ner": "O"},
                        {"id": 16, "orth": "bacon", "ner": "O"},
                        {"id": 17, "orth": "in", "ner": "O"},
                        {"id": 18, "orth": "an", "ner": "O"},
                        {"id": 19, "orth": "oven", "ner": "O"},
                        {"id": 20, "orth": ".", "ner": "O"},
                    ],
                    "brackets": [],
                },
            ],
            "cats": [
                {"label": "baking", "value": 1.0},
                {"label": "not_baking", "value": 0.0},
            ],
        },
        {
            "raw": "What is the difference between white and brown eggs?\n",
            "sentences": [
                {
                    "tokens": [
                        {"id": 0, "orth": "What", "ner": "O"},
                        {"id": 1, "orth": "is", "ner": "O"},
                        {"id": 2, "orth": "the", "ner": "O"},
                        {"id": 3, "orth": "difference", "ner": "O"},
                        {"id": 4, "orth": "between", "ner": "O"},
                        {"id": 5, "orth": "white", "ner": "O"},
                        {"id": 6, "orth": "and", "ner": "O"},
                        {"id": 7, "orth": "brown", "ner": "O"},
                        {"id": 8, "orth": "eggs", "ner": "O"},
                        {"id": 9, "orth": "?", "ner": "O"},
                    ],
                    "brackets": [],
                },
                {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
            ],
            "cats": [
                {"label": "baking", "value": 0.0},
                {"label": "not_baking", "value": 1.0},
            ],
        },
    ],
}
288
spacy/tests/regression/test_issue4501-5000.py
Normal file
@@ -0,0 +1,288 @@
import pytest
from mock import Mock
from spacy.pipeline import EntityRuler
from spacy.matcher import DependencyMatcher
from spacy.tokens import Doc, Span, DocBin
from spacy.gold import Example
from spacy.gold.converters.conllu2docs import conllu2docs
from spacy.lang.en import English
from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab
from spacy.language import Language
from spacy.util import ensure_path, load_model_from_path
import numpy
import pickle

from ..util import get_doc, make_tempdir


def test_issue4528(en_vocab):
    """Test that user_data is correctly serialized in DocBin."""
    doc = Doc(en_vocab, words=["hello", "world"])
    doc.user_data["foo"] = "bar"
    # This is how extension attribute values are stored in the user data
    doc.user_data[("._.", "foo", None, None)] = "bar"
    doc_bin = DocBin(store_user_data=True)
    doc_bin.add(doc)
    doc_bin_bytes = doc_bin.to_bytes()
    new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes)
    new_doc = list(new_doc_bin.get_docs(en_vocab))[0]
    assert new_doc.user_data["foo"] == "bar"
    assert new_doc.user_data[("._.", "foo", None, None)] == "bar"


@pytest.mark.parametrize(
    "text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])]
)
def test_gold_misaligned(en_tokenizer, text, words):
    doc = en_tokenizer(text)
    Example.from_dict(doc, {"words": words})


def test_issue4590(en_vocab):
    """Test that matches param in on_match method are the same as matches run with no on_match method"""
    pattern = [
        {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
        {
            "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
        {
            "SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
    ]

    on_match = Mock()
    matcher = DependencyMatcher(en_vocab)
    matcher.add("pattern", on_match, pattern)
    text = "The quick brown fox jumped over the lazy fox"
    heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
    deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]
    doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
    matches = matcher(doc)
    on_match_args = on_match.call_args
    assert on_match_args[0][3] == matches


def test_issue4651_with_phrase_matcher_attr():
    """Test that the EntityRuler PhraseMatcher is deserialize correctly using
    the method from_disk when the EntityRuler argument phrase_matcher_attr is
    specified.
    """
    text = "Spacy is a python library for nlp"
    nlp = English()
    ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER")
    patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    doc = nlp(text)
    res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
    nlp_reloaded = English()
    with make_tempdir() as d:
        file_path = d / "entityruler"
        ruler.to_disk(file_path)
        ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
    nlp_reloaded.add_pipe(ruler_reloaded)
    doc_reloaded = nlp_reloaded(text)
    res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
    assert res == res_reloaded


def test_issue4651_without_phrase_matcher_attr():
    """Test that the EntityRuler PhraseMatcher is deserialize correctly using
    the method from_disk when the EntityRuler argument phrase_matcher_attr is
    not specified.
    """
    text = "Spacy is a python library for nlp"
    nlp = English()
    ruler = EntityRuler(nlp)
    patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    doc = nlp(text)
    res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]
    nlp_reloaded = English()
    with make_tempdir() as d:
        file_path = d / "entityruler"
        ruler.to_disk(file_path)
        ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)
    nlp_reloaded.add_pipe(ruler_reloaded)
    doc_reloaded = nlp_reloaded(text)
    res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]
    assert res == res_reloaded


def test_issue4665():
    """
    conllu2json should not raise an exception if the HEAD column contains an
    underscore
    """
    input_data = """
1   [   _   PUNCT   -LRB-   _   _   punct   _   _
2   This    _   DET DT  _   _   det _   _
3   killing _   NOUN    NN  _   _   nsubj   _   _
4   of  _   ADP IN  _   _   case    _   _
5   a   _   DET DT  _   _   det _   _
6   respected   _   ADJ JJ  _   _   amod    _   _
7   cleric  _   NOUN    NN  _   _   nmod    _   _
8   will    _   AUX MD  _   _   aux _   _
9   be  _   AUX VB  _   _   aux _   _
10  causing _   VERB    VBG _   _   root    _   _
11  us  _   PRON    PRP _   _   iobj    _   _
12  trouble _   NOUN    NN  _   _   dobj    _   _
13  for _   ADP IN  _   _   case    _   _
14  years   _   NOUN    NNS _   _   nmod    _   _
15  to  _   PART    TO  _   _   mark    _   _
16  come    _   VERB    VB  _   _   acl _   _
17  .   _   PUNCT   .   _   _   punct   _   _
18  ]   _   PUNCT   -RRB-   _   _   punct   _   _
"""
    conllu2docs(input_data)


def test_issue4674():
    """Test that setting entities with overlapping identifiers does not mess up IO"""
    nlp = English()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    vector1 = [0.9, 1.1, 1.01]
    vector2 = [1.8, 2.25, 2.01]
    with pytest.warns(UserWarning):
        kb.set_entities(
            entity_list=["Q1", "Q1"],
            freq_list=[32, 111],
            vector_list=[vector1, vector2],
        )
    assert kb.get_size_entities() == 1
    # dumping to file & loading back in
    with make_tempdir() as d:
        dir_path = ensure_path(d)
        if not dir_path.exists():
            dir_path.mkdir()
        file_path = dir_path / "kb"
        kb.dump(str(file_path))
        kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
        kb2.load_bulk(str(file_path))
        assert kb2.get_size_entities() == 1


def test_issue4707():
    """Tests that disabled component names are also excluded from nlp.from_disk
    by default when loading a model.
    """
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    nlp.add_pipe(nlp.create_pipe("entity_ruler"))
    assert nlp.pipe_names == ["sentencizer", "entity_ruler"]
    exclude = ["tokenizer", "sentencizer"]
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir, exclude=exclude)
        new_nlp = load_model_from_path(tmpdir, disable=exclude)
    assert "sentencizer" not in new_nlp.pipe_names
    assert "entity_ruler" in new_nlp.pipe_names


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4725_1():
    """ Ensure the pickling of the NER goes well"""
    vocab = Vocab(vectors_name="test_vocab_add_vector")
    nlp = English(vocab=vocab)
    ner = nlp.create_pipe("ner", config={"min_action_freq": 342})
    with make_tempdir() as tmp_path:
        with (tmp_path / "ner.pkl").open("wb") as file_:
            pickle.dump(ner, file_)
            assert ner.cfg["min_action_freq"] == 342

        with (tmp_path / "ner.pkl").open("rb") as file_:
            ner2 = pickle.load(file_)
            assert ner2.cfg["min_action_freq"] == 342


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4725_2():
    # ensures that this runs correctly and doesn't hang or crash because of the global vectors
    # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows)
    vocab = Vocab(vectors_name="test_vocab_add_vector")
    data = numpy.ndarray((5, 3), dtype="f")
    data[0] = 1.0
    data[1] = 2.0
    vocab.set_vector("cat", data[0])
    vocab.set_vector("dog", data[1])
    nlp = English(vocab=vocab)
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    nlp.begin_training()
    docs = ["Kurt is in London."] * 10
    for _ in nlp.pipe(docs, batch_size=2, n_process=2):
        pass


def test_issue4849():
    nlp = English()
    ruler = EntityRuler(
        nlp,
        patterns=[
            {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
            {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
        ],
        phrase_matcher_attr="LOWER",
    )
    nlp.add_pipe(ruler)
    text = """
    The left is starting to take aim at Democratic front-runner Joe Biden.
    Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy."
    """
    # USING 1 PROCESS
    count_ents = 0
    for doc in nlp.pipe([text], n_process=1):
        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
    assert count_ents == 2
    # USING 2 PROCESSES
    count_ents = 0
    for doc in nlp.pipe([text], n_process=2):
        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
    assert count_ents == 2


class CustomPipe:
    name = "my_pipe"

    def __init__(self):
        Span.set_extension("my_ext", getter=self._get_my_ext)
        Doc.set_extension("my_ext", default=None)

    def __call__(self, doc):
        gathered_ext = []
        for sent in doc.sents:
            sent_ext = self._get_my_ext(sent)
            sent._.set("my_ext", sent_ext)
            gathered_ext.append(sent_ext)

        doc._.set("my_ext", "\n".join(gathered_ext))

        return doc

    @staticmethod
    def _get_my_ext(span):
        return str(span.end)


def test_issue4903():
    """Ensure that this runs correctly and doesn't hang or crash on Windows /
    macOS."""
    nlp = English()
    custom_component = CustomPipe()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    nlp.add_pipe(custom_component, after="sentencizer")

    text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
    docs = list(nlp.pipe(text, n_process=2))
    assert docs[0].text == "I like bananas."
    assert docs[1].text == "Do you like them?"
    assert docs[2].text == "No, I prefer wasabi."


def test_issue4924():
    nlp = Language()
    example = Example.from_dict(nlp.make_doc(""), {})
    nlp.evaluate([example])
@@ -1,16 +0,0 @@
from spacy.tokens import Doc, DocBin


def test_issue4528(en_vocab):
    """Test that user_data is correctly serialized in DocBin."""
    doc = Doc(en_vocab, words=["hello", "world"])
    doc.user_data["foo"] = "bar"
    # This is how extension attribute values are stored in the user data
    doc.user_data[("._.", "foo", None, None)] = "bar"
    doc_bin = DocBin(store_user_data=True)
    doc_bin.add(doc)
    doc_bin_bytes = doc_bin.to_bytes()
    new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes)
    new_doc = list(new_doc_bin.get_docs(en_vocab))[0]
    assert new_doc.user_data["foo"] == "bar"
    assert new_doc.user_data[("._.", "foo", None, None)] == "bar"
@@ -1,11 +0,0 @@
import pytest

from spacy.gold import Example


@pytest.mark.parametrize(
    "text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])]
)
def test_gold_misaligned(en_tokenizer, text, words):
    doc = en_tokenizer(text)
    Example.from_dict(doc, {"words": words})
@@ -1,35 +0,0 @@
from mock import Mock
from spacy.matcher import DependencyMatcher
from ..util import get_doc


def test_issue4590(en_vocab):
    """Test that matches param in on_match method are the same as matches run with no on_match method"""
    pattern = [
        {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
        {
            "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
        {
            "SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"},
            "PATTERN": {"ORTH": "fox"},
        },
    ]

    on_match = Mock()

    matcher = DependencyMatcher(en_vocab)
    matcher.add("pattern", on_match, pattern)

    text = "The quick brown fox jumped over the lazy fox"
    heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
    deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "det", "amod", "pobj"]

    doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)

    matches = matcher(doc)

    on_match_args = on_match.call_args

    assert on_match_args[0][3] == matches
@@ -1,62 +0,0 @@
from spacy.lang.en import English
from spacy.pipeline import EntityRuler

from ..util import make_tempdir


def test_issue4651_with_phrase_matcher_attr():
    """Test that the EntityRuler PhraseMatcher is deserialize correctly using
    the method from_disk when the EntityRuler argument phrase_matcher_attr is
    specified.
    """
    text = "Spacy is a python library for nlp"

    nlp = English()
    ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER")
    patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)

    doc = nlp(text)
    res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]

    nlp_reloaded = English()
    with make_tempdir() as d:
        file_path = d / "entityruler"
        ruler.to_disk(file_path)
        ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)

    nlp_reloaded.add_pipe(ruler_reloaded)
    doc_reloaded = nlp_reloaded(text)
    res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]

    assert res == res_reloaded


def test_issue4651_without_phrase_matcher_attr():
    """Test that the EntityRuler PhraseMatcher is deserialize correctly using
    the method from_disk when the EntityRuler argument phrase_matcher_attr is
    not specified.
    """
    text = "Spacy is a python library for nlp"

    nlp = English()
    ruler = EntityRuler(nlp)
    patterns = [{"label": "PYTHON_LIB", "pattern": "spacy", "id": "spaCy"}]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)

    doc = nlp(text)
    res = [(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents]

    nlp_reloaded = English()
    with make_tempdir() as d:
        file_path = d / "entityruler"
        ruler.to_disk(file_path)
        ruler_reloaded = EntityRuler(nlp_reloaded).from_disk(file_path)

    nlp_reloaded.add_pipe(ruler_reloaded)
    doc_reloaded = nlp_reloaded(text)
    res_reloaded = [(ent.text, ent.label_, ent.ent_id_) for ent in doc_reloaded.ents]

    assert res == res_reloaded
@@ -1,35 +0,0 @@
import pytest

# TODO
# from spacy.gold.converters.conllu2docs import conllu2docs

input_data = """
1   [   _   PUNCT   -LRB-   _   _   punct   _   _
2   This    _   DET DT  _   _   det _   _
3   killing _   NOUN    NN  _   _   nsubj   _   _
4   of  _   ADP IN  _   _   case    _   _
5   a   _   DET DT  _   _   det _   _
6   respected   _   ADJ JJ  _   _   amod    _   _
7   cleric  _   NOUN    NN  _   _   nmod    _   _
8   will    _   AUX MD  _   _   aux _   _
9   be  _   AUX VB  _   _   aux _   _
10  causing _   VERB    VBG _   _   root    _   _
11  us  _   PRON    PRP _   _   iobj    _   _
12  trouble _   NOUN    NN  _   _   dobj    _   _
13  for _   ADP IN  _   _   case    _   _
14  years   _   NOUN    NNS _   _   nmod    _   _
15  to  _   PART    TO  _   _   mark    _   _
16  come    _   VERB    VB  _   _   acl _   _
17  .   _   PUNCT   .   _   _   punct   _   _
18  ]   _   PUNCT   -RRB-   _   _   punct   _   _
"""


@pytest.mark.xfail
def test_issue4665():
    """
    conllu2json should not raise an exception if the HEAD column contains an
    underscore
    """
    pass
    # conllu2json(input_data)
@@ -1,36 +0,0 @@
import pytest
from spacy.kb import KnowledgeBase
from spacy.util import ensure_path
from spacy.lang.en import English

from ..util import make_tempdir


def test_issue4674():
    """Test that setting entities with overlapping identifiers does not mess up IO"""
    nlp = English()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)

    vector1 = [0.9, 1.1, 1.01]
    vector2 = [1.8, 2.25, 2.01]
    with pytest.warns(UserWarning):
        kb.set_entities(
            entity_list=["Q1", "Q1"],
            freq_list=[32, 111],
            vector_list=[vector1, vector2],
        )

    assert kb.get_size_entities() == 1

    # dumping to file & loading back in
    with make_tempdir() as d:
        dir_path = ensure_path(d)
        if not dir_path.exists():
            dir_path.mkdir()
        file_path = dir_path / "kb"
        kb.dump(str(file_path))

        kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
        kb2.load_bulk(str(file_path))

        assert kb2.get_size_entities() == 1
@@ -1,20 +0,0 @@
from spacy.util import load_model_from_path
from spacy.lang.en import English

from ..util import make_tempdir


def test_issue4707():
    """Tests that disabled component names are also excluded from nlp.from_disk
    by default when loading a model.
    """
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    nlp.add_pipe(nlp.create_pipe("entity_ruler"))
    assert nlp.pipe_names == ["sentencizer", "entity_ruler"]
    exclude = ["tokenizer", "sentencizer"]
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir, exclude=exclude)
        new_nlp = load_model_from_path(tmpdir, disable=exclude)
    assert "sentencizer" not in new_nlp.pipe_names
    assert "entity_ruler" in new_nlp.pipe_names
@@ -1,41 +0,0 @@
import pickle
import numpy

from spacy.lang.en import English
from spacy.vocab import Vocab

from spacy.tests.util import make_tempdir


def test_pickle_ner():
    """ Ensure the pickling of the NER goes well"""
    vocab = Vocab(vectors_name="test_vocab_add_vector")
    nlp = English(vocab=vocab)
    ner = nlp.create_pipe("ner", config={"min_action_freq": 342})
    with make_tempdir() as tmp_path:
        with (tmp_path / "ner.pkl").open("wb") as file_:
            pickle.dump(ner, file_)
            assert ner.cfg["min_action_freq"] == 342

        with (tmp_path / "ner.pkl").open("rb") as file_:
            ner2 = pickle.load(file_)
            assert ner2.cfg["min_action_freq"] == 342


def test_issue4725():
    # ensures that this runs correctly and doesn't hang or crash because of the global vectors
    # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows)
    vocab = Vocab(vectors_name="test_vocab_add_vector")
    data = numpy.ndarray((5, 3), dtype="f")
    data[0] = 1.0
    data[1] = 2.0
    vocab.set_vector("cat", data[0])
    vocab.set_vector("dog", data[1])

    nlp = English(vocab=vocab)
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    nlp.begin_training()
    docs = ["Kurt is in London."] * 10
    for _ in nlp.pipe(docs, batch_size=2, n_process=2):
        pass
@@ -1,34 +0,0 @@
from spacy.lang.en import English
from spacy.pipeline import EntityRuler


def test_issue4849():
    nlp = English()

    ruler = EntityRuler(
        nlp,
        patterns=[
            {"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
            {"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
        ],
        phrase_matcher_attr="LOWER",
    )

    nlp.add_pipe(ruler)

    text = """
    The left is starting to take aim at Democratic front-runner Joe Biden.
    Sen. Bernie Sanders joined in her criticism: "There is no 'middle ground' when it comes to climate policy."
    """

    # USING 1 PROCESS
    count_ents = 0
    for doc in nlp.pipe([text], n_process=1):
        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
    assert count_ents == 2

    # USING 2 PROCESSES
    count_ents = 0
    for doc in nlp.pipe([text], n_process=2):
        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
    assert count_ents == 2
@@ -1,40 +0,0 @@
from spacy.lang.en import English
from spacy.tokens import Span, Doc


class CustomPipe:
    name = "my_pipe"

    def __init__(self):
        Span.set_extension("my_ext", getter=self._get_my_ext)
        Doc.set_extension("my_ext", default=None)

    def __call__(self, doc):
        gathered_ext = []
        for sent in doc.sents:
            sent_ext = self._get_my_ext(sent)
            sent._.set("my_ext", sent_ext)
            gathered_ext.append(sent_ext)

        doc._.set("my_ext", "\n".join(gathered_ext))

        return doc

    @staticmethod
    def _get_my_ext(span):
        return str(span.end)


def test_issue4903():
    # ensures that this runs correctly and doesn't hang or crash on Windows / macOS

    nlp = English()
    custom_component = CustomPipe()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    nlp.add_pipe(custom_component, after="sentencizer")

    text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
    docs = list(nlp.pipe(text, n_process=2))
    assert docs[0].text == "I like bananas."
    assert docs[1].text == "Do you like them?"
    assert docs[2].text == "No, I prefer wasabi."
@@ -1,8 +0,0 @@
from spacy.gold import Example
from spacy.language import Language


def test_issue4924():
    nlp = Language()
    example = Example.from_dict(nlp.make_doc(""), {})
    nlp.evaluate([example])
@@ -1,6 +1,8 @@
import pytest
from spacy.lang.en import English


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue5152():
    # Test that the comparison between a Span and a Token, goes well
    # There was a bug when the number of tokens in the span equaled the number of characters in the token (!)
@@ -8,7 +10,6 @@ def test_issue5152():
    text = nlp("Talk about being boring!")
    text_var = nlp("Talk of being boring!")
    y = nlp("Let")

    span = text[0:3]  # Talk about being
    span_2 = text[0:3]  # Talk about being
    span_3 = text_var[0:3]  # Talk of being
@@ -63,7 +63,8 @@ def tagger():
    # need to add model for two reasons:
    # 1. no model leads to error in serialization,
    # 2. the affected line is the one for model serialization
    tagger.begin_training(pipeline=nlp.pipeline)
    with pytest.warns(UserWarning):
        tagger.begin_training(pipeline=nlp.pipeline)
    return tagger