mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 02:36:32 +03:00
e48a09df4e
* OrigAnnot class instead of gold.orig_annot list of zipped tuples * from_orig to replace from_annot_tuples * rename to RawAnnot * some unit tests for GoldParse creation and internal format * removing orig_annot and switching to lists instead of tuple * rewriting tuples to use RawAnnot (+ debug statements, WIP) * fix pop() changing the data * small fixes * pop-append fixes * return RawAnnot for existing GoldParse to have uniform interface * clean up imports * fix merge_sents * add unit test for 4402 with new structure (not working yet) * introduce DocAnnot * typo fixes * add unit test for merge_sents * rename from_orig to from_raw * fixing unit tests * fix nn parser * read_annots to produce text, doc_annot pairs * _make_golds fix * rename golds_to_gold_annots * small fixes * fix encoding * have golds_to_gold_annots use DocAnnot * missed a spot * merge_sents as function in DocAnnot * allow specifying only part of the token-level annotations * refactor with Example class + underlying dicts * pipeline components to work with Example objects (wip) * input checking * fix yielding * fix calls to update * small fixes * fix scorer unit test with new format * fix kwargs order * fixes for ud and conllu scripts * fix reading data for conllu script * add in proper errors (not fixed numbering yet to avoid merge conflicts) * fixing few more small bugs * fix EL script
137 lines
3.6 KiB
Python
137 lines
3.6 KiB
Python
# coding: utf-8
|
|
from __future__ import unicode_literals
|
|
|
|
import itertools
|
|
|
|
import pytest
|
|
from spacy.compat import is_python2
|
|
from spacy.gold import GoldParse
|
|
from spacy.language import Language
|
|
from spacy.tokens import Doc, Span
|
|
from spacy.vocab import Vocab
|
|
|
|
from .util import add_vecs_to_vocab, assert_docs_equal
|
|
|
|
|
|
@pytest.fixture
def nlp():
    """Return a fresh pipeline that contains a single text categorizer.

    The categorizer is created with ``create_pipe("textcat")``, receives the
    labels "POSITIVE" and "NEGATIVE", is added to the pipeline, and
    ``begin_training()`` is called before the pipeline is handed to the test.
    """
    pipeline = Language(Vocab())
    categorizer = pipeline.create_pipe("textcat")
    categorizer.add_label("POSITIVE")
    categorizer.add_label("NEGATIVE")
    pipeline.add_pipe(categorizer)
    pipeline.begin_training()
    return pipeline
|
|
|
|
|
|
def test_language_update(nlp):
    """Exercise ``nlp.update`` with every supported (input, annotation) pairing.

    Valid pairings are tried first, in a fixed order; afterwards two invalid
    inputs are checked for the exception type they are expected to raise.
    """
    text = "hello world"
    annots = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}
    wrongkeyannots = {"LABEL": True}
    doc = Doc(nlp.vocab, words=text.split(" "))
    gold = GoldParse(doc, **annots)

    accepted_examples = [
        (doc, gold),  # doc and gold objects
        (text, annots),  # text and dict
        (doc, annots),  # doc object and dict
        (text, gold),  # text and gold object
        (None, gold),  # empty doc and gold object
    ]
    for example in accepted_examples:
        nlp.update(example)

    # Missing annotations are expected to be rejected with a ValueError.
    with pytest.raises(ValueError):
        nlp.update((doc, None))
    # An unknown annotation key is expected to be rejected with a TypeError.
    with pytest.raises(TypeError):
        nlp.update((text, wrongkeyannots))
|
|
|
|
|
|
def test_language_evaluate(nlp):
    """Exercise ``nlp.evaluate`` with every supported (input, annotation) pairing.

    Each valid pairing is passed as a one-element list of tuples; finally a
    flat list (not a list of tuples) is checked to raise an exception.
    """
    text = "hello world"
    annots = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}
    doc = Doc(nlp.vocab, words=text.split(" "))
    gold = GoldParse(doc, **annots)

    accepted_examples = [
        (doc, gold),  # doc and gold objects
        (text, annots),  # text and dict
        (doc, annots),  # doc object and dict
        (text, gold),  # text and gold object
    ]
    for example in accepted_examples:
        nlp.evaluate([example])

    # Evaluate badly: the items are not wrapped in a tuple.
    with pytest.raises(Exception):
        nlp.evaluate([text, gold])
|
|
|
|
|
|
def vector_modification_pipe(doc):
    """Pipeline component that adds 1 to ``doc.vector`` and returns the doc."""
    # Augmented assignment is kept on purpose so the update happens the same
    # way as before (in place when the vector type supports it).
    doc.vector += 1
    return doc
|
|
|
|
|
|
def userdata_pipe(doc):
    """Pipeline component that stores "bar" under "foo" in ``doc.user_data``.

    The same doc object is returned so the component can be chained.
    """
    doc.user_data["foo"] = "bar"
    return doc
|
|
|
|
|
|
def ner_pipe(doc):
    """Pipeline component that appends a "FIRST" entity to ``doc.ents``.

    The entity is a ``Span`` built from token positions 0 to 1 of the doc;
    the same doc object is returned.
    """
    first_entity = Span(doc, 0, 1, label="FIRST")
    doc.ents += (first_entity,)
    return doc
|
|
|
|
|
|
@pytest.fixture
def sample_vectors():
    """Return (word, 3-dimensional vector) pairs used to seed a vocab."""
    word_vectors = [
        ("spacy", [-0.1, -0.2, -0.3]),
        ("world", [-0.2, -0.3, -0.4]),
        ("pipe", [0.7, 0.8, 0.9]),
    ]
    return word_vectors
|
|
|
|
|
|
@pytest.fixture
def nlp2(nlp, sample_vectors):
    """Extend the ``nlp`` fixture with vectors and three function components.

    The sample vectors are added to the pipeline's vocab, then the vector,
    entity and user-data components are appended in that order. The same
    pipeline object that was passed in is returned.
    """
    add_vecs_to_vocab(nlp.vocab, sample_vectors)
    for component in (vector_modification_pipe, ner_pipe, userdata_pipe):
        nlp.add_pipe(component)
    return nlp
|
|
|
|
|
|
@pytest.fixture
def texts():
    """Return four short sentences used as input for the ``pipe`` tests."""
    return [
        "Hello world.",
        "This is spacy.",
        "You can use multiprocessing with pipe method.",
        "Please try!",
    ]
|
|
|
|
|
|
@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe(nlp2, n_process, texts):
    """Check that ``nlp.pipe`` yields the same docs as calling ``nlp`` per text.

    The input is the ``texts`` fixture repeated ten times, processed with a
    batch size of 2 and ``n_process`` worker processes. Every piped doc must
    equal the doc produced by a direct call on the same text, and the number
    of piped docs must match the number of inputs.
    """
    texts = texts * 10
    expecteds = [nlp2(text) for text in texts]
    # Materialize the output before comparing: zip() stops at the shorter
    # input, so without the length check a pipe() that yields too few docs
    # (or none at all) would let this test pass.
    docs = list(nlp2.pipe(texts, n_process=n_process, batch_size=2))
    assert len(docs) == len(expecteds)

    for doc, expected_doc in zip(docs, expecteds):
        assert_docs_equal(doc, expected_doc)
|
|
|
|
|
|
@pytest.mark.skipif(
    is_python2, reason="python2 seems to be unable to handle iterator properly"
)
@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe_stream(nlp2, n_process, texts):
    """Check that ``nlp.pipe`` can consume an iterator of infinite length.

    The texts are cycled endlessly and split into two independent iterators:
    one feeds direct ``nlp`` calls, the other feeds ``nlp.pipe``. Only the
    first ``n_fetch`` pairs are compared, so everything must stay lazy.
    """
    n_fetch = 20
    endless_texts = itertools.cycle(texts)
    reference_texts, piped_texts = itertools.tee(endless_texts)
    # Generator expression, not a list: the source never ends.
    expecteds = (nlp2(text) for text in reference_texts)
    docs = nlp2.pipe(piped_texts, n_process=n_process, batch_size=2)

    doc_pairs = zip(docs, expecteds)
    for doc, expected_doc in itertools.islice(doc_pairs, n_fetch):
        assert_docs_equal(doc, expected_doc)
|