import itertools
import pytest
from spacy.language import Language
from spacy.tokens import Doc, Span
from spacy.vocab import Vocab
from spacy.gold import Example
from spacy.lang.en import English
from spacy.util import registry

from .util import add_vecs_to_vocab, assert_docs_equal


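# Blank pipeline with a two-label textcat component, shared by the tests below.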
@pytest.fixture
def nlp():
    nlp = Language(Vocab())
    textcat = nlp.add_pipe("textcat")
    for label in ("POSITIVE", "NEGATIVE"):
        textcat.add_label(label)
    nlp.begin_training()
    return nlp


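# nlp.update and nlp.evaluate take a batch of Example objects (the v3
# replacement for GoldParse), each pairing a Doc with its gold annotations.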
def test_language_update(nlp):
    text = "hello world"
    annots = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}
    wrongkeyannots = {"LABEL": True}
    doc = Doc(nlp.vocab, words=text.split(" "))
    example = Example.from_dict(doc, annots)
    nlp.update([example])

    # Not allowed to call with just one Example
    with pytest.raises(TypeError):
        nlp.update(example)

    # Update with text and dict: no longer supported as of v3
    with pytest.raises(TypeError):
        nlp.update((text, annots))
    # Update with doc object and dict: no longer supported as of v3
    with pytest.raises(TypeError):
        nlp.update((doc, annots))

    # Constructing an Example with invalid annotations
    with pytest.raises(ValueError):
        example = Example.from_dict(doc, None)
    with pytest.raises(KeyError):
        example = Example.from_dict(doc, wrongkeyannots)


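# Unlike test_language_update above, the annotations here are nested under
# "doc_annotation"; Example.from_dict accepts both the flat and the nested
# layout.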
def test_language_evaluate(nlp):
    text = "hello world"
    annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}}
    doc = Doc(nlp.vocab, words=text.split(" "))
    example = Example.from_dict(doc, annots)
    nlp.evaluate([example])

    # Not allowed to call with just one Example
    with pytest.raises(TypeError):
        nlp.evaluate(example)

    # Evaluate with text and dict: no longer supported as of v3
    with pytest.raises(TypeError):
        nlp.evaluate([(text, annots)])
    # Evaluate with doc object and dict: no longer supported as of v3
    with pytest.raises(TypeError):
        nlp.evaluate([(doc, annots)])
    with pytest.raises(TypeError):
        nlp.evaluate([text, annots])


def test_evaluate_no_pipe(nlp):
    """Test that docs are processed correctly within Language.pipe if the
    component doesn't expose a .pipe method."""

    @Language.component("test_evaluate_no_pipe")
    def pipe(doc):
        return doc

    text = "hello world"
    annots = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}
    nlp = Language(Vocab())
    doc = nlp(text)
    nlp.add_pipe("test_evaluate_no_pipe")
    nlp.evaluate([Example.from_dict(doc, annots)])


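# Stateless components used by the nlp2 fixture below; together they verify
# that vector changes, user_data and entity spans set by pipeline components
# survive nlp.pipe, including the multiprocessing path.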
@Language.component("test_language_vector_modification_pipe")
def vector_modification_pipe(doc):
    doc.vector += 1
    return doc


@Language.component("test_language_userdata_pipe")
def userdata_pipe(doc):
    doc.user_data["foo"] = "bar"
    return doc


@Language.component("test_language_ner_pipe")
def ner_pipe(doc):
    span = Span(doc, 0, 1, label="FIRST")
    doc.ents += (span,)
    return doc


@pytest.fixture
def sample_vectors():
    return [
        ("spacy", [-0.1, -0.2, -0.3]),
        ("world", [-0.2, -0.3, -0.4]),
        ("pipe", [0.7, 0.8, 0.9]),
    ]


@pytest.fixture
def nlp2(nlp, sample_vectors):
    add_vecs_to_vocab(nlp.vocab, sample_vectors)
    nlp.add_pipe("test_language_vector_modification_pipe")
    nlp.add_pipe("test_language_ner_pipe")
    nlp.add_pipe("test_language_userdata_pipe")
    return nlp


@pytest.fixture
def texts():
    data = [
        "Hello world.",
        "This is spacy.",
        "You can use multiprocessing with pipe method.",
        "Please try!",
    ]
    return data


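# n_process=2 exercises the multiprocessing path of nlp.pipe, n_process=1 the
# in-process path; both must produce docs identical to calling nlp directly.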
@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe(nlp2, n_process, texts):
    texts = texts * 10
    expecteds = [nlp2(text) for text in texts]
    docs = nlp2.pipe(texts, n_process=n_process, batch_size=2)

    for doc, expected_doc in zip(docs, expecteds):
        assert_docs_equal(doc, expected_doc)


@pytest.mark.parametrize("n_process", [1, 2])
def test_language_pipe_stream(nlp2, n_process, texts):
    # Check that nlp.pipe can handle an infinite-length iterator properly.
    stream_texts = itertools.cycle(texts)
    texts0, texts1 = itertools.tee(stream_texts)
    expecteds = (nlp2(text) for text in texts0)
    docs = nlp2.pipe(texts1, n_process=n_process, batch_size=2)

    # Only fetch the first few docs: the zipped streams never terminate.
    n_fetch = 20
    for doc, expected_doc in itertools.islice(zip(docs, expecteds), n_fetch):
        assert_docs_equal(doc, expected_doc)


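# The [nlp] config block accepts registered callbacks that run before the
# Language subclass is created, after the nlp object is created, and after
# the pipeline components are added; each must return the (possibly modified)
# class or nlp object it received.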
def test_language_from_config_before_after_init():
    name = "test_language_from_config_before_after_init"
    ran_before = False
    ran_after = False
    ran_after_pipeline = False

    @registry.callbacks(f"{name}_before")
    def make_before_creation():
        def before_creation(lang_cls):
            nonlocal ran_before
            ran_before = True
            assert lang_cls is English
            lang_cls.Defaults.foo = "bar"
            return lang_cls

        return before_creation

    @registry.callbacks(f"{name}_after")
    def make_after_creation():
        def after_creation(nlp):
            nonlocal ran_after
            ran_after = True
            assert isinstance(nlp, English)
            assert nlp.pipe_names == []
            assert nlp.Defaults.foo == "bar"
            nlp.meta["foo"] = "bar"
            return nlp

        return after_creation

    @registry.callbacks(f"{name}_after_pipeline")
    def make_after_pipeline_creation():
        def after_pipeline_creation(nlp):
            nonlocal ran_after_pipeline
            ran_after_pipeline = True
            assert isinstance(nlp, English)
            assert nlp.pipe_names == ["sentencizer"]
            assert nlp.Defaults.foo == "bar"
            assert nlp.meta["foo"] == "bar"
            nlp.meta["bar"] = "baz"
            return nlp

        return after_pipeline_creation

    config = {
        "nlp": {
            "pipeline": ["sentencizer"],
            "before_creation": {"@callbacks": f"{name}_before"},
            "after_creation": {"@callbacks": f"{name}_after"},
            "after_pipeline_creation": {"@callbacks": f"{name}_after_pipeline"},
        },
        "components": {"sentencizer": {"factory": "sentencizer"}},
    }
    nlp = English.from_config(config)
    assert all([ran_before, ran_after, ran_after_pipeline])
    assert nlp.Defaults.foo == "bar"
    assert nlp.meta["foo"] == "bar"
    assert nlp.meta["bar"] == "baz"
    assert nlp.pipe_names == ["sentencizer"]
    assert nlp("text")


def test_language_from_config_before_after_init_invalid():
    """Check that an error is raised if a callback doesn't return the nlp object."""
    name = "test_language_from_config_before_after_init_invalid"
    registry.callbacks(f"{name}_before1", func=lambda: lambda nlp: None)
    registry.callbacks(f"{name}_before2", func=lambda: lambda nlp: nlp())
    registry.callbacks(f"{name}_after1", func=lambda: lambda nlp: None)
    registry.callbacks(f"{name}_after2", func=lambda: lambda nlp: English)

    for callback_name in [f"{name}_before1", f"{name}_before2"]:
        config = {"nlp": {"before_creation": {"@callbacks": callback_name}}}
        with pytest.raises(ValueError):
            English.from_config(config)
    for callback_name in [f"{name}_after1", f"{name}_after2"]:
        config = {"nlp": {"after_creation": {"@callbacks": callback_name}}}
        with pytest.raises(ValueError):
            English.from_config(config)
    for callback_name in [f"{name}_after1", f"{name}_after2"]:
        config = {"nlp": {"after_pipeline_creation": {"@callbacks": callback_name}}}
        with pytest.raises(ValueError):
            English.from_config(config)


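# Custom tokenizers are registered as two-layer factories: the outer function
# takes config settings (here `prefix`) and returns an inner function that
# takes the nlp object and returns the tokenizer instance.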
def test_language_custom_tokenizer():
    """Test that a fully custom tokenizer can be plugged in via the registry."""
    name = "test_language_custom_tokenizer"

    class CustomTokenizer:
        """Dummy "tokenizer" that splits on spaces and adds a prefix to each word."""

        def __init__(self, nlp, prefix):
            self.vocab = nlp.vocab
            self.prefix = prefix

        def __call__(self, text):
            words = [f"{self.prefix}{word}" for word in text.split(" ")]
            return Doc(self.vocab, words=words)

    @registry.tokenizers(name)
    def custom_create_tokenizer(prefix: str = "_"):
        def create_tokenizer(nlp):
            return CustomTokenizer(nlp, prefix=prefix)

        return create_tokenizer

    config = {"nlp": {"tokenizer": {"@tokenizers": name}}}
    nlp = English.from_config(config)
    doc = nlp("hello world")
    assert [t.text for t in doc] == ["_hello", "_world"]
    doc = list(nlp.pipe(["hello world"]))[0]
    assert [t.text for t in doc] == ["_hello", "_world"]