mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-07 15:56:32 +03:00
483dddc9bc
* Add custom MatchPatternError * Improve validators and add validation option to Matcher * Adjust formatting * Never validate in Matcher within PhraseMatcher If we do decide to make validate default to True, the PhraseMatcher's Matcher shouldn't ever validate. Here, we create the patterns automatically anyways (and it's currently unclear whether the validation has performance impacts at a very large scale).
68 lines
2.2 KiB
Python
68 lines
2.2 KiB
Python
# coding: utf-8
|
|
from __future__ import unicode_literals
|
|
|
|
import pytest
|
|
from spacy.cli._schemas import TRAINING_SCHEMA
|
|
from spacy.util import get_json_validator, validate_json
|
|
from spacy.tokens import Doc
|
|
from ..util import get_doc
|
|
|
|
|
|
@pytest.fixture()
|
|
def doc(en_vocab):
|
|
words = ["c", "d", "e"]
|
|
pos = ["VERB", "NOUN", "NOUN"]
|
|
tags = ["VBP", "NN", "NN"]
|
|
heads = [0, -1, -2]
|
|
deps = ["ROOT", "dobj", "dobj"]
|
|
ents = [(1, 2, "ORG")]
|
|
return get_doc(
|
|
en_vocab, words=words, pos=pos, tags=tags, heads=heads, deps=deps, ents=ents
|
|
)
|
|
|
|
|
|
def test_doc_to_json(doc):
|
|
json_doc = doc.to_json()
|
|
assert json_doc["text"] == "c d e "
|
|
assert len(json_doc["tokens"]) == 3
|
|
assert json_doc["tokens"][0]["pos"] == "VERB"
|
|
assert json_doc["tokens"][0]["tag"] == "VBP"
|
|
assert json_doc["tokens"][0]["dep"] == "ROOT"
|
|
assert len(json_doc["ents"]) == 1
|
|
assert json_doc["ents"][0]["start"] == 2 # character offset!
|
|
assert json_doc["ents"][0]["end"] == 3 # character offset!
|
|
assert json_doc["ents"][0]["label"] == "ORG"
|
|
|
|
|
|
def test_doc_to_json_underscore(doc):
|
|
Doc.set_extension("json_test1", default=False)
|
|
Doc.set_extension("json_test2", default=False)
|
|
doc._.json_test1 = "hello world"
|
|
doc._.json_test2 = [1, 2, 3]
|
|
json_doc = doc.to_json(underscore=["json_test1", "json_test2"])
|
|
assert "_" in json_doc
|
|
assert json_doc["_"]["json_test1"] == "hello world"
|
|
assert json_doc["_"]["json_test2"] == [1, 2, 3]
|
|
|
|
|
|
def test_doc_to_json_underscore_error_attr(doc):
|
|
"""Test that Doc.to_json() raises an error if a custom attribute doesn't
|
|
exist in the ._ space."""
|
|
with pytest.raises(ValueError):
|
|
doc.to_json(underscore=["json_test3"])
|
|
|
|
|
|
def test_doc_to_json_underscore_error_serialize(doc):
|
|
"""Test that Doc.to_json() raises an error if a custom attribute value
|
|
isn't JSON-serializable."""
|
|
Doc.set_extension("json_test4", method=lambda doc: doc.text)
|
|
with pytest.raises(ValueError):
|
|
doc.to_json(underscore=["json_test4"])
|
|
|
|
|
|
def test_doc_to_json_valid_training(doc):
|
|
json_doc = doc.to_json()
|
|
validator = get_json_validator(TRAINING_SCHEMA)
|
|
errors = validate_json([json_doc], validator)
|
|
assert not errors
|