mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-07 21:54:54 +03:00
better error message
This commit is contained in:
parent
9b404ea33c
commit
56fb0dfe7e
|
@ -45,7 +45,7 @@ def _stream_jsonl(path: Path) -> Iterable[str]:
|
||||||
for entry in srsly.read_jsonl(path):
|
for entry in srsly.read_jsonl(path):
|
||||||
if "text" not in entry:
|
if "text" not in entry:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"JSONL files have to contain 'text' field."
|
f"{path} does not contain the required 'text' field."
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
yield entry["text"]
|
yield entry["text"]
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
import os
|
import os
|
||||||
import math
|
import math
|
||||||
from random import sample
|
|
||||||
from typing import Counter
|
from typing import Counter
|
||||||
|
|
||||||
|
import spacy
|
||||||
import pytest
|
import pytest
|
||||||
import srsly
|
import srsly
|
||||||
from click import NoSuchOption
|
from click import NoSuchOption
|
||||||
|
@ -26,11 +26,12 @@ from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
|
||||||
from spacy.cli.package import get_third_party_dependencies
|
from spacy.cli.package import get_third_party_dependencies
|
||||||
from spacy.cli.package import _is_permitted_package_name
|
from spacy.cli.package import _is_permitted_package_name
|
||||||
from spacy.cli.validate import get_model_pkgs
|
from spacy.cli.validate import get_model_pkgs
|
||||||
|
from spacy.cli.apply import apply
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.lang.nl import Dutch
|
from spacy.lang.nl import Dutch
|
||||||
from spacy.language import Language
|
from spacy.language import Language
|
||||||
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
|
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc, DocBin
|
||||||
from spacy.tokens.span import Span
|
from spacy.tokens.span import Span
|
||||||
from spacy.training import Example, docs_to_json, offsets_to_biluo_tags
|
from spacy.training import Example, docs_to_json, offsets_to_biluo_tags
|
||||||
from spacy.training.converters import conll_ner_to_docs, conllu_to_docs
|
from spacy.training.converters import conll_ner_to_docs, conllu_to_docs
|
||||||
|
@ -855,3 +856,68 @@ def test_span_length_freq_dist_output_must_be_correct():
|
||||||
span_freqs = _get_spans_length_freq_dist(sample_span_lengths, threshold)
|
span_freqs = _get_spans_length_freq_dist(sample_span_lengths, threshold)
|
||||||
assert sum(span_freqs.values()) >= threshold
|
assert sum(span_freqs.values()) >= threshold
|
||||||
assert list(span_freqs.keys()) == [3, 1, 4, 5, 2]
|
assert list(span_freqs.keys()) == [3, 1, 4, 5, 2]
|
||||||
|
|
||||||
|
|
||||||
|
def test_applycli_empty_dir():
    """Run the apply CLI on a directory that contains no input files.

    The test has no explicit assertions; it passes as long as `apply`
    returns without raising.
    """
    with make_tempdir() as data_path:
        # Build the output location with the `/` operator, exactly like the
        # other apply CLI tests, instead of producing a plain str through
        # os.path.join.
        output = data_path / "test.spacy"
        apply(data_path, output, "blank:en", 1, 1)
|
||||||
|
|
||||||
|
|
||||||
|
def test_applycli_docbin():
    """Run the apply CLI on a directory holding a serialized DocBin.

    Two inputs are written to the same file in turn: an empty DocBin, then
    the same DocBin after one document has been added. The test passes as
    long as neither `apply` call raises.
    """
    with make_tempdir() as tmp_dir:
        source_file = tmp_dir / "testin.spacy"
        target_file = tmp_dir / "testout.spacy"
        doc_bin = DocBin()
        # test empty DocBin case
        doc_bin.to_disk(source_file)
        apply(tmp_dir, target_file, "blank:en", 1, 1)
        # Overwrite the input with a DocBin that now holds a single doc.
        doc_bin.add(spacy.blank("en")("testing apply cli."))
        doc_bin.to_disk(source_file)
        apply(tmp_dir, target_file, "blank:en", 1, 1)
|
||||||
|
|
||||||
|
|
||||||
|
def test_applycli_jsonl():
    """Run the apply CLI on JSONL input, with and without a "text" field.

    A record carrying "text" must be processed without error; once a file
    whose record lacks "text" is added, `apply` must raise a ValueError
    whose message mentions that file.
    """
    with make_tempdir() as tmp_dir:
        target_file = tmp_dir / "testout.spacy"
        srsly.write_jsonl(
            tmp_dir / "test.jsonl",
            [{"text": "Testing apply cli.", "key": 234}],
        )
        apply(tmp_dir, target_file, "blank:en", 1, 1)
        # test no "text" field case
        srsly.write_jsonl(tmp_dir / "test2.jsonl", [{"key": 234}])
        with pytest.raises(ValueError, match="test2.jsonl"):
            apply(tmp_dir, target_file, "blank:en", 1, 1)
|
||||||
|
|
||||||
|
|
||||||
|
def test_applycli_txt():
    """Run the apply CLI on a plain-text file and check the resulting doc.

    The previous body was a copy of the JSONL test and never touched a text
    file. This version writes a single ``.txt`` input, as
    test_applycli_mixed does, and verifies that the output DocBin holds
    exactly one doc carrying the file's text.
    """
    with make_tempdir() as data_path:
        output = data_path / "testout.spacy"
        text = "Testing apply cli"
        with open(data_path / "test.txt", "w", encoding="utf-8") as ftest:
            ftest.write(text)
        apply(data_path, output, "blank:en", 1, 1)
        # A vocab is required to deserialize the docs written by `apply`.
        nlp = spacy.blank("en")
        result = list(DocBin().from_disk(output).get_docs(nlp.vocab))
        assert len(result) == 1
        assert result[0].text == text
|
||||||
|
|
||||||
|
|
||||||
|
def test_applycli_mixed():
    """Run the apply CLI on a directory mixing JSONL, DocBin and text input.

    The same sentence is stored once in each of the three formats, and the
    output DocBin is expected to hold three docs with that exact text.
    """
    with make_tempdir() as tmp_dir:
        sentence = "Testing apply cli"
        pipeline = spacy.blank("en")
        target_file = tmp_dir / "testout.spacy"
        # One input file per supported format, all with the same content.
        srsly.write_jsonl(tmp_dir / "test.jsonl", [{"text": sentence}])
        doc_bin = DocBin()
        doc_bin.add(pipeline(sentence))
        doc_bin.to_disk(tmp_dir / "testin.spacy")
        (tmp_dir / "test.txt").write_text(sentence)
        apply(tmp_dir, target_file, "blank:en", 1, 1)
        processed = DocBin().from_disk(target_file).get_docs(pipeline.vocab)
        texts = [item.text for item in processed]
        assert len(texts) == 3
        assert all(found == sentence for found in texts)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user