Add plain text corpus tests
This commit is contained in:
parent 1da9f15d62
commit 2c5a36ac28
75  spacy/tests/training/test_corpus.py  Normal file
@@ -0,0 +1,75 @@
from typing import Iterable, List, Tuple
from contextlib import contextmanager
from pathlib import Path
import pytest
import tempfile

from spacy.lang.en import English
from spacy.training import Example, PlainTextCorpus

# Intentional newlines to check that they are skipped.
PLAIN_TEXT_DOC = """

This is a doc. It contains two sentences.
This is another doc.

A third doc.

"""

PLAIN_TEXT_DOC_TOKENIZED = [
    [
        "This",
        "is",
        "a",
        "doc",
        ".",
        "It",
        "contains",
        "two",
        "sentences",
        ".",
    ],
    ["This", "is", "another", "doc", "."],
    ["A", "third", "doc", "."],
]


@pytest.mark.parametrize("min_length, max_length", [(0, 0), (0, 5), (5, 0), (5, 5)])
def test_plain_text_reader(min_length, max_length):
    nlp = English()
    with _string_to_tmp_file(PLAIN_TEXT_DOC) as f:
        corpus = PlainTextCorpus(
            Path(f.name), min_length=min_length, max_length=max_length
        )

        check = [
            doc
            for doc in PLAIN_TEXT_DOC_TOKENIZED
            if len(doc) >= min_length and (max_length == 0 or len(doc) <= max_length)
        ]
        reference, predicted = _examples_to_tokens(corpus(nlp))

        assert reference == check
        assert predicted == check


@contextmanager
def _string_to_tmp_file(s: str):
    with tempfile.NamedTemporaryFile(suffix=".txt") as f:
        f.write(s.encode("utf-8"))
        f.seek(0)
        yield f


def _examples_to_tokens(
    examples: Iterable[Example],
) -> Tuple[List[List[str]], List[List[str]]]:
    reference = []
    predicted = []

    for eg in examples:
        reference.append([t.text for t in eg.reference])
        predicted.append([t.text for t in eg.predicted])

    return reference, predicted
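
For context, a minimal sketch of using the PlainTextCorpus reader exercised by these tests directly on a file. The docs.txt path and the printing loop are illustrative assumptions, not part of this commit; the constructor arguments and the corpus(nlp) call mirror the test above.

from pathlib import Path

from spacy.lang.en import English
from spacy.training import PlainTextCorpus

# Hypothetical plain text file with one doc per line; per the test above,
# blank lines are skipped and min_length/max_length of 0 disable the filters.
corpus = PlainTextCorpus(Path("docs.txt"), min_length=0, max_length=0)

nlp = English()
for example in corpus(nlp):  # yields spacy.training.Example objects
    print([token.text for token in example.reference])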