From 2c5a36ac285410f11fcc13e8e0042d8d48297d3e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 19 Jan 2023 10:56:41 +0100
Subject: [PATCH] Add plain text corpus tests

---
 spacy/tests/training/test_corpus.py | 75 +++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100644 spacy/tests/training/test_corpus.py

diff --git a/spacy/tests/training/test_corpus.py b/spacy/tests/training/test_corpus.py
new file mode 100644
index 000000000..1d90f9ca2
--- /dev/null
+++ b/spacy/tests/training/test_corpus.py
@@ -0,0 +1,75 @@
+from typing import Iterable, List, Tuple
+from contextlib import contextmanager
+from pathlib import Path
+import pytest
+import tempfile
+
+from spacy.lang.en import English
+from spacy.training import Example, PlainTextCorpus
+
+# Intentional newlines to check that they are skipped.
+PLAIN_TEXT_DOC = """
+
+This is a doc. It contains two sentences.
+This is another doc.
+
+A third doc.
+
+"""
+
+PLAIN_TEXT_DOC_TOKENIZED = [
+    [
+        "This",
+        "is",
+        "a",
+        "doc",
+        ".",
+        "It",
+        "contains",
+        "two",
+        "sentences",
+        ".",
+    ],
+    ["This", "is", "another", "doc", "."],
+    ["A", "third", "doc", "."],
+]
+
+
+@pytest.mark.parametrize("min_length, max_length", [(0, 0), (0, 5), (5, 0), (5, 5)])
+def test_plain_text_reader(min_length, max_length):
+    nlp = English()
+    with _string_to_tmp_file(PLAIN_TEXT_DOC) as f:
+        corpus = PlainTextCorpus(
+            Path(f.name), min_length=min_length, max_length=max_length
+        )
+
+        check = [
+            doc
+            for doc in PLAIN_TEXT_DOC_TOKENIZED
+            if len(doc) >= min_length and (max_length == 0 or len(doc) <= max_length)
+        ]
+        reference, predicted = _examples_to_tokens(corpus(nlp))
+
+        assert reference == check
+        assert predicted == check
+
+
+@contextmanager
+def _string_to_tmp_file(s: str):
+    with tempfile.NamedTemporaryFile(suffix=".txt") as f:
+        f.write(s.encode("utf-8"))
+        f.seek(0)
+        yield f
+
+
+def _examples_to_tokens(
+    examples: Iterable[Example],
+) -> Tuple[List[List[str]], List[List[str]]]:
+    reference = []
+    predicted = []
+
+    for eg in examples:
+        reference.append([t.text for t in eg.reference])
+        predicted.append([t.text for t in eg.predicted])
+
+    return reference, predicted