Mirror of https://github.com/explosion/spaCy.git
Merge branch 'master' into remove-dead-code

Commit: fb6acb5dfd

spacy/tests/training/test_corpus.py (new file, 78 lines)

@@ -0,0 +1,78 @@
from typing import IO, Generator, Iterable, List, TextIO, Tuple
from contextlib import contextmanager
from pathlib import Path
import pytest
import tempfile

from spacy.lang.en import English
from spacy.training import Example, PlainTextCorpus
from spacy.util import make_tempdir

# Intentional newlines to check that they are skipped.
PLAIN_TEXT_DOC = """

This is a doc. It contains two sentences.
This is another doc.

A third doc.

"""

PLAIN_TEXT_DOC_TOKENIZED = [
    [
        "This",
        "is",
        "a",
        "doc",
        ".",
        "It",
        "contains",
        "two",
        "sentences",
        ".",
    ],
    ["This", "is", "another", "doc", "."],
    ["A", "third", "doc", "."],
]


@pytest.mark.parametrize("min_length", [0, 5])
@pytest.mark.parametrize("max_length", [0, 5])
def test_plain_text_reader(min_length, max_length):
    nlp = English()
    with _string_to_tmp_file(PLAIN_TEXT_DOC) as file_path:
        corpus = PlainTextCorpus(
            file_path, min_length=min_length, max_length=max_length
        )

        check = [
            doc
            for doc in PLAIN_TEXT_DOC_TOKENIZED
            if len(doc) >= min_length and (max_length == 0 or len(doc) <= max_length)
        ]
        reference, predicted = _examples_to_tokens(corpus(nlp))

        assert reference == check
        assert predicted == check


@contextmanager
def _string_to_tmp_file(s: str) -> Generator[Path, None, None]:
    with make_tempdir() as d:
        file_path = Path(d) / "string.txt"
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(s)
        yield file_path


def _examples_to_tokens(
    examples: Iterable[Example],
) -> Tuple[List[List[str]], List[List[str]]]:
    reference = []
    predicted = []

    for eg in examples:
        reference.append([t.text for t in eg.reference])
        predicted.append([t.text for t in eg.predicted])

    return reference, predicted
spacy/training/__init__.py

@@ -1,4 +1,4 @@
-from .corpus import Corpus, JsonlCorpus  # noqa: F401
+from .corpus import Corpus, JsonlCorpus, PlainTextCorpus  # noqa: F401
from .example import Example, validate_examples, validate_get_examples  # noqa: F401
from .alignment import Alignment  # noqa: F401
from .augment import dont_augment, orth_variants_augmenter  # noqa: F401
spacy/training/corpus.py

@@ -58,6 +58,28 @@ def read_labels(path: Path, *, require: bool = False):
    return srsly.read_json(path)


@util.registry.readers("spacy.PlainTextCorpus.v1")
def create_plain_text_reader(
    path: Optional[Path],
    min_length: int = 0,
    max_length: int = 0,
) -> Callable[["Language"], Iterable[Doc]]:
    """Iterate Example objects from a file or directory of plain text
    UTF-8 files with one line per doc.

    path (Path): The directory or filename to read from.
    min_length (int): Minimum document length (in tokens). Shorter documents
        will be skipped. Defaults to 0, which indicates no limit.
    max_length (int): Maximum document length (in tokens). Longer documents will
        be skipped. Defaults to 0, which indicates no limit.

    DOCS: https://spacy.io/api/corpus#plaintextcorpus
    """
    if path is None:
        raise ValueError(Errors.E913)
    return PlainTextCorpus(path, min_length=min_length, max_length=max_length)


def walk_corpus(path: Union[str, Path], file_type) -> List[Path]:
    path = util.ensure_path(path)
    if not path.is_dir() and path.parts[-1].endswith(file_type):
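For context, the factory registered above can also be fetched from spaCy's `readers` registry and called directly, which is what the config system does for a `[corpora.*]` block. The sketch below is not part of the commit; it assumes an install that includes this change, and the corpus path is a made-up placeholder.

```python
import spacy
from spacy import registry

# Look up the factory registered as "spacy.PlainTextCorpus.v1" and build the
# corpus object (a PlainTextCorpus instance).
create_reader = registry.readers.get("spacy.PlainTextCorpus.v1")
corpus = create_reader("corpus/raw_text.txt", min_length=0, max_length=0)  # hypothetical path

nlp = spacy.blank("en")
for example in corpus(nlp):
    # Each non-blank line of the file becomes one Example whose reference and
    # predicted docs contain the same tokens.
    print([token.text for token in example.reference])
```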
@@ -257,3 +279,52 @@ class JsonlCorpus:
                # We don't *need* an example here, but it seems nice to
                # make it match the Corpus signature.
                yield Example(doc, Doc(nlp.vocab, words=words, spaces=spaces))


class PlainTextCorpus:
    """Iterate Example objects from a file or directory of plain text
    UTF-8 files with one line per doc.

    path (Path): The directory or filename to read from.
    min_length (int): Minimum document length (in tokens). Shorter documents
        will be skipped. Defaults to 0, which indicates no limit.
    max_length (int): Maximum document length (in tokens). Longer documents will
        be skipped. Defaults to 0, which indicates no limit.

    DOCS: https://spacy.io/api/corpus#plaintextcorpus
    """

    file_type = "txt"

    def __init__(
        self,
        path: Optional[Union[str, Path]],
        *,
        min_length: int = 0,
        max_length: int = 0,
    ) -> None:
        self.path = util.ensure_path(path)
        self.min_length = min_length
        self.max_length = max_length

    def __call__(self, nlp: "Language") -> Iterator[Example]:
        """Yield examples from the data.

        nlp (Language): The current nlp object.
        YIELDS (Example): The example objects.

        DOCS: https://spacy.io/api/corpus#plaintextcorpus-call
        """
        for loc in walk_corpus(self.path, ".txt"):
            with open(loc, encoding="utf-8") as f:
                for text in f:
                    text = text.rstrip("\r\n")
                    if len(text):
                        doc = nlp.make_doc(text)
                        if self.min_length >= 1 and len(doc) < self.min_length:
                            continue
                        elif self.max_length >= 1 and len(doc) > self.max_length:
                            continue
                        # We don't *need* an example here, but it seems nice to
                        # make it match the Corpus signature.
                        yield Example(doc, doc.copy())
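To illustrate the filtering logic in `PlainTextCorpus.__call__` above (blank lines are skipped, and documents outside the configured token-length range are dropped), here is a small usage sketch. It is not part of the commit; the file name and contents are invented for the example.

```python
from pathlib import Path

import spacy
from spacy.training import PlainTextCorpus

# Hypothetical one-doc-per-line text file with an intentional blank line.
path = Path("example_docs.txt")
path.write_text(
    "This is a doc. It contains two sentences.\n"
    "\n"
    "A third doc.\n",
    encoding="utf-8",
)

nlp = spacy.blank("en")
corpus = PlainTextCorpus(path, min_length=0, max_length=5)

# The blank line is skipped, and the first doc (10 tokens) exceeds
# max_length=5, so only ["A", "third", "doc", "."] is yielded.
for eg in corpus(nlp):
    print([t.text for t in eg.reference])
```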
website/docs/api/corpus.mdx

@@ -175,3 +175,68 @@ Yield examples from the data.
| ---------- | -------------------------------------- |
| `nlp`      | The current `nlp` object. ~~Language~~ |
| **YIELDS** | The examples. ~~Example~~              |

## PlainTextCorpus {id="plaintextcorpus",tag="class",version="3.5.1"}

Iterate over documents from a plain text file. Can be used to read the raw text
corpus for language model
[pretraining](/usage/embeddings-transformers#pretraining). The expected file
format is:

- UTF-8 encoding
- One document per line
- Blank lines are ignored.

```text {title="Example"}
Can I ask where you work now and what you do, and if you enjoy it?
They may just pull out of the Seattle market completely, at least until they have autonomous vehicles.
My cynical view on this is that it will never be free to the public. Reason: what would be the draw of joining the military? Right now their selling point is free Healthcare and Education. Ironically both are run horribly and most, that I've talked to, come out wishing they never went in.
```

### PlainTextCorpus.\_\_init\_\_ {id="plaintextcorpus-init",tag="method"}

Initialize the reader.

> #### Example
>
> ```python
> from spacy.training import PlainTextCorpus
>
> corpus = PlainTextCorpus("./data/docs.txt")
> ```
>
> ```ini
> ### Example config
> [corpora.pretrain]
> @readers = "spacy.PlainTextCorpus.v1"
> path = "corpus/raw_text.txt"
> min_length = 0
> max_length = 0
> ```

| Name           | Description                                                                                                                 |
| -------------- | --------------------------------------------------------------------------------------------------------------------------- |
| `path`         | The directory or filename to read from. Expects newline-delimited documents in UTF8 format. ~~Union[str, Path]~~           |
| _keyword-only_ |                                                                                                                             |
| `min_length`   | Minimum document length (in tokens). Shorter documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~ |
| `max_length`   | Maximum document length (in tokens). Longer documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~  |

### PlainTextCorpus.\_\_call\_\_ {id="plaintextcorpus-call",tag="method"}

Yield examples from the data.

> #### Example
>
> ```python
> from spacy.training import PlainTextCorpus
> import spacy
>
> corpus = PlainTextCorpus("./docs.txt")
> nlp = spacy.blank("en")
> data = corpus(nlp)
> ```

| Name       | Description                            |
| ---------- | -------------------------------------- |
| `nlp`      | The current `nlp` object. ~~Language~~ |
| **YIELDS** | The examples. ~~Example~~              |
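The `[corpora.pretrain]` example above can also be resolved programmatically into a `PlainTextCorpus` object through spaCy's registry. The following is an illustrative sketch rather than part of this commit; the config string mirrors the documented example and the path is a placeholder.

```python
from spacy.util import registry
from thinc.api import Config

CONFIG_STR = """
[corpora]

[corpora.pretrain]
@readers = "spacy.PlainTextCorpus.v1"
path = "corpus/raw_text.txt"
min_length = 0
max_length = 0
"""

# Resolving the config calls create_plain_text_reader under the hood and
# places the resulting PlainTextCorpus at resolved["corpora"]["pretrain"].
resolved = registry.resolve(Config().from_str(CONFIG_STR))
corpus = resolved["corpora"]["pretrain"]
```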
(website Sass stylesheet)

@@ -77,7 +77,7 @@ $border-radius: 6px
    padding: 1.5rem 2.5rem 2.5rem 2rem

    a, a:hover
-        color: var(--color-subtle)
+        color: var(--color-subtle-on-dark)

    & > *:last-child
        margin-bottom: 0

@@ -195,7 +195,7 @@
        position: absolute

    .menu
-        color: var(--color-subtle)
+        color: var(--color-subtle-on-dark)
        padding-right: 1.5rem
        display: inline-block
        position: absolute