mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-30 20:06:30 +03:00
8d69874afb
* Add `spacy.PlainTextCorpusReader.v1`

  This is a corpus reader that reads plain text corpora with the following format:

  - UTF-8 encoding
  - One line per document.
  - Blank lines are ignored.

  It is useful for applications where we deal with very large corpora, such as distillation, and don't want to deal with the space overhead of serialized formats. Additionally, many large corpora already use such a text format, keeping the necessary preprocessing to a minimum.

* Update spacy/training/corpus.py

  Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* docs: add version to `PlainTextCorpus`
* Add docstring to registry function
* Add plain text corpus tests
* Only strip newline/carriage return
* Add return type to `_string_to_tmp_file` helper
* Use a temporary directory in place of file name

  Different OS auto delete/sharing semantics are just wonky.

* This will be new in 3.5.1 (rather than 4)
* Test improvements from code review

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
13 lines
831 B
Python
13 lines
831 B
Python
from .corpus import Corpus, JsonlCorpus, PlainTextCorpus # noqa: F401
|
|
from .example import Example, validate_examples, validate_get_examples # noqa: F401
|
|
from .alignment import Alignment # noqa: F401
|
|
from .augment import dont_augment, orth_variants_augmenter # noqa: F401
|
|
from .iob_utils import iob_to_biluo, biluo_to_iob # noqa: F401
|
|
from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets # noqa: F401
|
|
from .iob_utils import biluo_tags_to_spans, tags_to_entities # noqa: F401
|
|
from .iob_utils import split_bilu_label, remove_bilu_prefix # noqa: F401
|
|
from .gold_io import docs_to_json, read_json_file # noqa: F401
|
|
from .batchers import minibatch_by_padded_size, minibatch_by_words # noqa: F401
|
|
from .loggers import console_logger # noqa: F401
|
|
from .callbacks import create_copy_from_base_model # noqa: F401
|