mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	revert changes to corpus.py
This commit is contained in:
		
							parent
							
								
									350b8dd644
								
							
						
					
					
						commit
						75514c4bc4
					
				| 
						 | 
					@ -6,7 +6,6 @@ from typing import TYPE_CHECKING, Callable, Iterable, Iterator, List, Optional,
 | 
				
			||||||
import srsly
 | 
					import srsly
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .. import util
 | 
					from .. import util
 | 
				
			||||||
from ..compat import Protocol
 | 
					 | 
				
			||||||
from ..errors import Errors, Warnings
 | 
					from ..errors import Errors, Warnings
 | 
				
			||||||
from ..tokens import Doc, DocBin
 | 
					from ..tokens import Doc, DocBin
 | 
				
			||||||
from ..vocab import Vocab
 | 
					from ..vocab import Vocab
 | 
				
			||||||
| 
						 | 
					@ -20,10 +19,6 @@ if TYPE_CHECKING:
 | 
				
			||||||
FILE_TYPE = ".spacy"
 | 
					FILE_TYPE = ".spacy"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class ReaderProtocol(Protocol):
 | 
					 | 
				
			||||||
    def __call__(self, nlp: "Language") -> Iterable[Example]:
 | 
					 | 
				
			||||||
        pass
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
@util.registry.readers("spacy.Corpus.v1")
 | 
					@util.registry.readers("spacy.Corpus.v1")
 | 
				
			||||||
def create_docbin_reader(
 | 
					def create_docbin_reader(
 | 
				
			||||||
| 
						 | 
					@ -32,7 +27,7 @@ def create_docbin_reader(
 | 
				
			||||||
    max_length: int = 0,
 | 
					    max_length: int = 0,
 | 
				
			||||||
    limit: int = 0,
 | 
					    limit: int = 0,
 | 
				
			||||||
    augmenter: Optional[Callable] = None,
 | 
					    augmenter: Optional[Callable] = None,
 | 
				
			||||||
) -> ReaderProtocol:
 | 
					) -> Callable[["Language"], Iterable[Example]]:
 | 
				
			||||||
    if path is None:
 | 
					    if path is None:
 | 
				
			||||||
        raise ValueError(Errors.E913)
 | 
					        raise ValueError(Errors.E913)
 | 
				
			||||||
    util.logger.debug("Loading corpus from path: %s", path)
 | 
					    util.logger.debug("Loading corpus from path: %s", path)
 | 
				
			||||||
| 
						 | 
					@ -51,7 +46,7 @@ def create_jsonl_reader(
 | 
				
			||||||
    min_length: int = 0,
 | 
					    min_length: int = 0,
 | 
				
			||||||
    max_length: int = 0,
 | 
					    max_length: int = 0,
 | 
				
			||||||
    limit: int = 0,
 | 
					    limit: int = 0,
 | 
				
			||||||
) -> ReaderProtocol:
 | 
					) -> Callable[["Language"], Iterable[Example]]:
 | 
				
			||||||
    return JsonlCorpus(path, min_length=min_length, max_length=max_length, limit=limit)
 | 
					    return JsonlCorpus(path, min_length=min_length, max_length=max_length, limit=limit)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -69,7 +64,7 @@ def create_plain_text_reader(
 | 
				
			||||||
    path: Optional[Path],
 | 
					    path: Optional[Path],
 | 
				
			||||||
    min_length: int = 0,
 | 
					    min_length: int = 0,
 | 
				
			||||||
    max_length: int = 0,
 | 
					    max_length: int = 0,
 | 
				
			||||||
) -> ReaderProtocol:
 | 
					) -> Callable[["Language"], Iterable[Example]]:
 | 
				
			||||||
    """Iterate Example objects from a file or directory of plain text
 | 
					    """Iterate Example objects from a file or directory of plain text
 | 
				
			||||||
    UTF-8 files with one line per doc.
 | 
					    UTF-8 files with one line per doc.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -150,7 +145,7 @@ class Corpus:
 | 
				
			||||||
        self.augmenter = augmenter if augmenter is not None else dont_augment
 | 
					        self.augmenter = augmenter if augmenter is not None else dont_augment
 | 
				
			||||||
        self.shuffle = shuffle
 | 
					        self.shuffle = shuffle
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __call__(self, nlp: "Language") -> Iterable[Example]:
 | 
					    def __call__(self, nlp: "Language") -> Iterator[Example]:
 | 
				
			||||||
        """Yield examples from the data.
 | 
					        """Yield examples from the data.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        nlp (Language): The current nlp object.
 | 
					        nlp (Language): The current nlp object.
 | 
				
			||||||
| 
						 | 
					@ -188,7 +183,7 @@ class Corpus:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def make_examples(
 | 
					    def make_examples(
 | 
				
			||||||
        self, nlp: "Language", reference_docs: Iterable[Doc]
 | 
					        self, nlp: "Language", reference_docs: Iterable[Doc]
 | 
				
			||||||
    ) -> Iterable[Example]:
 | 
					    ) -> Iterator[Example]:
 | 
				
			||||||
        for reference in reference_docs:
 | 
					        for reference in reference_docs:
 | 
				
			||||||
            if len(reference) == 0:
 | 
					            if len(reference) == 0:
 | 
				
			||||||
                continue
 | 
					                continue
 | 
				
			||||||
| 
						 | 
					@ -203,7 +198,7 @@ class Corpus:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def make_examples_gold_preproc(
 | 
					    def make_examples_gold_preproc(
 | 
				
			||||||
        self, nlp: "Language", reference_docs: Iterable[Doc]
 | 
					        self, nlp: "Language", reference_docs: Iterable[Doc]
 | 
				
			||||||
    ) -> Iterable[Example]:
 | 
					    ) -> Iterator[Example]:
 | 
				
			||||||
        for reference in reference_docs:
 | 
					        for reference in reference_docs:
 | 
				
			||||||
            if reference.has_annotation("SENT_START"):
 | 
					            if reference.has_annotation("SENT_START"):
 | 
				
			||||||
                ref_sents = [sent.as_doc() for sent in reference.sents]
 | 
					                ref_sents = [sent.as_doc() for sent in reference.sents]
 | 
				
			||||||
| 
						 | 
					@ -263,7 +258,7 @@ class JsonlCorpus:
 | 
				
			||||||
        self.max_length = max_length
 | 
					        self.max_length = max_length
 | 
				
			||||||
        self.limit = limit
 | 
					        self.limit = limit
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __call__(self, nlp: "Language") -> Iterable[Example]:
 | 
					    def __call__(self, nlp: "Language") -> Iterator[Example]:
 | 
				
			||||||
        """Yield examples from the data.
 | 
					        """Yield examples from the data.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        nlp (Language): The current nlp object.
 | 
					        nlp (Language): The current nlp object.
 | 
				
			||||||
| 
						 | 
					@ -313,7 +308,7 @@ class PlainTextCorpus:
 | 
				
			||||||
        self.min_length = min_length
 | 
					        self.min_length = min_length
 | 
				
			||||||
        self.max_length = max_length
 | 
					        self.max_length = max_length
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __call__(self, nlp: "Language") -> Iterable[Example]:
 | 
					    def __call__(self, nlp: "Language") -> Iterator[Example]:
 | 
				
			||||||
        """Yield examples from the data.
 | 
					        """Yield examples from the data.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        nlp (Language): The current nlp object.
 | 
					        nlp (Language): The current nlp object.
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user