mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	Make corpus paths default to None and improve errors
This commit is contained in:
		
							parent
							
								
									0250bcf6a3
								
							
						
					
					
						commit
						1aeef3bfbb
					
				| 
						 | 
					@ -4,8 +4,8 @@ can help generate the best possible configuration, given a user's requirements.
 | 
				
			||||||
{%- set use_transformer = (transformer_data and hardware != "cpu") -%}
 | 
					{%- set use_transformer = (transformer_data and hardware != "cpu") -%}
 | 
				
			||||||
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
 | 
					{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
 | 
				
			||||||
[paths]
 | 
					[paths]
 | 
				
			||||||
train = ""
 | 
					train = null
 | 
				
			||||||
dev = ""
 | 
					dev = null
 | 
				
			||||||
 | 
					
 | 
				
			||||||
[system]
 | 
					[system]
 | 
				
			||||||
{% if use_transformer -%}
 | 
					{% if use_transformer -%}
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -66,7 +66,7 @@ def init_pipeline(
 | 
				
			||||||
            nlp.to_disk(init_path)
 | 
					            nlp.to_disk(init_path)
 | 
				
			||||||
            msg.good(f"Saved initialized pipeline to {init_path}")
 | 
					            msg.good(f"Saved initialized pipeline to {init_path}")
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            nlp = util.load_model(init_path)
 | 
					            nlp = util.load_model(init_path).from_config(config)
 | 
				
			||||||
            if must_reinitialize(config, nlp.config):
 | 
					            if must_reinitialize(config, nlp.config):
 | 
				
			||||||
                msg.warn("Config has changed: need to re-initialize pipeline")
 | 
					                msg.warn("Config has changed: need to re-initialize pipeline")
 | 
				
			||||||
                nlp = init_nlp(config, **init_kwargs)
 | 
					                nlp = init_nlp(config, **init_kwargs)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,6 +1,6 @@
 | 
				
			||||||
[paths]
 | 
					[paths]
 | 
				
			||||||
train = ""
 | 
					train = null
 | 
				
			||||||
dev = ""
 | 
					dev = null
 | 
				
			||||||
vectors = null
 | 
					vectors = null
 | 
				
			||||||
vocab_data = null
 | 
					vocab_data = null
 | 
				
			||||||
init_tok2vec = null
 | 
					init_tok2vec = null
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -477,6 +477,8 @@ class Errors:
 | 
				
			||||||
    E201 = ("Span index out of range.")
 | 
					    E201 = ("Span index out of range.")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # TODO: fix numbering after merging develop into master
 | 
					    # TODO: fix numbering after merging develop into master
 | 
				
			||||||
 | 
					    E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
 | 
				
			||||||
 | 
					            "config.cfg or override it on the CLI?")
 | 
				
			||||||
    E914 = ("Executing {name} callback failed. Expected the function to "
 | 
					    E914 = ("Executing {name} callback failed. Expected the function to "
 | 
				
			||||||
            "return the nlp object but got: {value}. Maybe you forgot to return "
 | 
					            "return the nlp object but got: {value}. Maybe you forgot to return "
 | 
				
			||||||
            "the modified object in your function?")
 | 
					            "the modified object in your function?")
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -14,8 +14,8 @@ from ..util import make_tempdir
 | 
				
			||||||
 | 
					
 | 
				
			||||||
nlp_config_string = """
 | 
					nlp_config_string = """
 | 
				
			||||||
[paths]
 | 
					[paths]
 | 
				
			||||||
train = ""
 | 
					train = null
 | 
				
			||||||
dev = ""
 | 
					dev = null
 | 
				
			||||||
 | 
					
 | 
				
			||||||
[corpora]
 | 
					[corpora]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -309,7 +309,7 @@ def test_config_interpolation():
 | 
				
			||||||
    config = Config().from_str(nlp_config_string, interpolate=False)
 | 
					    config = Config().from_str(nlp_config_string, interpolate=False)
 | 
				
			||||||
    assert config["corpora"]["train"]["path"] == "${paths.train}"
 | 
					    assert config["corpora"]["train"]["path"] == "${paths.train}"
 | 
				
			||||||
    interpolated = config.interpolate()
 | 
					    interpolated = config.interpolate()
 | 
				
			||||||
    assert interpolated["corpora"]["train"]["path"] == ""
 | 
					    assert interpolated["corpora"]["train"]["path"] is None
 | 
				
			||||||
    nlp = English.from_config(config)
 | 
					    nlp = English.from_config(config)
 | 
				
			||||||
    assert nlp.config["corpora"]["train"]["path"] == "${paths.train}"
 | 
					    assert nlp.config["corpora"]["train"]["path"] == "${paths.train}"
 | 
				
			||||||
    # Ensure that variables are preserved in nlp config
 | 
					    # Ensure that variables are preserved in nlp config
 | 
				
			||||||
| 
						 | 
					@ -317,10 +317,10 @@ def test_config_interpolation():
 | 
				
			||||||
    assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
 | 
					    assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
 | 
				
			||||||
    assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
 | 
					    assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
 | 
				
			||||||
    interpolated2 = nlp.config.interpolate()
 | 
					    interpolated2 = nlp.config.interpolate()
 | 
				
			||||||
    assert interpolated2["corpora"]["train"]["path"] == ""
 | 
					    assert interpolated2["corpora"]["train"]["path"] is None
 | 
				
			||||||
    assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
 | 
					    assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
 | 
				
			||||||
    nlp2 = English.from_config(interpolated)
 | 
					    nlp2 = English.from_config(interpolated)
 | 
				
			||||||
    assert nlp2.config["corpora"]["train"]["path"] == ""
 | 
					    assert nlp2.config["corpora"]["train"]["path"] is None
 | 
				
			||||||
    assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
 | 
					    assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -7,7 +7,7 @@ import srsly
 | 
				
			||||||
from .. import util
 | 
					from .. import util
 | 
				
			||||||
from .augment import dont_augment
 | 
					from .augment import dont_augment
 | 
				
			||||||
from .example import Example
 | 
					from .example import Example
 | 
				
			||||||
from ..errors import Warnings
 | 
					from ..errors import Warnings, Errors
 | 
				
			||||||
from ..tokens import DocBin, Doc
 | 
					from ..tokens import DocBin, Doc
 | 
				
			||||||
from ..vocab import Vocab
 | 
					from ..vocab import Vocab
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -20,12 +20,14 @@ FILE_TYPE = ".spacy"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@util.registry.readers("spacy.Corpus.v1")
 | 
					@util.registry.readers("spacy.Corpus.v1")
 | 
				
			||||||
def create_docbin_reader(
 | 
					def create_docbin_reader(
 | 
				
			||||||
    path: Path,
 | 
					    path: Optional[Path],
 | 
				
			||||||
    gold_preproc: bool,
 | 
					    gold_preproc: bool,
 | 
				
			||||||
    max_length: int = 0,
 | 
					    max_length: int = 0,
 | 
				
			||||||
    limit: int = 0,
 | 
					    limit: int = 0,
 | 
				
			||||||
    augmenter: Optional[Callable] = None,
 | 
					    augmenter: Optional[Callable] = None,
 | 
				
			||||||
) -> Callable[["Language"], Iterable[Example]]:
 | 
					) -> Callable[["Language"], Iterable[Example]]:
 | 
				
			||||||
 | 
					    if path is None:
 | 
				
			||||||
 | 
					        raise ValueError(Errors.E913)
 | 
				
			||||||
    return Corpus(
 | 
					    return Corpus(
 | 
				
			||||||
        path,
 | 
					        path,
 | 
				
			||||||
        gold_preproc=gold_preproc,
 | 
					        gold_preproc=gold_preproc,
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user