	Merge branch 'develop' of https://github.com/explosion/spaCy into develop
commit 5bed6fc431
@@ -78,8 +78,7 @@ def read_data(
                 head = int(head) - 1 if head != "0" else id_
                 sent["words"].append(word)
                 sent["tags"].append(tag)
-                sent["morphology"].append(_parse_morph_string(morph))
-                sent["morphology"][-1].add("POS_%s" % pos)
+                sent["morphs"].append(_compile_morph_string(morph, pos))
                 sent["heads"].append(head)
                 sent["deps"].append("ROOT" if dep == "root" else dep)
                 sent["spaces"].append(space_after == "_")
@@ -88,12 +87,12 @@ def read_data(
             if oracle_segments:
                 docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
                 golds.append(sent)
-                assert golds[-1].morphology is not None
+                assert golds[-1]["morphs"] is not None

             sent_annots.append(sent)
             if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:
                 doc, gold = _make_gold(nlp, None, sent_annots)
-                assert gold.morphology is not None
+                assert gold["morphs"] is not None
                 sent_annots = []
                 docs.append(doc)
                 golds.append(gold)
@@ -109,17 +108,10 @@ def read_data(
     return golds_to_gold_data(docs, golds)


-def _parse_morph_string(morph_string):
+def _compile_morph_string(morph_string, pos):
     if morph_string == '_':
-        return set()
-    output = []
-    replacements = {'1': 'one', '2': 'two', '3': 'three'}
-    for feature in morph_string.split('|'):
-        key, value = feature.split('=')
-        value = replacements.get(value, value)
-        value = value.split(',')[0]
-        output.append('%s_%s' % (key, value.lower()))
-    return set(output)
+        return f"POS={pos}"
+    return morph_string + f"|POS={pos}"


 def read_conllu(file_):
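For reference, the rewritten helper stops expanding UD morphology into a set of feature strings and instead returns the raw FEATS string with the POS tag appended. A minimal standalone sketch of the new behavior (the function body is copied from the hunk above; the inputs are illustrative):

def _compile_morph_string(morph_string, pos):
    # "_" means the CoNLL-U FEATS column is empty, so only the POS survives
    if morph_string == '_':
        return f"POS={pos}"
    return morph_string + f"|POS={pos}"

print(_compile_morph_string("_", "NOUN"))                     # POS=NOUN
print(_compile_morph_string("Case=Nom|Number=Sing", "NOUN"))  # Case=Nom|Number=Sing|POS=NOUN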
@@ -155,7 +147,7 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
     sent_starts = []
     for sent in sent_annots:
         gold["heads"].extend(len(gold["words"])+head for head in sent["heads"])
-        for field in ["words", "tags", "deps", "morphology", "entities", "spaces"]:
+        for field in ["words", "tags", "deps", "morphs", "entities", "spaces"]:
             gold[field].extend(sent[field])
         sent_starts.append(True)
         sent_starts.extend([False] * (len(sent["words"]) - 1))
@@ -168,7 +160,7 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
     doc = nlp.make_doc(text)
     gold.pop("spaces")
     gold["sent_starts"] = sent_starts
-    for i in range(len(gold.heads)):
+    for i in range(len(gold["heads"])):
         if random.random() < drop_deps:
             gold["heads"][i] = None
             gold["labels"][i] = None
@@ -185,7 +177,7 @@ def golds_to_gold_data(docs, golds):
     """Get out the training data format used by begin_training"""
     data = []
     for doc, gold in zip(docs, golds):
-        example = Example.from_dict(doc, gold)
+        example = Example.from_dict(doc, dict(gold))
         data.append(example)
     return data

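Note the dict(gold) cast introduced above: the per-document annotations are accumulated in a mapping (in this script presumably a defaultdict of lists), and casting to a plain dict before Example.from_dict avoids handing the constructor a specialized mapping type. A hypothetical sketch of the accumulation pattern:

from collections import defaultdict

gold = defaultdict(list)  # hypothetical stand-in for the `gold` built by _make_gold
gold["words"].extend(["Hello", "world"])
gold["tags"].extend(["UH", "NN"])
annotations = dict(gold)  # plain dict, as passed to Example.from_dict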
@@ -354,8 +346,7 @@ def initialize_pipeline(nlp, examples, config, device):
     if config.multitask_sent:
         nlp.parser.add_multitask_objective("sent_start")
     for eg in examples:
-        gold = eg.gold
-        for tag in gold.tags:
+        for tag in eg.get_aligned("TAG", as_string=True):
             if tag is not None:
                 nlp.tagger.add_label(tag)
     if torch is not None and device != -1:
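The eg.get_aligned("TAG", as_string=True) call replaces the old eg.gold.tags attribute: in the new Example API, gold annotations are read back aligned to the tokenization, and tokens without an aligned gold tag come back as None. A sketch of the label-collection pattern used in this hunk, assuming `examples` is a list of Example objects and the pipeline has a tagger:

# Sketch only; mirrors the loop above rather than adding new behavior
tag_labels = set()
for eg in examples:
    for tag in eg.get_aligned("TAG", as_string=True):
        if tag is not None:  # unaligned tokens yield None
            tag_labels.add(tag)
for tag in sorted(tag_labels):
    nlp.tagger.add_label(tag)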
@@ -489,10 +480,6 @@ def main(
     Token.set_extension("begins_fused", default=False)
     Token.set_extension("inside_fused", default=False)

-    Token.set_extension("get_conllu_lines", method=get_token_conllu)
-    Token.set_extension("begins_fused", default=False)
-    Token.set_extension("inside_fused", default=False)
-
     spacy.util.fix_random_seed()
     lang.zh.Chinese.Defaults.use_jieba = False
     lang.ja.Japanese.Defaults.use_janome = False
@@ -535,10 +522,10 @@ def main(
         else:
             batches = minibatch(examples, size=batch_sizes)
         losses = {}
-        n_train_words = sum(len(eg.doc) for eg in examples)
+        n_train_words = sum(len(eg.predicted) for eg in examples)
         with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
             for batch in batches:
-                pbar.update(sum(len(ex.doc) for ex in batch))
+                pbar.update(sum(len(ex.predicted) for ex in batch))
                 nlp.parser.cfg["beam_update_prob"] = next(beam_prob)
                 nlp.update(
                     batch,

@@ -283,7 +283,7 @@ def initialize_pipeline(nlp, examples, config):
     nlp.parser.moves.add_action(2, "subtok")
     nlp.add_pipe(nlp.create_pipe("tagger"))
     for eg in examples:
-        for tag in eg.gold.tags:
+        for tag in eg.get_aligned("TAG", as_string=True):
             if tag is not None:
                 nlp.tagger.add_label(tag)
     # Replace labels that didn't make the frequency cutoff

@@ -56,7 +56,7 @@ def main(model=None, output_dir=None, n_iter=100):
             print("Add label", ent[2])
             ner.add_label(ent[2])

-    with nlp.select_pipes(enable="ner") and warnings.catch_warnings():
+    with nlp.select_pipes(enable="simple_ner") and warnings.catch_warnings():
         # show warnings for misaligned entity spans once
         warnings.filterwarnings("once", category=UserWarning, module="spacy")


@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc==8.0.0a9",
+    "thinc==8.0.0a11",
     "blis>=0.4.0,<0.5.0"
 ]
 build-backend = "setuptools.build_meta"

@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.0.0.dev10"
+__version__ = "3.0.0.dev12"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

@@ -15,6 +15,8 @@ from .evaluate import evaluate  # noqa: F401
 from .convert import convert  # noqa: F401
 from .init_model import init_model  # noqa: F401
 from .validate import validate  # noqa: F401
+from .project import project_clone, project_assets, project_run  # noqa: F401
+from .project import project_run_all  # noqa: F401


 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True)

@@ -1,4 +1,3 @@
-from typing import Optional
 import typer
 from typer.main import get_command


@@ -102,9 +102,6 @@ def debug_data(
         corpus = Corpus(train_path, dev_path)
         try:
             train_dataset = list(corpus.train_dataset(nlp))
-            train_dataset_unpreprocessed = list(
-                corpus.train_dataset_without_preprocessing(nlp)
-            )
         except ValueError as e:
             loading_train_error_message = f"Training data cannot be loaded: {e}"
         try:
@@ -120,11 +117,9 @@ def debug_data(
     msg.good("Corpus is loadable")

     # Create all gold data here to avoid iterating over the train_dataset constantly
-    gold_train_data = _compile_gold(train_dataset, pipeline, nlp)
-    gold_train_unpreprocessed_data = _compile_gold(
-        train_dataset_unpreprocessed, pipeline
-    )
-    gold_dev_data = _compile_gold(dev_dataset, pipeline, nlp)
+    gold_train_data = _compile_gold(train_dataset, pipeline, nlp, make_proj=True)
+    gold_train_unpreprocessed_data = _compile_gold(train_dataset, pipeline, nlp, make_proj=False)
+    gold_dev_data = _compile_gold(dev_dataset, pipeline, nlp, make_proj=True)

     train_texts = gold_train_data["texts"]
     dev_texts = gold_dev_data["texts"]
@@ -497,7 +492,7 @@ def _load_file(file_path: Path, msg: Printer) -> None:


 def _compile_gold(
-    examples: Sequence[Example], pipeline: List[str], nlp: Language
+    examples: Sequence[Example], pipeline: List[str], nlp: Language, make_proj: bool
 ) -> Dict[str, Any]:
     data = {
         "ner": Counter(),
@@ -517,9 +512,9 @@ def _compile_gold(
         "n_cats_multilabel": 0,
         "texts": set(),
     }
-    for example in examples:
-        gold = example.reference
-        doc = example.predicted
+    for eg in examples:
+        gold = eg.reference
+        doc = eg.predicted
         valid_words = [x for x in gold if x is not None]
         data["words"].update(valid_words)
         data["n_words"] += len(valid_words)
@@ -530,7 +525,7 @@ def _compile_gold(
                 if nlp.vocab.strings[word] not in nlp.vocab.vectors:
                     data["words_missing_vectors"].update([word])
         if "ner" in pipeline:
-            for i, label in enumerate(gold.ner):
+            for i, label in enumerate(eg.get_aligned_ner()):
                 if label is None:
                     continue
                 if label.startswith(("B-", "U-", "L-")) and doc[i].is_space:
@@ -556,16 +551,18 @@ def _compile_gold(
             if list(gold.cats.values()).count(1.0) != 1:
                 data["n_cats_multilabel"] += 1
         if "tagger" in pipeline:
-            data["tags"].update([x for x in gold.tags if x is not None])
+            tags = eg.get_aligned("TAG", as_string=True)
+            data["tags"].update([x for x in tags if x is not None])
         if "parser" in pipeline:
-            data["deps"].update([x for x in gold.labels if x is not None])
-            for i, (dep, head) in enumerate(zip(gold.labels, gold.heads)):
+            aligned_heads, aligned_deps = eg.get_aligned_parse(projectivize=make_proj)
+            data["deps"].update([x for x in aligned_deps if x is not None])
+            for i, (dep, head) in enumerate(zip(aligned_deps, aligned_heads)):
                 if head == i:
                     data["roots"].update([dep])
                     data["n_sents"] += 1
-            if nonproj.is_nonproj_tree(gold.heads):
+            if nonproj.is_nonproj_tree(aligned_heads):
                 data["n_nonproj"] += 1
-            if nonproj.contains_cycle(gold.heads):
+            if nonproj.contains_cycle(aligned_heads):
                 data["n_cycles"] += 1
     return data

@@ -581,7 +578,7 @@ def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
     for eg in data:
         labels = [
             label.split("-")[1]
-            for label in eg.gold.ner
+            for label in eg.get_aligned_ner()
             if label not in ("O", "-", None)
         ]
         if label not in labels:

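Throughout this file, the statistics are now computed from aligned annotations: eg.get_aligned_ner() for entities and eg.get_aligned_parse(projectivize=make_proj) for the dependency parse, the latter returning heads and labels as parallel lists. A sketch of how the parse values are consumed, following the calls in the hunks above (`nonproj` is the module this file already imports):

# Sketch, assuming `eg` is an Example with gold heads/deps set
aligned_heads, aligned_deps = eg.get_aligned_parse(projectivize=True)
roots = [dep for i, (dep, head) in enumerate(zip(aligned_deps, aligned_heads)) if head == i]
if nonproj.is_nonproj_tree(aligned_heads):
    pass  # with projectivize=True this branch should not be hit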
@@ -1,7 +1,9 @@
-from typing import Optional, List
+from typing import Optional, List, Dict
 from timeit import default_timer as timer
 from wasabi import Printer
 from pathlib import Path
+import re
+import srsly

 from ..gold import Corpus
 from ..tokens import Doc
@@ -16,12 +18,11 @@ def evaluate_cli(
     # fmt: off
     model: str = Arg(..., help="Model name or path"),
     data_path: Path = Arg(..., help="Location of JSON-formatted evaluation data", exists=True),
+    output: Optional[Path] = Opt(None, "--output", "-o", help="Output JSON file for metrics", dir_okay=False),
     gpu_id: int = Opt(-1, "--gpu-id", "-g", help="Use GPU"),
     gold_preproc: bool = Opt(False, "--gold-preproc", "-G", help="Use gold preprocessing"),
     displacy_path: Optional[Path] = Opt(None, "--displacy-path", "-dp", help="Directory to output rendered parses as HTML", exists=True, file_okay=False),
     displacy_limit: int = Opt(25, "--displacy-limit", "-dl", help="Limit of parses to render as HTML"),
-    return_scores: bool = Opt(False, "--return-scores", "-R", help="Return dict containing model scores"),
-
     # fmt: on
 ):
     """
@@ -31,24 +32,24 @@ def evaluate_cli(
     evaluate(
         model,
         data_path,
+        output=output,
         gpu_id=gpu_id,
         gold_preproc=gold_preproc,
         displacy_path=displacy_path,
         displacy_limit=displacy_limit,
         silent=False,
-        return_scores=return_scores,
     )


 def evaluate(
     model: str,
     data_path: Path,
+    output: Optional[Path],
     gpu_id: int = -1,
     gold_preproc: bool = False,
     displacy_path: Optional[Path] = None,
     displacy_limit: int = 25,
     silent: bool = True,
-    return_scores: bool = False,
 ) -> Scorer:
     msg = Printer(no_print=silent, pretty=not silent)
     util.fix_random_seed()
@@ -56,21 +57,19 @@ def evaluate(
         util.use_gpu(gpu_id)
     util.set_env_log(False)
     data_path = util.ensure_path(data_path)
+    output_path = util.ensure_path(output)
     displacy_path = util.ensure_path(displacy_path)
     if not data_path.exists():
         msg.fail("Evaluation data not found", data_path, exits=1)
     if displacy_path and not displacy_path.exists():
         msg.fail("Visualization output directory not found", displacy_path, exits=1)
     corpus = Corpus(data_path, data_path)
-    if model.startswith("blank:"):
-        nlp = util.get_lang_class(model.replace("blank:", ""))()
-    else:
     nlp = util.load_model(model)
     dev_dataset = list(corpus.dev_dataset(nlp, gold_preproc=gold_preproc))
     begin = timer()
     scorer = nlp.evaluate(dev_dataset, verbose=False)
     end = timer()
-    nwords = sum(len(ex.doc) for ex in dev_dataset)
+    nwords = sum(len(ex.predicted) for ex in dev_dataset)
     results = {
         "Time": f"{end - begin:.2f} s",
         "Words": nwords,
@@ -90,10 +89,22 @@ def evaluate(
         "Sent R": f"{scorer.sent_r:.2f}",
         "Sent F": f"{scorer.sent_f:.2f}",
     }
+    data = {re.sub(r"[\s/]", "_", k.lower()): v for k, v in results.items()}
+
     msg.table(results, title="Results")
+
+    if scorer.ents_per_type:
+        data["ents_per_type"] = scorer.ents_per_type
+        print_ents_per_type(msg, scorer.ents_per_type)
+    if scorer.textcats_f_per_cat:
+        data["textcats_f_per_cat"] = scorer.textcats_f_per_cat
+        print_textcats_f_per_cat(msg, scorer.textcats_f_per_cat)
+    if scorer.textcats_auc_per_cat:
+        data["textcats_auc_per_cat"] = scorer.textcats_auc_per_cat
+        print_textcats_auc_per_cat(msg, scorer.textcats_auc_per_cat)
+
     if displacy_path:
-        docs = [ex.doc for ex in dev_dataset]
+        docs = [ex.predicted for ex in dev_dataset]
         render_deps = "parser" in nlp.meta.get("pipeline", [])
         render_ents = "ner" in nlp.meta.get("pipeline", [])
         render_parses(
@@ -105,8 +116,11 @@ def evaluate(
             ents=render_ents,
         )
         msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
-    if return_scores:
-        return scorer.scores
+
+    if output_path is not None:
+        srsly.write_json(output_path, data)
+        msg.good(f"Saved results to {output_path}")
+    return data


 def render_parses(
@@ -128,3 +142,40 @@ def render_parses(
         )
         with (output_path / "parses.html").open("w", encoding="utf8") as file_:
             file_.write(html)
+
+
+def print_ents_per_type(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None:
+    data = [
+        (k, f"{v['p']:.2f}", f"{v['r']:.2f}", f"{v['f']:.2f}")
+        for k, v in scores.items()
+    ]
+    msg.table(
+        data,
+        header=("", "P", "R", "F"),
+        aligns=("l", "r", "r", "r"),
+        title="NER (per type)",
+    )
+
+
+def print_textcats_f_per_cat(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None:
+    data = [
+        (k, f"{v['p']:.2f}", f"{v['r']:.2f}", f"{v['f']:.2f}")
+        for k, v in scores.items()
+    ]
+    msg.table(
+        data,
+        header=("", "P", "R", "F"),
+        aligns=("l", "r", "r", "r"),
+        title="Textcat F (per type)",
+    )
+
+
+def print_textcats_auc_per_cat(
+    msg: Printer, scores: Dict[str, Dict[str, float]]
+) -> None:
+    msg.table(
+        [(k, f"{v['roc_auc_score']:.2f}") for k, v in scores.items()],
+        header=("", "ROC AUC"),
+        aligns=("l", "r"),
+        title="Textcat ROC AUC (per label)",
+    )

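With --return-scores removed, evaluate now always returns a plain dict whose keys are normalized by the re.sub above ("Sent F" becomes "sent_f", a key like "Words/s" would become "words_s"), optionally writing the same dict to disk via srsly. A hedged usage sketch, assuming a trained model directory and dev data in the expected JSON format:

python -m spacy evaluate ./my-model ./dev-data.json --output ./metrics.json

When the scorer produced per-type breakdowns, the JSON also contains ents_per_type, textcats_f_per_cat and textcats_auc_per_cat alongside the summary metrics.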
@@ -16,8 +16,9 @@ def package_cli(
     # fmt: off
     input_dir: Path = Arg(..., help="Directory with model data", exists=True, file_okay=False),
     output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
-    meta_path: Optional[Path] = Opt(None, "--meta-path", "-m", help="Path to meta.json", exists=True, dir_okay=False),
+    meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
     create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"),
+    version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"),
     force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing model in output directory"),
     # fmt: on
 ):
@@ -32,6 +33,7 @@ def package_cli(
         input_dir,
         output_dir,
         meta_path=meta_path,
+        version=version,
         create_meta=create_meta,
         force=force,
         silent=False,
@@ -42,6 +44,7 @@ def package(
     input_dir: Path,
     output_dir: Path,
     meta_path: Optional[Path] = None,
+    version: Optional[str] = None,
     create_meta: bool = False,
     force: bool = False,
     silent: bool = True,
@@ -61,10 +64,13 @@ def package(
     if not meta_path.exists() or not meta_path.is_file():
         msg.fail("Can't load model meta.json", meta_path, exits=1)
     meta = srsly.read_json(meta_path)
+    meta = get_meta(input_dir, meta)
+    if version is not None:
+        meta["version"] = version
     if not create_meta:  # only print if user doesn't want to overwrite
         msg.good("Loaded meta.json from file", meta_path)
     else:
-        meta = generate_meta(input_dir, meta, msg)
+        meta = generate_meta(meta, msg)
     errors = validate(ModelMetaSchema, meta)
     if errors:
         msg.fail("Invalid model meta.json", "\n".join(errors), exits=1)
@@ -101,20 +107,20 @@ def create_file(file_path: Path, contents: str) -> None:
     file_path.open("w", encoding="utf-8").write(contents)


-def generate_meta(
-    model_path: Union[str, Path], existing_meta: Dict[str, Any], msg: Printer
+def get_meta(
+    model_path: Union[str, Path], existing_meta: Dict[str, Any]
 ) -> Dict[str, Any]:
-    meta = existing_meta or {}
-    settings = [
-        ("lang", "Model language", meta.get("lang", "en")),
-        ("name", "Model name", meta.get("name", "model")),
-        ("version", "Model version", meta.get("version", "0.0.0")),
-        ("description", "Model description", meta.get("description", False)),
-        ("author", "Author", meta.get("author", False)),
-        ("email", "Author email", meta.get("email", False)),
-        ("url", "Author website", meta.get("url", False)),
-        ("license", "License", meta.get("license", "MIT")),
-    ]
+    meta = {
+        "lang": "en",
+        "name": "model",
+        "version": "0.0.0",
+        "description": None,
+        "author": None,
+        "email": None,
+        "url": None,
+        "license": "MIT",
+    }
+    meta.update(existing_meta)
     nlp = util.load_model_from_path(Path(model_path))
     meta["spacy_version"] = util.get_model_version_range(about.__version__)
     meta["pipeline"] = nlp.pipe_names
@@ -124,6 +130,23 @@ def generate_meta(
         "keys": nlp.vocab.vectors.n_keys,
         "name": nlp.vocab.vectors.name,
     }
+    if about.__title__ != "spacy":
+        meta["parent_package"] = about.__title__
+    return meta
+
+
+def generate_meta(existing_meta: Dict[str, Any], msg: Printer) -> Dict[str, Any]:
+    meta = existing_meta or {}
+    settings = [
+        ("lang", "Model language", meta.get("lang", "en")),
+        ("name", "Model name", meta.get("name", "model")),
+        ("version", "Model version", meta.get("version", "0.0.0")),
+        ("description", "Model description", meta.get("description", None)),
+        ("author", "Author", meta.get("author", None)),
+        ("email", "Author email", meta.get("email", None)),
+        ("url", "Author website", meta.get("url", None)),
+        ("license", "License", meta.get("license", "MIT")),
+    ]
     msg.divider("Generating meta.json")
     msg.text(
         "Enter the package settings for your model. The following information "
@@ -132,8 +155,6 @@ def generate_meta(
     for setting, desc, default in settings:
         response = get_raw_input(desc, default)
         meta[setting] = default if response == "" and default else response
-    if about.__title__ != "spacy":
-        meta["parent_package"] = about.__title__
     return meta

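The meta handling is now split in two: get_meta merges the existing meta.json over a dict of defaults and attaches the model-derived fields, while the interactive prompting stays in generate_meta, which no longer needs the model path. A small sketch of just the default-merge step (the sample values are illustrative, and the model-derived fields are omitted):

defaults = {"lang": "en", "name": "model", "version": "0.0.0", "license": "MIT"}
existing_meta = {"name": "my_model", "version": "0.1.0"}  # e.g. loaded from meta.json
meta = dict(defaults)
meta.update(existing_meta)  # values from meta.json win over the defaults
assert meta["name"] == "my_model" and meta["lang"] == "en"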
@@ -184,12 +205,12 @@ def setup_package():

     setup(
         name=model_name,
-        description=meta['description'],
-        author=meta['author'],
-        author_email=meta['email'],
-        url=meta['url'],
+        description=meta.get('description'),
+        author=meta.get('author'),
+        author_email=meta.get('email'),
+        url=meta.get('url'),
         version=meta['version'],
-        license=meta['license'],
+        license=meta.get('license'),
         packages=[model_name],
         package_data={model_name: list_files(model_dir)},
         install_requires=list_requirements(meta),

spacy/cli/project.py (new file, 679 lines)
@@ -0,0 +1,679 @@
+from typing import List, Dict, Any, Optional, Sequence
+import typer
+import srsly
+from pathlib import Path
+from wasabi import msg
+import subprocess
+import shlex
+import os
+import re
+import shutil
+import sys
+import requests
+import tqdm
+
+from ._app import app, Arg, Opt, COMMAND, NAME
+from .. import about
+from ..schemas import ProjectConfigSchema, validate
+from ..util import ensure_path, run_command, make_tempdir, working_dir
+from ..util import get_hash, get_checksum
+
+
+CONFIG_FILE = "project.yml"
+DVC_CONFIG = "dvc.yaml"
+DIRS = [
+    "assets",
+    "metas",
+    "configs",
+    "packages",
+    "metrics",
+    "scripts",
+    "notebooks",
+    "training",
+    "corpus",
+]
+CACHES = [
+    Path.home() / ".torch",
+    Path.home() / ".caches" / "torch",
+    os.environ.get("TORCH_HOME"),
+    Path.home() / ".keras",
+]
+DVC_CONFIG_COMMENT = """# This file is auto-generated by spaCy based on your project.yml. Do not edit
+# it directly and edit the project.yml instead and re-run the project."""
+CLI_HELP = f"""Command-line interface for spaCy projects and working with project
+templates. You'd typically start by cloning a project template to a local
+directory and fetching its assets like datasets etc. See the project's
+{CONFIG_FILE} for the available commands. Under the hood, spaCy uses DVC (Data
+Version Control) to manage input and output files and to ensure steps are only
+re-run if their inputs change.
+"""
+
+project_cli = typer.Typer(help=CLI_HELP)
+
+
+@project_cli.callback(invoke_without_command=True)
+def callback(ctx: typer.Context):
+    """This runs before every project command and ensures DVC is installed."""
+    ensure_dvc()
+
+
+################
+# CLI COMMANDS #
+################
+
+
+@project_cli.command("clone")
+def project_clone_cli(
+    # fmt: off
+    name: str = Arg(..., help="The name of the template to fetch"),
+    dest: Path = Arg(Path.cwd(), help="Where to download and work. Defaults to current working directory.", exists=False),
+    repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to look in."),
+    git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
+    no_init: bool = Opt(False, "--no-init", "-NI", help="Don't initialize the project with DVC"),
+    # fmt: on
+):
+    """Clone a project template from a repository. Calls into "git" and will
+    only download the files from the given subdirectory. The GitHub repo
+    defaults to the official spaCy template repo, but can be customized
+    (including using a private repo). Setting the --git flag will also
+    initialize the project directory as a Git repo. If the project is intended
+    to be a Git repo, it should be initialized with Git first, before
+    initializing DVC (Data Version Control). This allows DVC to integrate with
+    Git.
+    """
+    project_clone(name, dest, repo=repo, git=git, no_init=no_init)
+
+
+@project_cli.command("init")
+def project_init_cli(
+    path: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False),
+    git: bool = Opt(False, "--git", "-G", help="Initialize project as a Git repo"),
+):
+    """Initialize a project directory with DVC and optionally Git. This should
+    typically be taken care of automatically when you run the "project clone"
+    command, but you can also run it separately. If the project is intended to
+    be a Git repo, it should be initialized with Git first, before initializing
+    DVC. This allows DVC to integrate with Git.
+    """
+    project_init(path, git=git, silent=True)
+
+
+@project_cli.command("assets")
+def project_assets_cli(
+    # fmt: off
+    project_dir: Path = Arg(..., help="Path to cloned project", exists=True, file_okay=False),
+    # fmt: on
+):
+    """Use DVC (Data Version Control) to fetch project assets. Assets are
+    defined in the "assets" section of the project config. If possible, DVC
+    will try to track the files so you can pull changes from upstream. It will
+    also try and store the checksum so the assets are versioned. If the file
+    can't be tracked or checked, it will be downloaded without DVC. If a checksum
+    is provided in the project config, the file is only downloaded if no local
+    file with the same checksum exists.
+    """
+    project_assets(project_dir)
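Taken together, the commands registered above give a clone → assets → run workflow on top of DVC. A hedged usage sketch (the template name is a placeholder for a subdirectory of the templates repo, about.__projects__ by default):

python -m spacy project clone some_template ./my_project
python -m spacy project assets ./my_project
python -m spacy project run-all ./my_project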
+
+
+@project_cli.command(
+    "run-all",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+def project_run_all_cli(
+    # fmt: off
+    ctx: typer.Context,
+    project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
+    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
+    # fmt: on
+):
+    """Run all commands defined in the project. This command will use DVC and
+    the defined outputs and dependencies in the project config to determine
+    which steps need to be re-run and where to start. This means you're only
+    re-generating data if the inputs have changed.
+
+    This command calls into "dvc repro" and all additional arguments are passed
+    to the "dvc repro" command: https://dvc.org/doc/command-reference/repro
+    """
+    if show_help:
+        print_run_help(project_dir)
+    else:
+        project_run_all(project_dir, *ctx.args)
+
+
+@project_cli.command(
+    "run", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+def project_run_cli(
+    # fmt: off
+    ctx: typer.Context,
+    project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
+    subcommand: str = Arg(None, help="Name of command defined in project config"),
+    show_help: bool = Opt(False, "--help", help="Show help message and available subcommands")
+    # fmt: on
+):
+    """Run a named script defined in the project config. If the command is
+    part of the default pipeline defined in the "run" section, DVC is used to
+    determine whether the step should re-run if its inputs have changed, or
+    whether everything is up to date. If the script is not part of the default
+    pipeline, it will be called separately without DVC.
+
+    If DVC is used, the command calls into "dvc repro" and all additional
+    arguments are passed to the "dvc repro" command:
+    https://dvc.org/doc/command-reference/repro
+    """
+    if show_help or not subcommand:
+        print_run_help(project_dir, subcommand)
+    else:
+        project_run(project_dir, subcommand, *ctx.args)
+
+
+@project_cli.command("exec", hidden=True)
+def project_exec_cli(
+    # fmt: off
+    project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
+    subcommand: str = Arg(..., help="Name of command defined in project config"),
+    # fmt: on
+):
+    """Execute a command defined in the project config. This CLI command is
+    only called internally in auto-generated DVC pipelines, as a shortcut for
+    multi-step commands in the project config. You typically shouldn't have to
+    call it yourself. To run a command, call "run" or "run-all".
+    """
+    project_exec(project_dir, subcommand)
+
+
+@project_cli.command("update-dvc")
+def project_update_dvc_cli(
+    # fmt: off
+    project_dir: Path = Arg(..., help="Location of project directory", exists=True, file_okay=False),
+    verbose: bool = Opt(False, "--verbose", "-V", help="Print more info"),
+    force: bool = Opt(False, "--force", "-F", help="Force update DVC config"),
+    # fmt: on
+):
+    """Update the auto-generated DVC config file. Uses the steps defined in the
+    "run" section of the project config. This typically happens automatically
+    when running a command, but can also be triggered manually if needed.
+    """
+    config = load_project_config(project_dir)
+    updated = update_dvc_config(project_dir, config, verbose=verbose, force=force)
+    if updated:
+        msg.good(f"Updated DVC config from {CONFIG_FILE}")
+    else:
+        msg.info(f"No changes found in {CONFIG_FILE}, no update needed")
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					app.add_typer(project_cli, name="project")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#################
 | 
				
			||||||
 | 
					# CLI FUNCTIONS #
 | 
				
			||||||
 | 
					#################
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
def project_clone(
    name: str,
    dest: Path,
    *,
    repo: str = about.__projects__,
    git: bool = False,
    no_init: bool = False,
) -> None:
    """Clone a project template from a repository.

    name (str): Name of subdirectory to clone.
    dest (Path): Destination path of cloned project.
    repo (str): URL of Git repo containing project templates.
    git (bool): Initialize project as Git repo. Should be set to True if project
        is intended as a repo, since it will allow DVC to integrate with Git.
    no_init (bool): Don't initialize DVC and Git automatically. If True, the
        "init" command or "git init" and "dvc init" need to be run manually.
    """
    dest = ensure_path(dest)
    check_clone(name, dest, repo)
    project_dir = dest.resolve()
    # We're using Git and sparse checkout to only clone the files we need
    with make_tempdir() as tmp_dir:
        cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
        run_command(shlex.split(cmd))
        with (tmp_dir / ".git" / "info" / "sparse-checkout").open("w") as f:
            f.write(name)
        run_command(["git", "-C", tmp_dir, "fetch"])
        run_command(["git", "-C", tmp_dir, "checkout"])
        shutil.move(str(tmp_dir / Path(name).name), str(project_dir))
    msg.good(f"Cloned project '{name}' from {repo}")
    for sub_dir in DIRS:
        dir_path = project_dir / sub_dir
        if not dir_path.exists():
            dir_path.mkdir(parents=True)
    if not no_init:
        project_init(project_dir, git=git, silent=True)
    msg.good(f"Your project is now ready!", dest)
    print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")

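A standalone sketch of the sparse-checkout trick used above, assuming only Git and the standard library (the repo URL and subdirectory name are placeholders):

    import shlex
    import subprocess
    import tempfile
    from pathlib import Path

    repo = "https://github.com/example/projects"  # placeholder
    name = "some_template"                        # placeholder subdirectory

    with tempfile.TemporaryDirectory() as tmp:
        tmp_dir = Path(tmp)
        cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 --config core.sparseCheckout=true"
        subprocess.run(shlex.split(cmd), check=True)
        # Tell Git which paths to materialize on checkout
        (tmp_dir / ".git" / "info" / "sparse-checkout").write_text(name)
        subprocess.run(["git", "-C", str(tmp_dir), "fetch"], check=True)
        subprocess.run(["git", "-C", str(tmp_dir), "checkout"], check=True)
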
def project_init(
    project_dir: Path,
    *,
    git: bool = False,
    silent: bool = False,
    analytics: bool = False,
):
    """Initialize a project as a DVC and (optionally) as a Git repo.

    project_dir (Path): Path to project directory.
    git (bool): Also call "git init" to initialize directory as a Git repo.
    silent (bool): Don't print any output (via DVC).
    analytics (bool): Opt-in to DVC analytics (defaults to False).
    """
    with working_dir(project_dir):
        init_cmd = ["dvc", "init"]
        if silent:
            init_cmd.append("--quiet")
        if not git:
            init_cmd.append("--no-scm")
        if git:
            run_command(["git", "init"])
        run_command(init_cmd)
        # We don't want to have analytics on by default – our users should
        # opt-in explicitly. If they want it, they can always enable it.
        if not analytics:
            run_command(["dvc", "config", "core.analytics", "false"])
        config = load_project_config(project_dir)
        setup_check_dvc(project_dir, config)

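The non-Git branch above boils down to two DVC invocations; a hedged equivalent using subprocess directly, run from inside the project directory:

    import subprocess

    subprocess.run(["dvc", "init", "--no-scm", "--quiet"], check=True)
    # Analytics stay opt-in, mirroring the default above
    subprocess.run(["dvc", "config", "core.analytics", "false"], check=True)
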
def project_assets(project_dir: Path) -> None:
    """Fetch assets for a project using DVC if possible.

    project_dir (Path): Path to project directory.
    """
    project_path = ensure_path(project_dir)
    config = load_project_config(project_path)
    setup_check_dvc(project_path, config)
    assets = config.get("assets", {})
    if not assets:
        msg.warn(f"No assets specified in {CONFIG_FILE}", exits=0)
    msg.info(f"Fetching {len(assets)} asset(s)")
    variables = config.get("variables", {})
    fetched_assets = []
    for asset in assets:
        url = asset["url"].format(**variables)
        dest = asset["dest"].format(**variables)
        fetched_path = fetch_asset(project_path, url, dest, asset.get("checksum"))
        if fetched_path:
            fetched_assets.append(str(fetched_path))
    if fetched_assets:
        with working_dir(project_path):
            run_command(["dvc", "add", *fetched_assets, "--external"])

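The {variable} placeholders in asset entries are plain str.format substitutions; a minimal illustration with made-up values:

    variables = {"lang": "en", "version": "1.0"}
    asset = {"url": "https://example.com/{lang}-{version}.zip", "dest": "assets/{lang}.zip"}
    url = asset["url"].format(**variables)    # "https://example.com/en-1.0.zip"
    dest = asset["dest"].format(**variables)  # "assets/en.zip"
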
def fetch_asset(
    project_path: Path, url: str, dest: Path, checksum: Optional[str] = None
) -> Optional[Path]:
    """Fetch an asset from a given URL or path. Will try to import the file
    using DVC's import-url if possible (fully tracked and versioned) and falls
    back to get-url (versioned) and a non-DVC download if necessary. If a
    checksum is provided and a local file exists, it's only re-downloaded if the
    checksum doesn't match.

    project_path (Path): Path to project directory.
    url (str): URL or path to asset.
    dest (Path): Destination path of the fetched asset, relative to the
        project directory.
    checksum (Optional[str]): Optional expected checksum of local file.
    RETURNS (Optional[Path]): The path to the fetched asset or None if fetching
        the asset failed.
    """
    url = convert_asset_url(url)
    dest_path = (project_path / dest).resolve()
    if dest_path.exists() and checksum:
        # If there's already a file, check for checksum
        # TODO: add support for caches (dvc import-url with local path)
        if checksum == get_checksum(dest_path):
            msg.good(f"Skipping download with matching checksum: {dest}")
            return dest_path
    with working_dir(project_path):
        try:
            # If these fail, we don't want to output an error or info message.
            # Try with tracking the source first, then just downloading with
            # DVC, then a regular non-DVC download.
            try:
                dvc_cmd = ["dvc", "import-url", url, str(dest_path)]
                print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
            except subprocess.CalledProcessError:
                dvc_cmd = ["dvc", "get-url", url, str(dest_path)]
                print(subprocess.check_output(dvc_cmd, stderr=subprocess.DEVNULL))
        except subprocess.CalledProcessError:
            try:
                download_file(url, dest_path)
            except requests.exceptions.HTTPError as e:
                msg.fail(f"Download failed: {dest}", e)
                return None
    if checksum and checksum != get_checksum(dest_path):
        msg.warn(f"Checksum doesn't match value defined in {CONFIG_FILE}: {dest}")
    msg.good(f"Fetched asset {dest}")
    return dest_path

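get_checksum is imported from elsewhere and not shown in this diff. A plausible stand-in, offered purely as an assumption about its behavior:

    import hashlib
    from pathlib import Path

    def get_checksum_sketch(path: Path) -> str:
        # Assumption: a hex digest of the file contents; the real helper
        # may use a different algorithm or chunked reading.
        return hashlib.md5(path.read_bytes()).hexdigest()
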
def project_run_all(project_dir: Path, *dvc_args) -> None:
    """Run all commands defined in the project using DVC.

    project_dir (Path): Path to project directory.
    *dvc_args: Other arguments passed to "dvc repro".
    """
    config = load_project_config(project_dir)
    setup_check_dvc(project_dir, config)
    dvc_cmd = ["dvc", "repro", *dvc_args]
    with working_dir(project_dir):
        run_command(dvc_cmd)

def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
    """Simulate a CLI help prompt using the info available in the project config.

    project_dir (Path): The project directory.
    subcommand (Optional[str]): The subcommand or None. If a subcommand is
        provided, the subcommand help is shown. Otherwise, the top-level help
        and a list of available commands is printed.
    """
    config = load_project_config(project_dir)
    setup_check_dvc(project_dir, config)
    config_commands = config.get("commands", [])
    commands = {cmd["name"]: cmd for cmd in config_commands}
    if subcommand:
        validate_subcommand(commands.keys(), subcommand)
        print(f"Usage: {COMMAND} project run {project_dir} {subcommand}")
        help_text = commands[subcommand].get("help")
        if help_text:
            msg.text(f"\n{help_text}\n")
    else:
        print(f"\nAvailable commands in {CONFIG_FILE}")
        print(f"Usage: {COMMAND} project run {project_dir} [COMMAND]")
        msg.table([(cmd["name"], cmd.get("help", "")) for cmd in config_commands])
        msg.text("Run all commands defined in the 'run' block of the project config:")
        print(f"{COMMAND} project run-all {project_dir}")

def project_run(project_dir: Path, subcommand: str, *dvc_args) -> None:
    """Run a named script defined in the project config. If the script is part
    of the default pipeline (defined in the "run" section), DVC is used to
    execute the command, so it can determine whether to rerun it. It then
    calls into "exec" to execute it.

    project_dir (Path): Path to project directory.
    subcommand (str): Name of command to run.
    *dvc_args: Other arguments passed to "dvc repro".
    """
    config = load_project_config(project_dir)
    setup_check_dvc(project_dir, config)
    config_commands = config.get("commands", [])
    variables = config.get("variables", {})
    commands = {cmd["name"]: cmd for cmd in config_commands}
    validate_subcommand(commands.keys(), subcommand)
    if subcommand in config.get("run", []):
        # This is one of the pipeline commands tracked in DVC
        dvc_cmd = ["dvc", "repro", subcommand, *dvc_args]
        with working_dir(project_dir):
            run_command(dvc_cmd)
    else:
        cmd = commands[subcommand]
        # Deps in non-DVC commands aren't tracked, but if they're defined,
        # make sure they exist before running the command
        for dep in cmd.get("deps", []):
            if not (project_dir / dep).exists():
                err = f"Missing dependency specified by command '{subcommand}': {dep}"
                msg.fail(err, exits=1)
        with working_dir(project_dir):
            run_commands(cmd["script"], variables)

def project_exec(project_dir: Path, subcommand: str):
    """Execute a command defined in the project config.

    project_dir (Path): Path to project directory.
    subcommand (str): Name of command to run.
    """
    config = load_project_config(project_dir)
    config_commands = config.get("commands", [])
    variables = config.get("variables", {})
    commands = {cmd["name"]: cmd for cmd in config_commands}
    with working_dir(project_dir):
        run_commands(commands[subcommand]["script"], variables)


###########
# HELPERS #
###########

def load_project_config(path: Path) -> Dict[str, Any]:
    """Load the project config file from a directory and validate it.

    path (Path): The path to the project directory.
    RETURNS (Dict[str, Any]): The loaded project config.
    """
    config_path = path / CONFIG_FILE
    if not config_path.exists():
        msg.fail("Can't find project config", config_path, exits=1)
    invalid_err = f"Invalid project config in {CONFIG_FILE}"
    try:
        config = srsly.read_yaml(config_path)
    except ValueError as e:
        msg.fail(invalid_err, e, exits=1)
    errors = validate(ProjectConfigSchema, config)
    if errors:
        msg.fail(invalid_err, "\n".join(errors), exits=1)
    return config

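A minimal sketch of the load-and-validate pattern above, using srsly for the YAML step and a hand-rolled shape check in place of ProjectConfigSchema (which is defined elsewhere in the codebase):

    from pathlib import Path
    import srsly

    config_path = Path("project.yml")  # placeholder location
    try:
        config = srsly.read_yaml(config_path)
    except ValueError as e:
        raise SystemExit(f"Invalid YAML in {config_path}: {e}")
    # Stand-in for the schema validation: check the shape this module relies on
    bad = [c for c in config.get("commands", []) if not isinstance(c, dict) or "name" not in c]
    if bad:
        raise SystemExit(f"Malformed command entries: {bad}")
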
def update_dvc_config(
    path: Path,
    config: Dict[str, Any],
    verbose: bool = False,
    silent: bool = False,
    force: bool = False,
) -> bool:
    """Re-run the DVC commands in dry mode and update dvc.yaml file in the
    project directory. The file is auto-generated based on the config. The
    first line of the auto-generated file specifies the hash of the config
    dict, so if any of the config values change, the DVC config is regenerated.

    path (Path): The path to the project directory.
    config (Dict[str, Any]): The loaded project config.
    verbose (bool): Whether to print additional info (via DVC).
    silent (bool): Don't output anything (via DVC).
    force (bool): Force update, even if hashes match.
    RETURNS (bool): Whether the DVC config file was updated.
    """
    config_hash = get_hash(config)
    path = path.resolve()
    dvc_config_path = path / DVC_CONFIG
    if dvc_config_path.exists():
        # Check if the file was generated using the current config, if not, redo
        with dvc_config_path.open("r", encoding="utf8") as f:
            ref_hash = f.readline().strip().replace("# ", "")
        if ref_hash == config_hash and not force:
            return False  # Nothing has changed in project config, don't need to update
        dvc_config_path.unlink()
    variables = config.get("variables", {})
    commands = []
    # We only want to include commands that are part of the main list of "run"
    # commands in project.yml and should be run in sequence
    config_commands = {cmd["name"]: cmd for cmd in config.get("commands", [])}
    for name in config.get("run", []):
        validate_subcommand(config_commands.keys(), name)
        command = config_commands[name]
        deps = command.get("deps", [])
        outputs = command.get("outputs", [])
        outputs_no_cache = command.get("outputs_no_cache", [])
        if not deps and not outputs and not outputs_no_cache:
            continue
        # Default to "." as the project path since dvc.yaml is auto-generated
        # and we don't want arbitrary paths in there
        project_cmd = ["python", "-m", NAME, "project", "exec", ".", name]
        deps_cmd = [c for cl in [["-d", p] for p in deps] for c in cl]
        outputs_cmd = [c for cl in [["-o", p] for p in outputs] for c in cl]
        outputs_nc_cmd = [c for cl in [["-O", p] for p in outputs_no_cache] for c in cl]
        dvc_cmd = ["dvc", "run", "-n", name, "-w", str(path), "--no-exec"]
        if verbose:
            dvc_cmd.append("--verbose")
        if silent:
            dvc_cmd.append("--quiet")
        full_cmd = [*dvc_cmd, *deps_cmd, *outputs_cmd, *outputs_nc_cmd, *project_cmd]
        commands.append(" ".join(full_cmd))
    with working_dir(path):
        run_commands(commands, variables, silent=True)
    with dvc_config_path.open("r+", encoding="utf8") as f:
        content = f.read()
        f.seek(0, 0)
        f.write(f"# {config_hash}\n{DVC_CONFIG_COMMENT}\n{content}")
    return True

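The regeneration check above hinges on one convention: the first line of the auto-generated file stores a hash of the config that produced it. A standalone sketch of the same idea (get_hash_sketch is an assumed stand-in for the imported get_hash):

    import hashlib
    import json
    from pathlib import Path

    def get_hash_sketch(data: dict) -> str:
        # Assumption: any stable digest of the config dict works here
        return hashlib.md5(json.dumps(data, sort_keys=True).encode("utf8")).hexdigest()

    def needs_update(dvc_config_path: Path, config: dict) -> bool:
        if not dvc_config_path.exists():
            return True
        with dvc_config_path.open("r", encoding="utf8") as f:
            ref_hash = f.readline().strip().replace("# ", "")
        return ref_hash != get_hash_sketch(config)
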
def ensure_dvc() -> None:
    """Ensure that the "dvc" command is available and show an error if not."""
    try:
        subprocess.run(["dvc", "--version"], stdout=subprocess.DEVNULL)
    except Exception:
        msg.fail(
            "spaCy projects require DVC (Data Version Control) and the 'dvc' command",
            "You can install the Python package from pip (pip install dvc) or "
            "conda (conda install -c conda-forge dvc). For more details, see the "
            "documentation: https://dvc.org/doc/install",
            exits=1,
        )

def setup_check_dvc(project_dir: Path, config: Dict[str, Any]) -> None:
    """Check that the project is set up correctly with DVC and update its
    config if needed. Will raise an error if the project is not an initialized
    DVC project.

    project_dir (Path): The path to the project directory.
    config (Dict[str, Any]): The loaded project config.
    """
    if not project_dir.exists():
        msg.fail(f"Can't find project directory: {project_dir}")
    if not (project_dir / ".dvc").exists():
        msg.fail(
            "Project not initialized as a DVC project.",
            f"Make sure that the project template was cloned correctly. To "
            f"initialize the project directory manually, you can run: "
            f"{COMMAND} project init {project_dir}",
            exits=1,
        )
    with msg.loading("Updating DVC config..."):
        updated = update_dvc_config(project_dir, config, silent=True)
    if updated:
        msg.good(f"Updated DVC config from changed {CONFIG_FILE}")

def run_commands(
    commands: List[str] = tuple(), variables: Dict[str, str] = {}, silent: bool = False
) -> None:
    """Run a sequence of commands in a subprocess, in order.

    commands (List[str]): The split commands.
    variables (Dict[str, str]): Dictionary of variable names, mapped to their
        values. Will be used to substitute format string variables in the
        commands.
    silent (bool): Don't print the commands.
    """
    for command in commands:
        # Substitute variables, e.g. "./{NAME}.json"
        command = command.format(**variables)
        command = shlex.split(command)
        # TODO: is this needed / a good idea?
        if len(command) and command[0] == "python":
            command[0] = sys.executable
        elif len(command) and command[0] == "pip":
            command = [sys.executable, "-m", "pip", *command[1:]]
        if not silent:
            print(" ".join(command))
        run_command(command)

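A hedged usage sketch of run_commands: variables are substituted via str.format before splitting, and a bare "python" is rewritten to the running interpreter (the script path and variable are placeholders):

    run_commands(
        ["python scripts/train.py --epochs {epochs}"],
        variables={"epochs": "10"},
    )
    # Effectively runs: <sys.executable> scripts/train.py --epochs 10
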
def convert_asset_url(url: str) -> str:
    """Check and convert the asset URL if needed.

    url (str): The asset URL.
    RETURNS (str): The converted URL.
    """
    # If the asset URL is a regular GitHub URL it's likely a mistake
    if re.match(r"(http(s?)):\/\/github.com", url):
        converted = url.replace("github.com", "raw.githubusercontent.com")
        converted = re.sub(r"/(tree|blob)/", "/", converted)
        msg.warn(
            "Downloading from a regular GitHub URL. This will only download "
            "the source of the page, not the actual file. Converting the URL "
            "to a raw URL.",
            converted,
        )
        return converted
    return url

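For example, a "blob" URL copied from the GitHub UI would be rewritten roughly like this (the user/repo path is a placeholder):

    import re

    url = "https://github.com/user/repo/blob/master/data/file.json"
    converted = url.replace("github.com", "raw.githubusercontent.com")
    converted = re.sub(r"/(tree|blob)/", "/", converted)
    # -> "https://raw.githubusercontent.com/user/repo/master/data/file.json"
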
def check_clone(name: str, dest: Path, repo: str) -> None:
    """Check and validate that the destination path can be used to clone. Will
    check that Git is available and that the destination path is suitable.

    name (str): Name of the directory to clone from the repo.
    dest (Path): Local destination of cloned directory.
    repo (str): URL of the repo to clone from.
    """
    try:
        subprocess.run(["git", "--version"], stdout=subprocess.DEVNULL)
    except Exception:
        msg.fail(
            "Cloning spaCy project templates requires Git and the 'git' command.",
            f"To clone a project without Git, copy the files from the '{name}' "
            f"directory in the {repo} to {dest} manually and then run:",
            f"{COMMAND} project init {dest}",
            exits=1,
        )
    if not dest:
        msg.fail(f"Not a valid directory to clone project: {dest}", exits=1)
    if dest.exists():
        # Directory already exists (not allowed, clone needs to create it)
        msg.fail(f"Can't clone project, directory already exists: {dest}", exits=1)
    if not dest.parent.exists():
        # We're not creating parents, parent dir should exist
        msg.fail(
            f"Can't clone project, parent directory doesn't exist: {dest.parent}",
            exits=1,
        )

def validate_subcommand(commands: Sequence[str], subcommand: str) -> None:
    """Check that a subcommand is valid and defined. Raises an error otherwise.

    commands (Sequence[str]): The available commands.
    subcommand (str): The subcommand.
    """
    if subcommand not in commands:
        msg.fail(
            f"Can't find command '{subcommand}' in {CONFIG_FILE}. "
            f"Available commands: {', '.join(commands)}",
            exits=1,
        )

def download_file(url: str, dest: Path, chunk_size: int = 1024) -> None:
    """Download a file using requests.

    url (str): The URL of the file.
    dest (Path): The destination path.
    chunk_size (int): The size of chunks to read/write.
    """
    response = requests.get(url, stream=True)
    response.raise_for_status()
    total = int(response.headers.get("content-length", 0))
    progress_settings = {
        "total": total,
        "unit": "iB",
        "unit_scale": True,
        "unit_divisor": chunk_size,
        "leave": False,
    }
    with dest.open("wb") as f, tqdm.tqdm(**progress_settings) as bar:
        for data in response.iter_content(chunk_size=chunk_size):
            size = f.write(data)
            bar.update(size)

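A hedged usage sketch of download_file (the URL is a placeholder; any direct file link streams the same way, chunk by chunk, with a progress bar):

    from pathlib import Path

    download_file("https://example.com/model.bin", Path("model.bin"))
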
@@ -132,6 +132,7 @@ class Warnings(object):
             "are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.")
 
     # TODO: fix numbering after merging develop into master
+    W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
     W093 = ("Could not find any data to train the {name} on. Is your "
             "input data correctly formatted ?")
     W094 = ("Model '{model}' ({model_version}) specifies an under-constrained "
@@ -154,7 +155,7 @@ class Warnings(object):
             "so a default configuration was used.")
     W099 = ("Expected 'dict' type for the 'model' argument of pipe '{pipe}', "
             "but got '{type}' instead, so ignoring it.")
-    W100 = ("Skipping unsupported morphological feature(s): {feature}. "
+    W100 = ("Skipping unsupported morphological feature(s): '{feature}'. "
             "Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
             "string \"Field1=Value1,Value2|Field2=Value3\".")
 
@@ -182,18 +183,13 @@ class Errors(object):
             "`nlp.select_pipes()`, you should remove them explicitly with "
             "`nlp.remove_pipe()` before the pipeline is restored. Names of "
             "the new components: {names}")
-    E009 = ("The `update` method expects same number of docs and golds, but "
-            "got: {n_docs} docs, {n_golds} golds.")
     E010 = ("Word vectors set to length 0. This may be because you don't have "
             "a model installed or loaded, or because your model doesn't "
             "include word vectors. For more info, see the docs:\n"
             "https://spacy.io/usage/models")
     E011 = ("Unknown operator: '{op}'. Options: {opts}")
     E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
-    E013 = ("Error selecting action in matcher")
     E014 = ("Unknown tag ID: {tag}")
-    E015 = ("Conflicting morphology exception for ({tag}, {orth}). Use "
-            "`force=True` to overwrite.")
     E016 = ("MultitaskObjective target should be function or one of: dep, "
             "tag, ent, dep_tag_offset, ent_tag.")
     E017 = ("Can only add unicode or bytes. Got type: {value_type}")
@@ -201,21 +197,8 @@ class Errors(object):
             "refers to an issue with the `Vocab` or `StringStore`.")
     E019 = ("Can't create transition with unknown action ID: {action}. Action "
             "IDs are enumerated in spacy/syntax/{src}.pyx.")
-    E020 = ("Could not find a gold-standard action to supervise the "
-            "dependency parser. The tree is non-projective (i.e. it has "
-            "crossing arcs - see spacy/syntax/nonproj.pyx for definitions). "
-            "The ArcEager transition system only supports projective trees. "
-            "To learn non-projective representations, transform the data "
-            "before training and after parsing. Either pass "
-            "`make_projective=True` to the GoldParse class, or use "
-            "spacy.syntax.nonproj.preprocess_training_data.")
-    E021 = ("Could not find a gold-standard action to supervise the "
-            "dependency parser. The GoldParse was projective. The transition "
-            "system has {n_actions} actions. State at failure: {state}")
     E022 = ("Could not find a transition with the name '{name}' in the NER "
             "model.")
-    E023 = ("Error cleaning up beam: The same state occurred twice at "
-            "memory address {addr} and position {i}.")
     E024 = ("Could not find an optimal move to supervise the parser. Usually, "
             "this means that the model can't be updated in a way that's valid "
             "and satisfies the correct annotations specified in the GoldParse. "
@@ -259,7 +242,6 @@ class Errors(object):
             "offset {start}.")
     E037 = ("Error calculating span: Can't find a token ending at character "
             "offset {end}.")
-    E038 = ("Error finding sentence for span. Infinite loop detected.")
     E039 = ("Array bounds exceeded while searching for root word. This likely "
             "means the parse tree is in an invalid state. Please report this "
             "issue here: http://github.com/explosion/spaCy/issues")
@@ -290,8 +272,6 @@ class Errors(object):
     E059 = ("One (and only one) keyword arg must be set. Got: {kwargs}")
     E060 = ("Cannot add new key to vectors: the table is full. Current shape: "
             "({rows}, {cols}).")
-    E061 = ("Bad file name: {filename}. Example of a valid file name: "
-            "'vectors.128.f.bin'")
     E062 = ("Cannot find empty bit for new lexical flag. All bits between 0 "
             "and 63 are occupied. You can replace one by specifying the "
             "`flag_id` explicitly, e.g. "
@@ -305,39 +285,17 @@ class Errors(object):
             "Query string: {string}\nOrth cached: {orth}\nOrth ID: {orth_id}")
     E065 = ("Only one of the vector table's width and shape can be specified. "
             "Got width {width} and shape {shape}.")
-    E066 = ("Error creating model helper for extracting columns. Can only "
-            "extract columns by positive integer. Got: {value}.")
     E067 = ("Invalid BILUO tag sequence: Got a tag starting with 'I' (inside "
             "an entity) without a preceding 'B' (beginning of an entity). "
             "Tag sequence:\n{tags}")
     E068 = ("Invalid BILUO tag: '{tag}'.")
-    E069 = ("Invalid gold-standard parse tree. Found cycle between word "
-            "IDs: {cycle} (tokens: {cycle_tokens}) in the document starting "
-            "with tokens: {doc_tokens}.")
-    E070 = ("Invalid gold-standard data. Number of documents ({n_docs}) "
-            "does not align with number of annotations ({n_annots}).")
     E071 = ("Error creating lexeme: specified orth ID ({orth}) does not "
             "match the one in the vocab ({vocab_orth}).")
-    E072 = ("Error serializing lexeme: expected data length {length}, "
-            "got {bad_length}.")
     E073 = ("Cannot assign vector of length {new_length}. Existing vectors "
             "are of length {length}. You can use `vocab.reset_vectors` to "
             "clear the existing vectors and resize the table.")
     E074 = ("Error interpreting compiled match pattern: patterns are expected "
             "to end with the attribute {attr}. Got: {bad_attr}.")
-    E075 = ("Error accepting match: length ({length}) > maximum length "
-            "(128,068).")
-    E076 = ("Error setting tensor on Doc: tensor has {rows} rows, while Doc "
-            "has {words} words.")
-    E077 = ("Error computing {value}: number of Docs ({n_docs}) does not "
-            "equal number of GoldParse objects ({n_golds}) in batch.")
-    E078 = ("Error computing score: number of words in Doc ({words_doc}) does "
-            "not equal number of words in GoldParse ({words_gold}).")
-    E079 = ("Error computing states in beam: number of predicted beams "
-            "({pbeams}) does not equal number of gold beams ({gbeams}).")
-    E080 = ("Duplicate state found in beam: {key}.")
-    E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
-            "does not equal number of losses ({losses}).")
     E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
             "projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
             "match.")
@@ -345,8 +303,6 @@ class Errors(object):
             "`getter` (plus optional `setter`) is allowed. Got: {nr_defined}")
     E084 = ("Error assigning label ID {label} to span: not in StringStore.")
     E085 = ("Can't create lexeme for string '{string}'.")
-    E086 = ("Error deserializing lexeme '{string}': orth ID {orth_id} does "
-            "not match hash {hash_id} in StringStore.")
     E087 = ("Unknown displaCy style: {style}.")
     E088 = ("Text of length {length} exceeds maximum of {max_length}. The "
             "v2.x parser and NER models require roughly 1GB of temporary "
@@ -388,7 +344,6 @@ class Errors(object):
     E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A "
             "token can only be part of one entity, so make sure the entities "
             "you're setting don't overlap.")
-    E104 = ("Can't find JSON schema for '{name}'.")
     E105 = ("The Doc.print_tree() method is now deprecated. Please use "
             "Doc.to_json() instead or write your own function.")
     E106 = ("Can't find doc._.{attr} attribute specified in the underscore "
@@ -411,8 +366,6 @@ class Errors(object):
             "practically no advantage over pickling the parent Doc directly. "
             "So instead of pickling the span, pickle the Doc it belongs to or "
             "use Span.as_doc to convert the span to a standalone Doc object.")
-    E113 = ("The newly split token can only have one root (head = 0).")
-    E114 = ("The newly split token needs to have a root (head = 0).")
     E115 = ("All subtokens must have associated heads.")
     E116 = ("Cannot currently add labels to pretrained text classifier. Add "
             "labels before training begins. This functionality was available "
@@ -435,12 +388,9 @@ class Errors(object):
             "equal to span length ({span_len}).")
     E122 = ("Cannot find token to be split. Did it get merged?")
     E123 = ("Cannot find head of token to be split. Did it get merged?")
-    E124 = ("Cannot read from file: {path}. Supported formats: {formats}")
     E125 = ("Unexpected value: {value}")
     E126 = ("Unexpected matcher predicate: '{bad}'. Expected one of: {good}. "
             "This is likely a bug in spaCy, so feel free to open an issue.")
-    E127 = ("Cannot create phrase pattern representation for length 0. This "
-            "is likely a bug in spaCy.")
     E128 = ("Unsupported serialization argument: '{arg}'. The use of keyword "
             "arguments to exclude fields from being serialized or deserialized "
             "is now deprecated. Please use the `exclude` argument instead. "
@@ -482,8 +432,6 @@ class Errors(object):
             "provided {found}.")
     E143 = ("Labels for component '{name}' not initialized. Did you forget to "
             "call add_label()?")
-    E144 = ("Could not find parameter `{param}` when building the entity "
-            "linker model.")
     E145 = ("Error reading `{param}` from input file.")
     E146 = ("Could not access `{path}`.")
     E147 = ("Unexpected error in the {method} functionality of the "
@@ -495,8 +443,6 @@ class Errors(object):
             "the component matches the model being loaded.")
     E150 = ("The language of the `nlp` object and the `vocab` should be the "
             "same, but found '{nlp}' and '{vocab}' respectively.")
-    E151 = ("Trying to call nlp.update without required annotation types. "
-            "Expected top-level keys: {exp}. Got: {unexp}.")
     E152 = ("The attribute {attr} is not supported for token patterns. "
             "Please use the option validate=True with Matcher, PhraseMatcher, "
             "or EntityRuler for more details.")
@@ -533,11 +479,6 @@ class Errors(object):
             "that case.")
     E166 = ("Can only merge DocBins with the same pre-defined attributes.\n"
             "Current DocBin: {current}\nOther DocBin: {other}")
-    E167 = ("Unknown morphological feature: '{feat}' ({feat_id}). This can "
-            "happen if the tagger was trained with a different set of "
-            "morphological features. If you're using a pretrained model, make "
-            "sure that your models are up to date:\npython -m spacy validate")
-    E168 = ("Unknown field: {field}")
     E169 = ("Can't find module: {module}")
     E170 = ("Cannot apply transition {name}: invalid for the current state.")
     E171 = ("Matcher.add received invalid on_match callback argument: expected "
@@ -548,8 +489,6 @@ class Errors(object):
     E173 = ("As of v2.2, the Lemmatizer is initialized with an instance of "
             "Lookups containing the lemmatization tables. See the docs for "
             "details: https://spacy.io/api/lemmatizer#init")
-    E174 = ("Architecture '{name}' not found in registry. Available "
-            "names: {names}")
     E175 = ("Can't remove rule for unknown match pattern ID: {key}")
     E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
     E177 = ("Ill-formed IOB input detected: {tag}")
@@ -597,10 +536,19 @@ class Errors(object):
     E198 = ("Unable to return {n} most similar vectors for the current vectors "
             "table, which contains {n_rows} vectors.")
     E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
-    E200 = ("Specifying a base model with a pretrained component '{component}' "
-            "can not be combined with adding a pretrained Tok2Vec layer.")
 
     # TODO: fix numbering after merging develop into master
+    E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
+            "array and {doc_length} for the Doc itself.")
+    E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.")
+    E973 = ("Unexpected type for NER data")
+    E974 = ("Unknown {obj} attribute: {key}")
 | 
				
			||||||
 | 
					    E975 = ("The method Example.from_dict expects a Doc as first argument, "
 | 
				
			||||||
 | 
					            "but got {type}")
 | 
				
			||||||
 | 
					    E976 = ("The method Example.from_dict expects a dict as second argument, "
 | 
				
			||||||
 | 
					            "but received None.")
 | 
				
			||||||
 | 
					    E977 = ("Can not compare a MorphAnalysis with a string object. "
 | 
				
			||||||
 | 
					            "This is likely a bug in spaCy, so feel free to open an issue.")
 | 
				
			||||||
    E978 = ("The {method} method of component {name} takes a list of Example objects, "
 | 
					    E978 = ("The {method} method of component {name} takes a list of Example objects, "
 | 
				
			||||||
            "but found {types} instead.")
 | 
					            "but found {types} instead.")
 | 
				
			||||||
    E979 = ("Cannot convert {type} to an Example object.")
 | 
					    E979 = ("Cannot convert {type} to an Example object.")
 | 
				
			||||||
| 
						 | 
					@ -648,13 +596,8 @@ class Errors(object):
 | 
				
			||||||
@add_codes
 | 
					@add_codes
 | 
				
			||||||
class TempErrors(object):
 | 
					class TempErrors(object):
 | 
				
			||||||
    T003 = ("Resizing pretrained Tagger models is not currently supported.")
 | 
					    T003 = ("Resizing pretrained Tagger models is not currently supported.")
 | 
				
			||||||
    T004 = ("Currently parser depth is hard-coded to 1. Received: {value}.")
 | 
					 | 
				
			||||||
    T007 = ("Can't yet set {attr} from Span. Vote for this feature on the "
 | 
					    T007 = ("Can't yet set {attr} from Span. Vote for this feature on the "
 | 
				
			||||||
            "issue tracker: http://github.com/explosion/spaCy/issues")
 | 
					            "issue tracker: http://github.com/explosion/spaCy/issues")
 | 
				
			||||||
    T008 = ("Bad configuration of Tagger. This is probably a bug within "
 | 
					 | 
				
			||||||
            "spaCy. We changed the name of an internal attribute for loading "
 | 
					 | 
				
			||||||
            "pretrained vectors, and the class has been passed the old name "
 | 
					 | 
				
			||||||
            "(pretrained_dims) but not the new name (pretrained_vectors).")
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# fmt: on
 | 
					# fmt: on
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
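Note: the hunks above delete error codes whose call sites are gone (E127, E144, E151, E167, E168, E174, E200, T004, T008) and add temporary E971-E977 for the new Example API, to be renumbered once develop merges into master. A minimal sketch of how these templated codes are raised; the call site below is invented for illustration, only `Errors` and the code names come from the diff:

    from spacy.errors import Errors

    def check_key(key, known_keys):
        if key not in known_keys:
            # E974 interpolates the object kind and the offending key
            raise ValueError(Errors.E974.format(obj="token", key=key))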
@@ -45,7 +45,7 @@ class Corpus:

     def make_examples(self, nlp, reference_docs, max_length=0):
         for reference in reference_docs:
-            if max_length >= 1 and len(reference) >= max_length:
+            if len(reference) >= max_length >= 1:
                 if reference.is_sentenced:
                     for ref_sent in reference.sents:
                         yield Example(
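Note: the rewritten condition relies on Python's chained comparison. `len(reference) >= max_length >= 1` evaluates as `len(reference) >= max_length and max_length >= 1`, so the behaviour is unchanged: docs are only split when max_length is at least 1. A standalone illustration, not from the diff:

    doc_len = 10
    max_length = 0
    assert not (doc_len >= max_length >= 1)  # max_length=0 disables splitting
    max_length = 5
    assert doc_len >= max_length >= 1        # a long doc gets split into sentences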
@@ -2,7 +2,6 @@ import warnings

 import numpy

-from ..tokens import Token
 from ..tokens.doc cimport Doc
 from ..tokens.span cimport Span
 from ..tokens.span import Span
@@ -11,9 +10,8 @@ from .align cimport Alignment
 from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
 from .iob_utils import spans_from_biluo_tags
 from .align import Alignment
-from ..errors import Errors, AlignmentError
+from ..errors import Errors, Warnings
 from ..syntax import nonproj
-from ..util import get_words_and_spaces


 cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
@@ -32,11 +30,10 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
 cdef class Example:
     def __init__(self, Doc predicted, Doc reference, *, Alignment alignment=None):
         """ Doc can either be text, or an actual Doc """
-        msg = "Example.__init__ got None for '{arg}'. Requires Doc."
         if predicted is None:
-            raise TypeError(msg.format(arg="predicted"))
+            raise TypeError(Errors.E972.format(arg="predicted"))
         if reference is None:
-            raise TypeError(msg.format(arg="reference"))
+            raise TypeError(Errors.E972.format(arg="reference"))
         self.x = predicted
         self.y = reference
         self._alignment = alignment
@@ -64,9 +61,9 @@ cdef class Example:
     @classmethod
     def from_dict(cls, Doc predicted, dict example_dict):
         if example_dict is None:
-            raise ValueError("Example.from_dict expected dict, received None")
+            raise ValueError(Errors.E976)
         if not isinstance(predicted, Doc):
-            raise TypeError(f"Argument 1 should be Doc. Got {type(predicted)}")
+            raise TypeError(Errors.E975.format(type=type(predicted)))
         example_dict = _fix_legacy_dict_data(example_dict)
         tok_dict, doc_dict = _parse_example_dict_data(example_dict)
         if "ORTH" not in tok_dict:
@@ -118,6 +115,7 @@ cdef class Example:
         aligned_deps = [None] * self.x.length
         heads = [token.head.i for token in self.y]
         deps = [token.dep_ for token in self.y]
+        if projectivize:
             heads, deps = nonproj.projectivize(heads, deps)
         for cand_i in range(self.x.length):
             gold_i = cand_to_gold[cand_i]
@@ -245,11 +243,11 @@ def _annot2array(vocab, tok_annot, doc_annot):
             elif key == "cats":
                 pass
             else:
-                raise ValueError(f"Unknown doc attribute: {key}")
+                raise ValueError(Errors.E974.format(obj="doc", key=key))

     for key, value in tok_annot.items():
         if key not in IDS:
-            raise ValueError(f"Unknown token attribute: {key}")
+            raise ValueError(Errors.E974.format(obj="token", key=key))
         elif key in ["ORTH", "SPACY"]:
             pass
         elif key == "HEAD":
@@ -289,7 +287,7 @@ def _add_entities_to_doc(doc, ner_data):
         doc.ents = ner_data
         doc.ents = [span for span in ner_data if span.label_]
     else:
-        raise ValueError("Unexpected type for NER data")
+        raise ValueError(Errors.E973)


 def _parse_example_dict_data(example_dict):
@@ -341,7 +339,7 @@ def _fix_legacy_dict_data(example_dict):
     if "HEAD" in token_dict and "SENT_START" in token_dict:
         # If heads are set, we don't also redundantly specify SENT_START.
         token_dict.pop("SENT_START")
-        warnings.warn("Ignoring annotations for sentence starts, as dependency heads are set")
+        warnings.warn(Warnings.W092)
     return {
         "token_annotation": token_dict,
         "doc_annotation": doc_dict
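Note: the Example refactor above routes all validation through the new E97x codes. A hedged sketch of the guarded entry point; the import path and vocab are assumptions for this development branch, while `Example.from_dict` and the raised codes come from the diff:

    from spacy.vocab import Vocab
    from spacy.tokens import Doc
    from spacy.gold import Example  # import path assumed on develop

    doc = Doc(Vocab(), words=["San", "Francisco"])
    eg = Example.from_dict(doc, {"words": ["San", "Francisco"]})
    # Example.from_dict(doc, None) now raises ValueError(Errors.E976)
    # Example.from_dict("text", {...}) now raises TypeError(Errors.E975)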
@@ -145,7 +145,7 @@ def json_to_annotations(doc):
         example["doc_annotation"] = dict(
             cats=cats,
             entities=ner_tags,
-            links=paragraph.get("links", [])   # TODO: fix/test
+            links=paragraph.get("links", [])
         )
         yield example
@@ -107,9 +107,9 @@ cdef class Morphology:
        Returns the hash of the new analysis.
        """
        cdef MorphAnalysisC* tag_ptr
+        if isinstance(features, str):
            if features == self.EMPTY_MORPH:
                features = ""
-        if isinstance(features, str):
            tag_ptr = <MorphAnalysisC*>self.tags.get(<hash_t>self.strings[features])
            if tag_ptr != NULL:
                return tag_ptr.key
@@ -70,7 +70,7 @@ class SimpleNER(Pipe):
     def update(self, examples, set_annotations=False, drop=0.0, sgd=None, losses=None):
         if not any(_has_ner(eg) for eg in examples):
             return 0
-        docs = [eg.doc for eg in examples]
+        docs = [eg.predicted for eg in examples]
         set_dropout_rate(self.model, drop)
         scores, bp_scores = self.model.begin_update(docs)
         loss, d_scores = self.get_loss(examples, scores)
@@ -89,7 +89,8 @@ class SimpleNER(Pipe):
         d_scores = []
         truths = []
         for eg in examples:
-            gold_tags = [(tag if tag != "-" else None) for tag in eg.gold.ner]
+            tags = eg.get_aligned("TAG", as_string=True)
+            gold_tags = [(tag if tag != "-" else None) for tag in tags]
             if not self.is_biluo:
                 gold_tags = biluo_to_iob(gold_tags)
             truths.append(gold_tags)
@@ -128,8 +129,8 @@ class SimpleNER(Pipe):
         pass


-def _has_ner(eg):
-    for ner_tag in eg.gold.ner:
+def _has_ner(example):
+    for ner_tag in example.get_aligned_ner():
         if ner_tag != "-" and ner_tag is not None:
             return True
     else:
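Note: SimpleNER now reads everything through the Example alignment API instead of the old `eg.gold` attribute: `eg.predicted` is the Doc the model actually sees, and `get_aligned(...)`/`get_aligned_ner()` project the reference annotations onto that tokenization. A small sketch of the accessor, mirroring the tests further down; the import path is an assumption:

    from spacy.vocab import Vocab
    from spacy.tokens import Doc
    from spacy.gold import Example  # import path assumed on develop

    words = ["San", "Francisco", "Valley"]
    doc = Doc(Vocab(), words=words, spaces=[True, True, False])
    entities = [(0, len("San Francisco Valley"), "LOC")]
    eg = Example.from_dict(doc, {"words": words, "entities": entities})
    print(eg.get_aligned_ner())  # ['B-LOC', 'I-LOC', 'L-LOC']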
@@ -220,8 +220,11 @@ class TrainingSchema(BaseModel):


 class ProjectConfigAsset(BaseModel):
+    # fmt: off
     dest: StrictStr = Field(..., title="Destination of downloaded asset")
     url: StrictStr = Field(..., title="URL of asset")
+    checksum: str = Field(None, title="MD5 hash of file", regex=r"([a-fA-F\d]{32})")
+    # fmt: on


 class ProjectConfigCommand(BaseModel):
@@ -229,11 +232,15 @@ class ProjectConfigCommand(BaseModel):
     name: StrictStr = Field(..., title="Name of command")
     help: Optional[StrictStr] = Field(None, title="Command description")
     script: List[StrictStr] = Field([], title="List of CLI commands to run, in order")
-    dvc_deps: List[StrictStr] = Field([], title="Data Version Control dependencies")
-    dvc_outputs: List[StrictStr] = Field([], title="Data Version Control outputs")
-    dvc_outputs_no_cache: List[StrictStr] = Field([], title="Data Version Control outputs (no cache)")
+    deps: List[StrictStr] = Field([], title="Data Version Control dependencies")
+    outputs: List[StrictStr] = Field([], title="Data Version Control outputs")
+    outputs_no_cache: List[StrictStr] = Field([], title="Data Version Control outputs (no cache)")
     # fmt: on
+
+    class Config:
+        title = "A single named command specified in a project config"
+        extra = "forbid"


 class ProjectConfigSchema(BaseModel):
     # fmt: off
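Note: the new inner `Config` with `extra = "forbid"` makes pydantic reject unknown keys, so a project file still using the old `dvc_`-prefixed field names now fails validation loudly instead of being silently ignored. A sketch, assuming the schema is importable as `spacy.schemas.ProjectConfigCommand`:

    from pydantic import ValidationError
    from spacy.schemas import ProjectConfigCommand

    cmd = ProjectConfigCommand(name="train", script=["python train.py"], deps=["corpus.json"])
    try:
        ProjectConfigCommand(name="train", dvc_deps=["corpus.json"])  # old field name
    except ValidationError:
        pass  # extra fields are forbidden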
@@ -230,15 +230,14 @@ def test_json2docs_no_ner(en_vocab):
         Doc(
             doc.vocab,
             words=[w.text for w in doc],
-            spaces=[bool(w.whitespace_) for w in doc]
+            spaces=[bool(w.whitespace_) for w in doc],
         ),
-        doc
+        doc,
     )
     ner_tags = eg.get_aligned_ner()
     assert ner_tags == [None, None, None, None, None]


-
 def test_split_sentences(en_vocab):
     words = ["I", "flew", "to", "San Francisco Valley", "had", "loads of fun"]
     doc = Doc(en_vocab, words=words)
@@ -283,8 +282,8 @@ def test_split_sentences(en_vocab):
     assert split_examples[1].text == "had loads of fun "


-def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
-    # one-to-many
+@pytest.mark.xfail(reason="Alignment should be fixed after example refactor")
+def test_gold_biluo_one_to_many(en_vocab, en_tokenizer):
     words = ["I", "flew to", "San Francisco Valley", "."]
     spaces = [True, True, False, False]
     doc = Doc(en_vocab, words=words, spaces=spaces)
@@ -292,9 +291,28 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
     gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
     ner_tags = example.get_aligned_ner()
+    assert ner_tags == ["O", "O", "U-LOC", "O"]
+
+    entities = [
+        (len("I "), len("I flew to"), "ORG"),
+        (len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),
+    ]
+    gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
+    example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
+    ner_tags = example.get_aligned_ner()
+    assert ner_tags == ["O", "U-ORG", "U-LOC", "O"]
+
+    entities = [
+        (len("I "), len("I flew"), "ORG"),
+        (len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),
+    ]
+    gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
+    example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
+    ner_tags = example.get_aligned_ner()
     assert ner_tags == ["O", None, "U-LOC", "O"]

-    # many-to-one
+
+def test_gold_biluo_many_to_one(en_vocab, en_tokenizer):
     words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
     spaces = [True, True, True, True, True, False, False]
     doc = Doc(en_vocab, words=words, spaces=spaces)
@@ -304,31 +322,38 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
     ner_tags = example.get_aligned_ner()
     assert ner_tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]

-    # misaligned
+    entities = [
+        (len("I "), len("I flew to"), "ORG"),
+        (len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),
+    ]
+    gold_words = ["I", "flew to", "San Francisco Valley", "."]
+    example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
+    ner_tags = example.get_aligned_ner()
+    assert ner_tags == ["O", "B-ORG", "L-ORG", "B-LOC", "I-LOC", "L-LOC", "O"]
+
+
+@pytest.mark.xfail(reason="Alignment should be fixed after example refactor")
+def test_gold_biluo_misaligned(en_vocab, en_tokenizer):
     words = ["I flew", "to", "San Francisco", "Valley", "."]
     spaces = [True, True, True, False, False]
     doc = Doc(en_vocab, words=words, spaces=spaces)
-    offset_start = len("I flew to ")
-    offset_end = len("I flew to San Francisco Valley")
-    entities = [(offset_start, offset_end, "LOC")]
-    links = {(offset_start, offset_end): {"Q816843": 1.0}}
+    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
     gold_words = ["I", "flew to", "San", "Francisco Valley", "."]
-    example = Example.from_dict(
-        doc, {"words": gold_words, "entities": entities, "links": links}
-    )
+    example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
     ner_tags = example.get_aligned_ner()
-    assert ner_tags == [None, "O", "B-LOC", "L-LOC", "O"]
-    #assert example.get_aligned("ENT_KB_ID", as_string=True) == [
-    #    "",
-    #    "",
-    #    "Q816843",
-    #    "Q816843",
-    #    "",
-    #]
-    #assert example.to_dict()["doc_annotation"]["links"][(offset_start, offset_end)] == {
-    #    "Q816843": 1.0
-    #}
+    assert ner_tags == ["O", "O", "B-LOC", "L-LOC", "O"]
+
+    entities = [
+        (len("I "), len("I flew to"), "ORG"),
+        (len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),
+    ]
+    gold_words = ["I", "flew to", "San", "Francisco Valley", "."]
+    example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
+    ner_tags = example.get_aligned_ner()
+    assert ner_tags == [None, None, "B-LOC", "L-LOC", "O"]
+
+
+def test_gold_biluo_additional_whitespace(en_vocab, en_tokenizer):
     # additional whitespace tokens in GoldParse words
     words, spaces = get_words_and_spaces(
         ["I", "flew", "to", "San Francisco", "Valley", "."],
@@ -344,7 +369,8 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
     ner_tags = example.get_aligned_ner()
     assert ner_tags == ["O", "O", "O", "O", "B-LOC", "L-LOC", "O"]

-    # from issue #4791
+
+def test_gold_biluo_4791(en_vocab, en_tokenizer):
     doc = en_tokenizer("I'll return the ₹54 amount")
     gold_words = ["I", "'ll", "return", "the", "₹", "54", "amount"]
     gold_spaces = [False, True, True, True, False, True, False]
@@ -593,7 +619,6 @@ def test_tuple_format_implicit_invalid():
         _train(train_data)


-
 def _train(train_data):
     nlp = English()
     ner = nlp.create_pipe("ner")
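Note: throughout these tests, entities are given as character-offset triples (start, end, label), computed with `len()` over prefixes of the raw text so the offsets stay readable, and `get_aligned_ner()` re-expresses them as BILUO tags (Begin, In, Last, Unit, Out) over the doc's tokens, yielding `None` where the two tokenizations cannot be aligned. The offset arithmetic, standalone:

    text = "I flew to San Francisco Valley."
    start = len("I flew to ")
    end = len("I flew to San Francisco Valley")
    assert text[start:end] == "San Francisco Valley"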
@@ -1,15 +1,14 @@
 import numpy
 import tempfile
-import shutil
 import contextlib
 import srsly
-from pathlib import Path

 from spacy import Errors
 from spacy.tokens import Doc, Span
 from spacy.attrs import POS, TAG, HEAD, DEP, LEMMA, MORPH

 from spacy.vocab import Vocab
+from spacy.util import make_tempdir  # noqa: F401


 @contextlib.contextmanager
@@ -19,13 +18,6 @@ def make_tempfile(mode="r"):
     f.close()


-@contextlib.contextmanager
-def make_tempdir():
-    d = Path(tempfile.mkdtemp())
-    yield d
-    shutil.rmtree(str(d))
-
-
 def get_doc(
     vocab,
     words=[],
@@ -9,7 +9,7 @@ from ..attrs import SPACY, ORTH, intify_attr
 from ..errors import Errors


-ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "LEMMA", "MORPH")
+ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH")


 class DocBin(object):
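Note: with ENT_KB_ID added to ALL_ATTRS, entity knowledge-base IDs are now part of DocBin (de)serialization. A hedged round-trip sketch, assuming the default DocBin attrs are ALL_ATTRS as in the hunk above; the Span kb_id usage is an illustration, not part of the diff:

    from spacy.vocab import Vocab
    from spacy.tokens import Doc, DocBin, Span

    vocab = Vocab()
    doc = Doc(vocab, words=["San", "Francisco"])
    doc.ents = [Span(doc, 0, 2, label="GPE", kb_id="Q62")]
    doc_bin = DocBin()
    doc_bin.add(doc)
    restored = list(doc_bin.get_docs(vocab))[0]
    assert restored.ents[0].kb_id_ == "Q62"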
@@ -816,7 +816,7 @@ cdef class Doc:
         cdef TokenC* tokens = self.c
         cdef int length = len(array)
         if length != len(self):
-            raise ValueError("Cannot set array values longer than the document.")
+            raise ValueError(Errors.E971.format(array_length=length, doc_length=len(self)))

         # Get set up for fast loading
         cdef Pool mem = Pool()
@@ -1,6 +1,7 @@
 from libc.string cimport memset
 cimport numpy as np

+from ..errors import Errors
 from ..vocab cimport Vocab
 from ..typedefs cimport hash_t, attr_t
 from ..morphology cimport list_features, check_feature, get_by_field
@@ -49,6 +50,8 @@ cdef class MorphAnalysis:
         return self.key

     def __eq__(self, other):
+        if isinstance(other, str):
+            raise ValueError(Errors.E977)
         return self.key == other.key

     def __ne__(self, other):
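Note: MorphAnalysis.__eq__ now raises E977 when compared with a string instead of silently returning a misleading result; convert explicitly with str() when checking against a feature string. A hedged sketch, where `Token.morph` as the usual way to obtain a MorphAnalysis is an assumption on this branch:

    from spacy.lang.en import English

    nlp = English()
    doc = nlp("tests")
    morph = doc[0].morph            # a MorphAnalysis
    try:
        morph == "Number=Sing"      # now raises ValueError(Errors.E977)
    except ValueError:
        pass
    str(morph) == "Number=Sing"     # explicit conversion keeps working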
@@ -19,6 +19,9 @@ from packaging.specifiers import SpecifierSet, InvalidSpecifier
 from packaging.version import Version, InvalidVersion
 import subprocess
 from contextlib import contextmanager
+import tempfile
+import shutil
+import hashlib


 try:
@@ -455,6 +458,37 @@ def working_dir(path: Union[str, Path]) -> None:
         os.chdir(prev_cwd)


+@contextmanager
+def make_tempdir():
+    """Execute a block in a temporary directory and remove the directory and
+    its contents at the end of the with block.
+
+    YIELDS (Path): The path of the temp directory.
+    """
+    d = Path(tempfile.mkdtemp())
+    yield d
+    shutil.rmtree(str(d))
+
+
+def get_hash(data) -> str:
+    """Get the hash for a JSON-serializable object.
+
+    data: The data to hash.
+    RETURNS (str): The hash.
+    """
+    data_str = srsly.json_dumps(data, sort_keys=True).encode("utf8")
+    return hashlib.md5(data_str).hexdigest()
+
+
+def get_checksum(path: Union[Path, str]) -> str:
+    """Get the checksum for a file given its file path.
+
+    path (Union[Path, str]): The file path.
+    RETURNS (str): The checksum.
+    """
+    return hashlib.md5(Path(path).read_bytes()).hexdigest()
+
+
 def is_in_jupyter():
     """Check if user is running spaCy from a Jupyter notebook by detecting the
     IPython kernel. Mainly used for the displaCy visualizer.
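Note: a quick usage sketch of the three helpers added to spacy.util above (make_tempdir, get_hash, get_checksum); the file name and payload are invented:

    import srsly
    from spacy.util import make_tempdir, get_hash, get_checksum

    print(get_hash({"lang": "en"}))        # MD5 over the sorted JSON dump
    with make_tempdir() as tmp_dir:        # Path, removed again on exit
        path = tmp_dir / "meta.json"
        srsly.write_json(path, {"lang": "en"})
        print(get_checksum(path))          # MD5 over the raw file bytes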