mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 13:11:03 +03:00 
			
		
		
		
	Merge pull request #5617 from explosion/chore/tidy-auto-format
This commit is contained in:
		
						commit
						dbe9c29f61
					
				|  | @ -24,8 +24,8 @@ from ..gold import Example | |||
|     output_dir=("Directory to write models to on each epoch", "positional", None, Path), | ||||
|     config_path=("Path to config file", "positional", None, Path), | ||||
|     use_gpu=("Use GPU", "option", "g", int), | ||||
|     resume_path=("Path to pretrained weights from which to resume pretraining", "option","r", Path), | ||||
|     epoch_resume=("The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files.","option", "er", int), | ||||
|     resume_path=("Path to pretrained weights from which to resume pretraining", "option", "r", Path), | ||||
|     epoch_resume=("The epoch to resume counting from when using '--resume_path'. Prevents unintended overwriting of existing weight files.", "option", "er", int), | ||||
|     # fmt: on | ||||
| ) | ||||
| def pretrain( | ||||
|  |  | |||
|  | @ -3,7 +3,6 @@ from timeit import default_timer as timer | |||
| 
 | ||||
| import srsly | ||||
| from pydantic import BaseModel, FilePath | ||||
| import plac | ||||
| import tqdm | ||||
| from pathlib import Path | ||||
| from wasabi import msg | ||||
|  | @ -16,7 +15,9 @@ from ..gold import GoldCorpus | |||
| from ..lookups import Lookups | ||||
| from .. import util | ||||
| from ..errors import Errors | ||||
| from ..ml import models  # don't remove - required to load the built-in architectures | ||||
| 
 | ||||
| # Don't remove - required to load the built-in architectures | ||||
| from ..ml import models  # noqa: F401 | ||||
| 
 | ||||
| registry = util.registry | ||||
| 
 | ||||
|  | @ -114,33 +115,19 @@ class ConfigSchema(BaseModel): | |||
|         extra = "allow" | ||||
| 
 | ||||
| 
 | ||||
| @plac.annotations( | ||||
|     # fmt: off | ||||
|     train_path=("Location of JSON-formatted training data", "positional", None, Path), | ||||
|     dev_path=("Location of JSON-formatted development data", "positional", None, Path), | ||||
|     config_path=("Path to config file", "positional", None, Path), | ||||
|     output_path=("Output directory to store model in", "option", "o", Path), | ||||
|     init_tok2vec=( | ||||
|     "Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental.", "option", "t2v", | ||||
|     Path), | ||||
|     raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path), | ||||
|     verbose=("Display more information for debugging purposes", "flag", "VV", bool), | ||||
|     use_gpu=("Use GPU", "option", "g", int), | ||||
|     tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path), | ||||
|     omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool), | ||||
|     # fmt: on | ||||
| ) | ||||
| def train_cli( | ||||
|     train_path, | ||||
|     dev_path, | ||||
|     config_path, | ||||
|     output_path=None, | ||||
|     init_tok2vec=None, | ||||
|     raw_text=None, | ||||
|     verbose=False, | ||||
|     use_gpu=-1, | ||||
|     tag_map_path=None, | ||||
|     omit_extra_lookups=False, | ||||
|     # fmt: off | ||||
|     train_path: ("Location of JSON-formatted training data", "positional", None, Path), | ||||
|     dev_path: ("Location of JSON-formatted development data", "positional", None, Path), | ||||
|     config_path: ("Path to config file", "positional", None, Path), | ||||
|     output_path: ("Output directory to store model in", "option", "o", Path) = None, | ||||
|     init_tok2vec: ("Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental.", "option", "t2v", Path) = None, | ||||
|     raw_text: ("Path to jsonl file with unlabelled text documents.", "option", "rt", Path) = None, | ||||
|     verbose: ("Display more information for debugging purposes", "flag", "VV", bool) = False, | ||||
|     use_gpu: ("Use GPU", "option", "g", int) = -1, | ||||
|     tag_map_path: ("Location of JSON-formatted tag map", "option", "tm", Path) = None, | ||||
|     omit_extra_lookups: ("Don't include extra lookups in model", "flag", "OEL", bool) = False, | ||||
|     # fmt: on | ||||
| ): | ||||
|     """ | ||||
|     Train or update a spaCy model. Requires data to be formatted in spaCy's | ||||
|  | @ -212,7 +199,7 @@ def train( | |||
|     config = util.load_config(config_path, create_objects=False) | ||||
|     util.fix_random_seed(config["training"]["seed"]) | ||||
|     if config["training"].get("use_pytorch_for_gpu_memory"): | ||||
|         # It feels kind of weird to not have a default for this.  | ||||
|         # It feels kind of weird to not have a default for this. | ||||
|         use_pytorch_for_gpu_memory() | ||||
|     nlp_config = config["nlp"] | ||||
|     config = util.load_config(config_path, create_objects=True) | ||||
|  | @ -227,7 +214,9 @@ def train( | |||
|     # verify textcat config | ||||
|     if "textcat" in nlp_config["pipeline"]: | ||||
|         textcat_labels = set(nlp.get_pipe("textcat").labels) | ||||
|         textcat_multilabel = not nlp_config["pipeline"]["textcat"]["model"]["exclusive_classes"] | ||||
|         textcat_multilabel = not nlp_config["pipeline"]["textcat"]["model"][ | ||||
|             "exclusive_classes" | ||||
|         ] | ||||
| 
 | ||||
|         # check whether the setting 'exclusive_classes' corresponds to the provided training data | ||||
|         if textcat_multilabel: | ||||
|  | @ -255,7 +244,9 @@ def train( | |||
|                         "to 'false' in the config to train a classifier with classes " | ||||
|                         "that are not mutually exclusive." | ||||
|                     ) | ||||
|         msg.info(f"Initialized textcat component for {len(textcat_labels)} unique labels") | ||||
|         msg.info( | ||||
|             f"Initialized textcat component for {len(textcat_labels)} unique labels" | ||||
|         ) | ||||
|         nlp.get_pipe("textcat").labels = tuple(textcat_labels) | ||||
| 
 | ||||
|         # if 'positive_label' is provided: double check whether it's in the data and the task is binary | ||||
|  | @ -281,9 +272,7 @@ def train( | |||
|         nlp.resume_training() | ||||
|     else: | ||||
|         msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}") | ||||
|         nlp.begin_training( | ||||
|             lambda: corpus.train_examples | ||||
|         ) | ||||
|         nlp.begin_training(lambda: corpus.train_examples) | ||||
| 
 | ||||
|     # Update tag map with provided mapping | ||||
|     nlp.vocab.morphology.tag_map.update(tag_map) | ||||
|  | @ -310,8 +299,7 @@ def train( | |||
|             tok2vec = tok2vec.get(subpath) | ||||
|         if not tok2vec: | ||||
|             msg.fail( | ||||
|                 f"Could not locate the tok2vec model at {tok2vec_path}.", | ||||
|                 exits=1, | ||||
|                 f"Could not locate the tok2vec model at {tok2vec_path}.", exits=1, | ||||
|             ) | ||||
|         tok2vec.from_bytes(weights_data) | ||||
| 
 | ||||
|  | @ -376,7 +364,7 @@ def create_train_batches(nlp, corpus, cfg): | |||
|         train_examples = list( | ||||
|             corpus.train_dataset( | ||||
|                 nlp, | ||||
|                 noise_level=0.0, # I think this is deprecated? | ||||
|                 noise_level=0.0,  # I think this is deprecated? | ||||
|                 orth_variant_level=cfg["orth_variant_level"], | ||||
|                 gold_preproc=cfg["gold_preproc"], | ||||
|                 max_length=cfg["max_length"], | ||||
|  | @ -429,7 +417,11 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg): | |||
|         try: | ||||
|             weighted_score = sum(scores[s] * weights.get(s, 0.0) for s in weights) | ||||
|         except KeyError as e: | ||||
|             raise KeyError(Errors.E983.format(dict_name='score_weights', key=str(e), keys=list(scores.keys()))) | ||||
|             raise KeyError( | ||||
|                 Errors.E983.format( | ||||
|                     dict_name="score_weights", key=str(e), keys=list(scores.keys()) | ||||
|                 ) | ||||
|             ) | ||||
| 
 | ||||
|         scores["speed"] = wps | ||||
|         return weighted_score, scores | ||||
|  | @ -578,15 +570,25 @@ def setup_printer(training, nlp): | |||
|             ] | ||||
|         except KeyError as e: | ||||
|             raise KeyError( | ||||
|                 Errors.E983.format(dict_name='scores (losses)', key=str(e), keys=list(info["losses"].keys()))) | ||||
|                 Errors.E983.format( | ||||
|                     dict_name="scores (losses)", | ||||
|                     key=str(e), | ||||
|                     keys=list(info["losses"].keys()), | ||||
|                 ) | ||||
|             ) | ||||
| 
 | ||||
|         try: | ||||
|             scores = [ | ||||
|                 "{0:.2f}".format(float(info["other_scores"][col])) | ||||
|                 for col in score_cols | ||||
|                 "{0:.2f}".format(float(info["other_scores"][col])) for col in score_cols | ||||
|             ] | ||||
|         except KeyError as e: | ||||
|             raise KeyError(Errors.E983.format(dict_name='scores (other)', key=str(e), keys=list(info["other_scores"].keys()))) | ||||
|             raise KeyError( | ||||
|                 Errors.E983.format( | ||||
|                     dict_name="scores (other)", | ||||
|                     key=str(e), | ||||
|                     keys=list(info["other_scores"].keys()), | ||||
|                 ) | ||||
|             ) | ||||
|         data = ( | ||||
|             [info["step"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))] | ||||
|         ) | ||||
|  |  | |||
|  | @ -1,4 +1,3 @@ | |||
| from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN | ||||
| from .errors import Errors | ||||
| from .lookups import Lookups | ||||
| from .parts_of_speech import NAMES as UPOS_NAMES | ||||
|  | @ -51,7 +50,13 @@ class Lemmatizer(object): | |||
|         index_table = self.lookups.get_table("lemma_index", {}) | ||||
|         exc_table = self.lookups.get_table("lemma_exc", {}) | ||||
|         rules_table = self.lookups.get_table("lemma_rules", {}) | ||||
|         if not any((index_table.get(univ_pos), exc_table.get(univ_pos), rules_table.get(univ_pos))): | ||||
|         if not any( | ||||
|             ( | ||||
|                 index_table.get(univ_pos), | ||||
|                 exc_table.get(univ_pos), | ||||
|                 rules_table.get(univ_pos), | ||||
|             ) | ||||
|         ): | ||||
|             if univ_pos == "propn": | ||||
|                 return [string] | ||||
|             else: | ||||
|  |  | |||
|  | @ -1 +1 @@ | |||
| from .models import * | ||||
| from .models import *  # noqa: F401, F403 | ||||
|  |  | |||
|  | @ -1,11 +1,8 @@ | |||
| """Thinc layer to do simpler transition-based parsing, NER, etc.""" | ||||
| from typing import List, Tuple, Dict, Optional | ||||
| from typing import Dict, Optional | ||||
| import numpy | ||||
| from thinc.api import Ops, Model, with_array, softmax_activation, padded2list | ||||
| from thinc.api import to_numpy | ||||
| from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d | ||||
| 
 | ||||
| from ..tokens import Doc | ||||
| from thinc.api import Model | ||||
| from thinc.types import Padded, Floats3d | ||||
| 
 | ||||
| 
 | ||||
| def BILUO() -> Model[Padded, Padded]: | ||||
|  | @ -14,11 +11,11 @@ def BILUO() -> Model[Padded, Padded]: | |||
|         forward, | ||||
|         init=init, | ||||
|         dims={"nO": None}, | ||||
|         attrs={"get_num_actions": get_num_actions} | ||||
|         attrs={"get_num_actions": get_num_actions}, | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| def init(model, X: Optional[Padded]=None, Y: Optional[Padded]=None): | ||||
| def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None): | ||||
|     if X is not None and Y is not None: | ||||
|         if X.data.shape != Y.data.shape: | ||||
|             # TODO: Fix error | ||||
|  | @ -49,12 +46,12 @@ def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool): | |||
|     masks = model.ops.alloc3f(*Y.shape) | ||||
|     max_value = Xp.data.max() | ||||
|     for t in range(Xp.data.shape[0]): | ||||
|         is_last = (Xp.lengths < (t+2)).astype("i") | ||||
|         is_last = (Xp.lengths < (t + 2)).astype("i") | ||||
|         masks[t] = valid_transitions[is_last, prev_actions] | ||||
|         # Don't train the out-of-bounds sequences. | ||||
|         masks[t, Xp.size_at_t[t]:] = 0 | ||||
|         masks[t, Xp.size_at_t[t] :] = 0 | ||||
|         # Valid actions get 0*10e8, invalid get large negative value | ||||
|         Y[t] = Xp.data[t] + ((masks[t]-1) * max_value * 10) | ||||
|         Y[t] = Xp.data[t] + ((masks[t] - 1) * max_value * 10) | ||||
|         prev_actions = Y[t].argmax(axis=-1) | ||||
| 
 | ||||
|     def backprop_biluo(dY: Padded) -> Padded: | ||||
|  |  | |||
|  | @ -1,9 +1,7 @@ | |||
| """Thinc layer to do simpler transition-based parsing, NER, etc.""" | ||||
| from typing import List, Tuple, Dict, Optional | ||||
| from thinc.api import Ops, Model, with_array, softmax_activation, padded2list | ||||
| from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d | ||||
| 
 | ||||
| from ..tokens import Doc | ||||
| from typing import Dict, Optional | ||||
| from thinc.api import Ops, Model | ||||
| from thinc.types import Padded, Floats3d | ||||
| 
 | ||||
| 
 | ||||
| def IOB() -> Model[Padded, Padded]: | ||||
|  | @ -12,11 +10,11 @@ def IOB() -> Model[Padded, Padded]: | |||
|         forward, | ||||
|         init=init, | ||||
|         dims={"nO": None}, | ||||
|         attrs={"get_num_actions": get_num_actions} | ||||
|         attrs={"get_num_actions": get_num_actions}, | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| def init(model, X: Optional[Padded]=None, Y: Optional[Padded]=None): | ||||
| def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None): | ||||
|     if X is not None and Y is not None: | ||||
|         if X.data.shape != Y.data.shape: | ||||
|             # TODO: Fix error | ||||
|  | @ -48,14 +46,14 @@ def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool): | |||
|     for t in range(Xp.data.shape[0]): | ||||
|         masks[t] = valid_transitions[prev_actions] | ||||
|         # Don't train the out-of-bounds sequences. | ||||
|         masks[t, Xp.size_at_t[t]:] = 0 | ||||
|         masks[t, Xp.size_at_t[t] :] = 0 | ||||
|         # Valid actions get 0*10e8, invalid get -1*10e8 | ||||
|         Y[t] = Xp.data[t] + ((masks[t]-1) * 10e8) | ||||
|         Y[t] = Xp.data[t] + ((masks[t] - 1) * 10e8) | ||||
|         prev_actions = Y[t].argmax(axis=-1) | ||||
| 
 | ||||
|     def backprop_biluo(dY: Padded) -> Padded: | ||||
|         # Masking the gradient seems to do poorly here. But why? | ||||
|         #dY.data *= masks | ||||
|         # dY.data *= masks | ||||
|         return dY | ||||
| 
 | ||||
|     return Padded(Y, Xp.size_at_t, Xp.lengths, Xp.indices), backprop_biluo | ||||
|  | @ -83,10 +81,10 @@ def _get_transition_table( | |||
|     B_range = ops.xp.arange(B_start, B_end) | ||||
|     I_range = ops.xp.arange(I_start, I_end) | ||||
|     # B and O are always valid | ||||
|     table[:, B_start : B_end] = 1 | ||||
|     table[:, B_start:B_end] = 1 | ||||
|     table[:, O_action] = 1 | ||||
|     # I can only follow a matching B | ||||
|     table[B_range, I_range] = 1 | ||||
|   | ||||
| 
 | ||||
|     _cache[n_actions] = table | ||||
|     return table | ||||
|  |  | |||
|  | @ -84,7 +84,7 @@ def _backprop_precomputable_affine_padding(model, dY, ids): | |||
|     # | ||||
|     # (ids < 0).T @ dY | ||||
|     mask = model.ops.asarray(ids < 0, dtype="f") | ||||
|     d_pad = model.ops.gemm(mask, dY.reshape(nB, nO*nP), trans1=True) | ||||
|     d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True) | ||||
|     return d_pad.reshape((1, nF, nO, nP)) | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,6 +1,6 @@ | |||
| from .entity_linker import *  # noqa | ||||
| from .parser import *  # noqa | ||||
| from .simple_ner import * | ||||
| from .simple_ner import *  # noqa | ||||
| from .tagger import *  # noqa | ||||
| from .textcat import *  # noqa | ||||
| from .tok2vec import *  # noqa | ||||
|  |  | |||
|  | @ -7,7 +7,12 @@ def build_multi_task_model(tok2vec, maxout_pieces, token_vector_width, nO=None): | |||
|     softmax = Softmax(nO=nO, nI=token_vector_width * 2) | ||||
|     model = chain( | ||||
|         tok2vec, | ||||
|         Maxout(nO=token_vector_width * 2, nI=token_vector_width, nP=maxout_pieces, dropout=0.0), | ||||
|         Maxout( | ||||
|             nO=token_vector_width * 2, | ||||
|             nI=token_vector_width, | ||||
|             nP=maxout_pieces, | ||||
|             dropout=0.0, | ||||
|         ), | ||||
|         LayerNorm(token_vector_width * 2), | ||||
|         softmax, | ||||
|     ) | ||||
|  | @ -20,7 +25,11 @@ def build_cloze_multi_task_model(vocab, tok2vec, maxout_pieces, nO=None): | |||
|     # nO = vocab.vectors.data.shape[1] | ||||
|     output_layer = chain( | ||||
|         Maxout( | ||||
|             nO=nO, nI=tok2vec.get_dim("nO"), nP=maxout_pieces, normalize=True, dropout=0.0 | ||||
|             nO=nO, | ||||
|             nI=tok2vec.get_dim("nO"), | ||||
|             nP=maxout_pieces, | ||||
|             normalize=True, | ||||
|             dropout=0.0, | ||||
|         ), | ||||
|         Linear(nO=nO, nI=nO, init_W=zero_init), | ||||
|     ) | ||||
|  | @ -39,7 +48,9 @@ def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15): | |||
|     def mlm_forward(model, docs, is_train): | ||||
|         mask, docs = _apply_mask(docs, random_words, mask_prob=mask_prob) | ||||
|         mask = model.ops.asarray(mask).reshape((mask.shape[0], 1)) | ||||
|         output, backprop = model.get_ref("wrapped-model").begin_update(docs)  # drop=drop | ||||
|         output, backprop = model.get_ref("wrapped-model").begin_update( | ||||
|             docs | ||||
|         )  # drop=drop | ||||
| 
 | ||||
|         def mlm_backward(d_output): | ||||
|             d_output *= 1 - mask | ||||
|  |  | |||
|  | @ -16,18 +16,14 @@ def build_tb_parser_model( | |||
|     nO=None, | ||||
| ): | ||||
|     t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None | ||||
|     tok2vec = chain( | ||||
|         tok2vec, | ||||
|         with_array(Linear(hidden_width, t2v_width)), | ||||
|         list2array(), | ||||
|     ) | ||||
|     tok2vec = chain(tok2vec, with_array(Linear(hidden_width, t2v_width)), list2array(),) | ||||
|     tok2vec.set_dim("nO", hidden_width) | ||||
| 
 | ||||
|     lower = PrecomputableAffine( | ||||
|         nO=hidden_width if use_upper else nO, | ||||
|         nF=nr_feature_tokens, | ||||
|         nI=tok2vec.get_dim("nO"), | ||||
|         nP=maxout_pieces | ||||
|         nP=maxout_pieces, | ||||
|     ) | ||||
|     if use_upper: | ||||
|         with use_ops("numpy"): | ||||
|  |  | |||
|  | @ -1,9 +1,8 @@ | |||
| import functools | ||||
| from typing import List, Tuple, Dict, Optional | ||||
| from thinc.api import Ops, Model, Linear, Softmax, with_array, softmax_activation, padded2list | ||||
| from typing import List | ||||
| from thinc.api import Model, Linear, with_array, softmax_activation, padded2list | ||||
| from thinc.api import chain, list2padded, configure_normal_init | ||||
| from thinc.api import Dropout | ||||
| from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d | ||||
| from thinc.types import Floats2d | ||||
| 
 | ||||
| from ...tokens import Doc | ||||
| from .._biluo import BILUO | ||||
|  | @ -12,12 +11,12 @@ from ...util import registry | |||
| 
 | ||||
| 
 | ||||
| @registry.architectures.register("spacy.BiluoTagger.v1") | ||||
| def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], List[Floats2d]]: | ||||
| def BiluoTagger( | ||||
|     tok2vec: Model[List[Doc], List[Floats2d]] | ||||
| ) -> Model[List[Doc], List[Floats2d]]: | ||||
|     biluo = BILUO() | ||||
|     linear = Linear( | ||||
|         nO=None, | ||||
|         nI=tok2vec.get_dim("nO"), | ||||
|         init_W=configure_normal_init(mean=0.02) | ||||
|         nO=None, nI=tok2vec.get_dim("nO"), init_W=configure_normal_init(mean=0.02) | ||||
|     ) | ||||
|     model = chain( | ||||
|         tok2vec, | ||||
|  | @ -25,7 +24,7 @@ def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], L | |||
|         with_array(chain(Dropout(0.1), linear)), | ||||
|         biluo, | ||||
|         with_array(softmax_activation()), | ||||
|         padded2list() | ||||
|         padded2list(), | ||||
|     ) | ||||
| 
 | ||||
|     return Model( | ||||
|  | @ -35,11 +34,14 @@ def BiluoTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], L | |||
|         layers=[model, linear], | ||||
|         refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo}, | ||||
|         dims={"nO": None}, | ||||
|         attrs={"get_num_actions": biluo.attrs["get_num_actions"]} | ||||
|         attrs={"get_num_actions": biluo.attrs["get_num_actions"]}, | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| @registry.architectures.register("spacy.IOBTagger.v1") | ||||
| def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], List[Floats2d]]: | ||||
| def IOBTagger( | ||||
|     tok2vec: Model[List[Doc], List[Floats2d]] | ||||
| ) -> Model[List[Doc], List[Floats2d]]: | ||||
|     biluo = IOB() | ||||
|     linear = Linear(nO=None, nI=tok2vec.get_dim("nO")) | ||||
|     model = chain( | ||||
|  | @ -48,7 +50,7 @@ def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], Lis | |||
|         with_array(linear), | ||||
|         biluo, | ||||
|         with_array(softmax_activation()), | ||||
|         padded2list() | ||||
|         padded2list(), | ||||
|     ) | ||||
| 
 | ||||
|     return Model( | ||||
|  | @ -58,11 +60,10 @@ def IOBTagger(tok2vec: Model[List[Doc], List[Floats2d]]) -> Model[List[Doc], Lis | |||
|         layers=[model], | ||||
|         refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo}, | ||||
|         dims={"nO": None}, | ||||
|         attrs={"get_num_actions": biluo.attrs["get_num_actions"]} | ||||
|         attrs={"get_num_actions": biluo.attrs["get_num_actions"]}, | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| def init(model: Model[List[Doc], List[Floats2d]], X=None, Y=None) -> None: | ||||
|     if model.get_dim("nO") is None and Y: | ||||
|         model.set_dim("nO", Y[0].shape[1]) | ||||
|  |  | |||
|  | @ -1,5 +1,4 @@ | |||
| from thinc.api import zero_init, with_array, Softmax, chain, Model, Dropout | ||||
| from thinc.api import glorot_uniform_init | ||||
| from thinc.api import zero_init, with_array, Softmax, chain, Model | ||||
| 
 | ||||
| from ...util import registry | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,11 +1,12 @@ | |||
| from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic, ParametricAttention | ||||
| from thinc.api import chain, concatenate, clone, Dropout | ||||
| from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum, Relu, residual, expand_window | ||||
| from thinc.api import HashEmbed, with_ragged, with_array, with_cpu, uniqued, FeatureExtractor | ||||
| from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic | ||||
| from thinc.api import ParametricAttention, chain, concatenate, clone, Dropout | ||||
| from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout | ||||
| from thinc.api import reduce_sum, Relu, residual, expand_window, HashEmbed | ||||
| from thinc.api import with_ragged, with_array, with_cpu, uniqued, FeatureExtractor | ||||
| 
 | ||||
| from ..spacy_vectors import SpacyVectors | ||||
| from ... import util | ||||
| from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE, LOWER | ||||
| from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER | ||||
| from ...util import registry | ||||
| from ..extract_ngrams import extract_ngrams | ||||
| 
 | ||||
|  | @ -50,14 +51,31 @@ def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO | |||
| 
 | ||||
| 
 | ||||
| @registry.architectures.register("spacy.TextCat.v1") | ||||
| def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_classes, ngram_size, | ||||
|                           window_size, conv_depth, dropout, nO=None): | ||||
| def build_text_classifier( | ||||
|     width, | ||||
|     embed_size, | ||||
|     pretrained_vectors, | ||||
|     exclusive_classes, | ||||
|     ngram_size, | ||||
|     window_size, | ||||
|     conv_depth, | ||||
|     dropout, | ||||
|     nO=None, | ||||
| ): | ||||
|     cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID] | ||||
|     with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): | ||||
|         lower = HashEmbed(nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout) | ||||
|         prefix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout) | ||||
|         suffix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout) | ||||
|         shape = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout) | ||||
|         lower = HashEmbed( | ||||
|             nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout | ||||
|         ) | ||||
|         prefix = HashEmbed( | ||||
|             nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout | ||||
|         ) | ||||
|         suffix = HashEmbed( | ||||
|             nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout | ||||
|         ) | ||||
|         shape = HashEmbed( | ||||
|             nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout | ||||
|         ) | ||||
| 
 | ||||
|         width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape]) | ||||
|         trained_vectors = FeatureExtractor(cols) >> with_array( | ||||
|  | @ -83,30 +101,38 @@ def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_class | |||
|             vectors_width = width | ||||
|         tok2vec = vector_layer >> with_array( | ||||
|             Maxout(width, vectors_width, normalize=True) | ||||
|             >> residual((expand_window(window_size=window_size) | ||||
|                          >> Maxout(nO=width, nI=width * ((window_size * 2) + 1), normalize=True))) ** conv_depth, | ||||
|             >> residual( | ||||
|                 ( | ||||
|                     expand_window(window_size=window_size) | ||||
|                     >> Maxout( | ||||
|                         nO=width, nI=width * ((window_size * 2) + 1), normalize=True | ||||
|                     ) | ||||
|                 ) | ||||
|             ) | ||||
|             ** conv_depth, | ||||
|             pad=conv_depth, | ||||
|         ) | ||||
|         cnn_model = ( | ||||
|                 tok2vec | ||||
|                 >> list2ragged() | ||||
|                 >> ParametricAttention(width) | ||||
|                 >> reduce_sum() | ||||
|                 >> residual(Maxout(nO=width, nI=width)) | ||||
|                 >> Linear(nO=nO, nI=width) | ||||
|                 >> Dropout(0.0) | ||||
|             tok2vec | ||||
|             >> list2ragged() | ||||
|             >> ParametricAttention(width) | ||||
|             >> reduce_sum() | ||||
|             >> residual(Maxout(nO=width, nI=width)) | ||||
|             >> Linear(nO=nO, nI=width) | ||||
|             >> Dropout(0.0) | ||||
|         ) | ||||
| 
 | ||||
|         linear_model = build_bow_text_classifier( | ||||
|             nO=nO, ngram_size=ngram_size, exclusive_classes=exclusive_classes, no_output_layer=False | ||||
|             nO=nO, | ||||
|             ngram_size=ngram_size, | ||||
|             exclusive_classes=exclusive_classes, | ||||
|             no_output_layer=False, | ||||
|         ) | ||||
|         nO_double = nO*2 if nO else None | ||||
|         nO_double = nO * 2 if nO else None | ||||
|         if exclusive_classes: | ||||
|             output_layer = Softmax(nO=nO, nI=nO_double) | ||||
|         else: | ||||
|             output_layer = ( | ||||
|                     Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic() | ||||
|             ) | ||||
|             output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic() | ||||
|         model = (linear_model | cnn_model) >> output_layer | ||||
|         model.set_ref("tok2vec", tok2vec) | ||||
|     if model.has_dim("nO") is not False: | ||||
|  |  | |||
|  | @ -99,7 +99,13 @@ def hash_charembed_cnn( | |||
| 
 | ||||
| @registry.architectures.register("spacy.HashEmbedBiLSTM.v1") | ||||
| def hash_embed_bilstm_v1( | ||||
|     pretrained_vectors, width, depth, embed_size, subword_features, maxout_pieces, dropout | ||||
|     pretrained_vectors, | ||||
|     width, | ||||
|     depth, | ||||
|     embed_size, | ||||
|     subword_features, | ||||
|     maxout_pieces, | ||||
|     dropout, | ||||
| ): | ||||
|     # Does not use character embeddings: set to False by default | ||||
|     return build_Tok2Vec_model( | ||||
|  | @ -141,21 +147,24 @@ def hash_char_embed_bilstm_v1( | |||
| 
 | ||||
| @registry.architectures.register("spacy.LayerNormalizedMaxout.v1") | ||||
| def LayerNormalizedMaxout(width, maxout_pieces): | ||||
|     return Maxout( | ||||
|         nO=width, | ||||
|         nP=maxout_pieces, | ||||
|         dropout=0.0, | ||||
|         normalize=True, | ||||
|     ) | ||||
|     return Maxout(nO=width, nP=maxout_pieces, dropout=0.0, normalize=True,) | ||||
| 
 | ||||
| 
 | ||||
| @registry.architectures.register("spacy.MultiHashEmbed.v1") | ||||
| def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix, dropout): | ||||
| def MultiHashEmbed( | ||||
|     columns, width, rows, use_subwords, pretrained_vectors, mix, dropout | ||||
| ): | ||||
|     norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout) | ||||
|     if use_subwords: | ||||
|         prefix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout) | ||||
|         suffix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout) | ||||
|         shape = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout) | ||||
|         prefix = HashEmbed( | ||||
|             nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout | ||||
|         ) | ||||
|         suffix = HashEmbed( | ||||
|             nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout | ||||
|         ) | ||||
|         shape = HashEmbed( | ||||
|             nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout | ||||
|         ) | ||||
| 
 | ||||
|     if pretrained_vectors: | ||||
|         glove = StaticVectors( | ||||
|  | @ -195,7 +204,13 @@ def CharacterEmbed(columns, width, rows, nM, nC, features, dropout): | |||
| def MaxoutWindowEncoder(width, window_size, maxout_pieces, depth): | ||||
|     cnn = chain( | ||||
|         expand_window(window_size=window_size), | ||||
|         Maxout(nO=width, nI=width * ((window_size * 2) + 1), nP=maxout_pieces, dropout=0.0, normalize=True), | ||||
|         Maxout( | ||||
|             nO=width, | ||||
|             nI=width * ((window_size * 2) + 1), | ||||
|             nP=maxout_pieces, | ||||
|             dropout=0.0, | ||||
|             normalize=True, | ||||
|         ), | ||||
|     ) | ||||
|     model = clone(residual(cnn), depth) | ||||
|     model.set_dim("nO", width) | ||||
|  | @ -247,11 +262,19 @@ def build_Tok2Vec_model( | |||
|         subword_features = False | ||||
|     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] | ||||
|     with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): | ||||
|         norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout) | ||||
|         norm = HashEmbed( | ||||
|             nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout | ||||
|         ) | ||||
|         if subword_features: | ||||
|             prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout) | ||||
|             suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout) | ||||
|             shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout) | ||||
|             prefix = HashEmbed( | ||||
|                 nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout | ||||
|             ) | ||||
|             suffix = HashEmbed( | ||||
|                 nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout | ||||
|             ) | ||||
|             shape = HashEmbed( | ||||
|                 nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout | ||||
|             ) | ||||
|         else: | ||||
|             prefix, suffix, shape = (None, None, None) | ||||
|         if pretrained_vectors is not None: | ||||
|  |  | |||
|  | @ -20,8 +20,8 @@ def TransitionModel(tok2vec, lower, upper, unseen_classes=set()): | |||
|         attrs={ | ||||
|             "has_upper": has_upper, | ||||
|             "unseen_classes": set(unseen_classes), | ||||
|             "resize_output": resize_output | ||||
|         } | ||||
|             "resize_output": resize_output, | ||||
|         }, | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
|  | @ -31,7 +31,7 @@ def forward(model, X, is_train): | |||
|         model.layers, | ||||
|         unseen_classes=model.attrs["unseen_classes"], | ||||
|         train=is_train, | ||||
|         has_upper=model.attrs["has_upper"] | ||||
|         has_upper=model.attrs["has_upper"], | ||||
|     ) | ||||
| 
 | ||||
|     return step_model, step_model.finish_steps | ||||
|  | @ -62,7 +62,7 @@ def resize_output(model, new_nO): | |||
|     nI = None | ||||
|     if smaller.has_dim("nI"): | ||||
|         nI = smaller.get_dim("nI") | ||||
|     with use_ops('numpy'): | ||||
|     with use_ops("numpy"): | ||||
|         larger = Linear(nO=new_nO, nI=nI) | ||||
|         larger.init = smaller.init | ||||
|     # it could be that the model is not initialized yet, then skip this bit | ||||
|  | @ -74,8 +74,8 @@ def resize_output(model, new_nO): | |||
|         # Weights are stored in (nr_out, nr_in) format, so we're basically | ||||
|         # just adding rows here. | ||||
|         if smaller.has_dim("nO"): | ||||
|             larger_W[:smaller.get_dim("nO")] = smaller_W | ||||
|             larger_b[:smaller.get_dim("nO")] = smaller_b | ||||
|             larger_W[: smaller.get_dim("nO")] = smaller_W | ||||
|             larger_b[: smaller.get_dim("nO")] = smaller_b | ||||
|             for i in range(smaller.get_dim("nO"), new_nO): | ||||
|                 model.attrs["unseen_classes"].add(i) | ||||
| 
 | ||||
|  |  | |||
|  | @ -21,9 +21,7 @@ class SimpleNER(Pipe): | |||
|         self.model = model | ||||
|         self.cfg = {"labels": []} | ||||
|         self.loss_func = SequenceCategoricalCrossentropy( | ||||
|             names=self.get_tag_names(), | ||||
|             normalize=True, | ||||
|             missing_value=None | ||||
|             names=self.get_tag_names(), normalize=True, missing_value=None | ||||
|         ) | ||||
|         assert self.model is not None | ||||
| 
 | ||||
|  | @ -38,21 +36,21 @@ class SimpleNER(Pipe): | |||
|     def add_label(self, label): | ||||
|         if label not in self.cfg["labels"]: | ||||
|             self.cfg["labels"].append(label) | ||||
|   | ||||
| 
 | ||||
|     def get_tag_names(self): | ||||
|         if self.is_biluo: | ||||
|             return ( | ||||
|                 [f"B-{label}" for label in self.labels] + | ||||
|                 [f"I-{label}" for label in self.labels] + | ||||
|                 [f"L-{label}" for label in self.labels] + | ||||
|                 [f"U-{label}" for label in self.labels] + | ||||
|                 ["O"] | ||||
|                 [f"B-{label}" for label in self.labels] | ||||
|                 + [f"I-{label}" for label in self.labels] | ||||
|                 + [f"L-{label}" for label in self.labels] | ||||
|                 + [f"U-{label}" for label in self.labels] | ||||
|                 + ["O"] | ||||
|             ) | ||||
|         else: | ||||
|             return ( | ||||
|                 [f"B-{label}" for label in self.labels] + | ||||
|                 [f"I-{label}" for label in self.labels] + | ||||
|                 ["O"] | ||||
|                 [f"B-{label}" for label in self.labels] | ||||
|                 + [f"I-{label}" for label in self.labels] | ||||
|                 + ["O"] | ||||
|             ) | ||||
| 
 | ||||
|     def predict(self, docs: List[Doc]) -> List[Floats2d]: | ||||
|  | @ -108,7 +106,7 @@ class SimpleNER(Pipe): | |||
| 
 | ||||
|     def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs): | ||||
|         self.cfg.update(kwargs) | ||||
|         if not hasattr(get_examples, '__call__'): | ||||
|         if not hasattr(get_examples, "__call__"): | ||||
|             gold_tuples = get_examples | ||||
|             get_examples = lambda: gold_tuples | ||||
|         labels = _get_labels(get_examples()) | ||||
|  | @ -117,14 +115,12 @@ class SimpleNER(Pipe): | |||
|         labels = self.labels | ||||
|         n_actions = self.model.attrs["get_num_actions"](len(labels)) | ||||
|         self.model.set_dim("nO", n_actions) | ||||
|         self.model.initialize()  | ||||
|         self.model.initialize() | ||||
|         if pipeline is not None: | ||||
|             self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg) | ||||
|         link_vectors_to_models(self.vocab) | ||||
|         self.loss_func = SequenceCategoricalCrossentropy( | ||||
|             names=self.get_tag_names(), | ||||
|             normalize=True, | ||||
|             missing_value=None | ||||
|             names=self.get_tag_names(), normalize=True, missing_value=None | ||||
|         ) | ||||
| 
 | ||||
|         return sgd | ||||
|  | @ -135,7 +131,7 @@ class SimpleNER(Pipe): | |||
| 
 | ||||
| def _has_ner(eg): | ||||
|     for ner_tag in eg.gold.ner: | ||||
|         if ner_tag != "-" and ner_tag != None: | ||||
|         if ner_tag != "-" and ner_tag is not None: | ||||
|             return True | ||||
|     else: | ||||
|         return False | ||||
|  | @ -145,7 +141,7 @@ def _get_labels(examples): | |||
|     labels = set() | ||||
|     for eg in examples: | ||||
|         for ner_tag in eg.token_annotation.entities: | ||||
|             if ner_tag != 'O' and ner_tag != '-': | ||||
|                 _, label = ner_tag.split('-', 1) | ||||
|             if ner_tag != "O" and ner_tag != "-": | ||||
|                 _, label = ner_tag.split("-", 1) | ||||
|                 labels.add(label) | ||||
|     return list(sorted(labels)) | ||||
|  |  | |||
|  | @ -98,7 +98,9 @@ class Scorer(object): | |||
|             for name, component in pipeline: | ||||
|                 if name == "textcat": | ||||
|                     self.textcat_multilabel = component.model.attrs["multi_label"] | ||||
|                     self.textcat_positive_label = component.cfg.get("positive_label", None) | ||||
|                     self.textcat_positive_label = component.cfg.get( | ||||
|                         "positive_label", None | ||||
|                     ) | ||||
|                     for label in component.cfg.get("labels", []): | ||||
|                         self.textcat_auc_per_cat[label] = ROCAUCScore() | ||||
|                         self.textcat_f_per_cat[label] = PRFScore() | ||||
|  | @ -119,19 +121,19 @@ class Scorer(object): | |||
| 
 | ||||
|     @property | ||||
|     def morphs_acc(self): | ||||
|        """RETURNS (float): Morph tag accuracy (morphological features, | ||||
|         """RETURNS (float): Morph tag accuracy (morphological features, | ||||
|            i.e. `Token.morph`). | ||||
|        """ | ||||
|        return self.morphs.fscore * 100 | ||||
|         return self.morphs.fscore * 100 | ||||
| 
 | ||||
|     @property | ||||
|     def morphs_per_type(self): | ||||
|        """RETURNS (dict): Scores per dependency label. | ||||
|         """RETURNS (dict): Scores per dependency label. | ||||
|        """ | ||||
|        return { | ||||
|            k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100} | ||||
|            for k, v in self.morphs_per_feat.items() | ||||
|        } | ||||
|         return { | ||||
|             k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100} | ||||
|             for k, v in self.morphs_per_feat.items() | ||||
|         } | ||||
| 
 | ||||
|     @property | ||||
|     def sent_p(self): | ||||
|  | @ -302,7 +304,15 @@ class Scorer(object): | |||
|         gold_morphs_per_feat = {} | ||||
|         gold_sent_starts = set() | ||||
|         gold_ents = set(tags_to_entities(orig.entities)) | ||||
|         for id_, tag, pos, morph, head, dep, sent_start in zip(orig.ids, orig.tags, orig.pos, orig.morphs, orig.heads, orig.deps, orig.sent_starts): | ||||
|         for id_, tag, pos, morph, head, dep, sent_start in zip( | ||||
|             orig.ids, | ||||
|             orig.tags, | ||||
|             orig.pos, | ||||
|             orig.morphs, | ||||
|             orig.heads, | ||||
|             orig.deps, | ||||
|             orig.sent_starts, | ||||
|         ): | ||||
|             gold_tags.add((id_, tag)) | ||||
|             gold_pos.add((id_, pos)) | ||||
|             gold_morphs.add((id_, morph)) | ||||
|  | @ -400,7 +410,10 @@ class Scorer(object): | |||
|         self.pos.score_set(cand_pos, gold_pos) | ||||
|         self.morphs.score_set(cand_morphs, gold_morphs) | ||||
|         for field in self.morphs_per_feat: | ||||
|             self.morphs_per_feat[field].score_set(cand_morphs_per_feat.get(field, set()), gold_morphs_per_feat.get(field, set())) | ||||
|             self.morphs_per_feat[field].score_set( | ||||
|                 cand_morphs_per_feat.get(field, set()), | ||||
|                 gold_morphs_per_feat.get(field, set()), | ||||
|             ) | ||||
|         self.sent_starts.score_set(cand_sent_starts, gold_sent_starts) | ||||
|         self.labelled.score_set(cand_deps, gold_deps) | ||||
|         for dep in self.labelled_per_dep: | ||||
|  | @ -412,7 +425,9 @@ class Scorer(object): | |||
|         ) | ||||
|         if ( | ||||
|             len(gold.cats) > 0 | ||||
|             and set(self.textcat_f_per_cat) == set(self.textcat_auc_per_cat) == set(gold.cats) | ||||
|             and set(self.textcat_f_per_cat) | ||||
|             == set(self.textcat_auc_per_cat) | ||||
|             == set(gold.cats) | ||||
|             and set(gold.cats) == set(doc.cats) | ||||
|         ): | ||||
|             goldcat = max(gold.cats, key=gold.cats.get) | ||||
|  | @ -424,10 +439,10 @@ class Scorer(object): | |||
|                 ) | ||||
|             for label in set(gold.cats): | ||||
|                 self.textcat_auc_per_cat[label].score_set( | ||||
|                         doc.cats[label], gold.cats[label] | ||||
|                     doc.cats[label], gold.cats[label] | ||||
|                 ) | ||||
|                 self.textcat_f_per_cat[label].score_set( | ||||
|                         set([label]) & set([candcat]), set([label]) & set([goldcat]) | ||||
|                     set([label]) & set([candcat]), set([label]) & set([goldcat]) | ||||
|                 ) | ||||
|         elif len(self.textcat_f_per_cat) > 0: | ||||
|             model_labels = set(self.textcat_f_per_cat) | ||||
|  |  | |||
|  | @ -9,7 +9,12 @@ from spacy.pipeline.defaults import default_ner | |||
| def test_doc_add_entities_set_ents_iob(en_vocab): | ||||
|     text = ["This", "is", "a", "lion"] | ||||
|     doc = get_doc(en_vocab, text) | ||||
|     config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} | ||||
|     config = { | ||||
|         "learn_tokens": False, | ||||
|         "min_action_freq": 30, | ||||
|         "beam_width": 1, | ||||
|         "beam_update_prob": 1.0, | ||||
|     } | ||||
|     ner = EntityRecognizer(en_vocab, default_ner(), **config) | ||||
|     ner.begin_training([]) | ||||
|     ner(doc) | ||||
|  | @ -26,7 +31,12 @@ def test_doc_add_entities_set_ents_iob(en_vocab): | |||
| def test_ents_reset(en_vocab): | ||||
|     text = ["This", "is", "a", "lion"] | ||||
|     doc = get_doc(en_vocab, text) | ||||
|     config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} | ||||
|     config = { | ||||
|         "learn_tokens": False, | ||||
|         "min_action_freq": 30, | ||||
|         "beam_width": 1, | ||||
|         "beam_update_prob": 1.0, | ||||
|     } | ||||
|     ner = EntityRecognizer(en_vocab, default_ner(), **config) | ||||
|     ner.begin_training([]) | ||||
|     ner(doc) | ||||
|  |  | |||
|  | @ -1,9 +1,8 @@ | |||
| import pytest | ||||
| from thinc.api import Adam, NumpyOps | ||||
| from thinc.api import Adam | ||||
| from spacy.attrs import NORM | ||||
| from spacy.gold import GoldParse | ||||
| from spacy.vocab import Vocab | ||||
| 
 | ||||
| from spacy.pipeline.defaults import default_parser, default_ner | ||||
| from spacy.tokens import Doc | ||||
| from spacy.pipeline import DependencyParser, EntityRecognizer | ||||
|  | @ -17,7 +16,12 @@ def vocab(): | |||
| 
 | ||||
| @pytest.fixture | ||||
| def parser(vocab): | ||||
|     config = {"learn_tokens": False, "min_action_freq": 30, "beam_width":  1, "beam_update_prob": 1.0} | ||||
|     config = { | ||||
|         "learn_tokens": False, | ||||
|         "min_action_freq": 30, | ||||
|         "beam_width": 1, | ||||
|         "beam_update_prob": 1.0, | ||||
|     } | ||||
|     parser = DependencyParser(vocab, default_parser(), **config) | ||||
|     return parser | ||||
| 
 | ||||
|  | @ -58,7 +62,12 @@ def test_add_label(parser): | |||
| 
 | ||||
| 
 | ||||
| def test_add_label_deserializes_correctly(): | ||||
|     config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} | ||||
|     config = { | ||||
|         "learn_tokens": False, | ||||
|         "min_action_freq": 30, | ||||
|         "beam_width": 1, | ||||
|         "beam_update_prob": 1.0, | ||||
|     } | ||||
|     ner1 = EntityRecognizer(Vocab(), default_ner(), **config) | ||||
|     ner1.add_label("C") | ||||
|     ner1.add_label("B") | ||||
|  |  | |||
|  | @ -138,7 +138,12 @@ def test_get_oracle_actions(): | |||
|         deps.append(dep) | ||||
|         ents.append(ent) | ||||
|     doc = Doc(Vocab(), words=[t[1] for t in annot_tuples]) | ||||
|     config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} | ||||
|     config = { | ||||
|         "learn_tokens": False, | ||||
|         "min_action_freq": 30, | ||||
|         "beam_width": 1, | ||||
|         "beam_update_prob": 1.0, | ||||
|     } | ||||
|     parser = DependencyParser(doc.vocab, default_parser(), **config) | ||||
|     parser.moves.add_action(0, "") | ||||
|     parser.moves.add_action(1, "") | ||||
|  |  | |||
|  | @ -138,7 +138,12 @@ def test_accept_blocked_token(): | |||
|     # 1. test normal behaviour | ||||
|     nlp1 = English() | ||||
|     doc1 = nlp1("I live in New York") | ||||
|     config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} | ||||
|     config = { | ||||
|         "learn_tokens": False, | ||||
|         "min_action_freq": 30, | ||||
|         "beam_width": 1, | ||||
|         "beam_update_prob": 1.0, | ||||
|     } | ||||
|     ner1 = EntityRecognizer(doc1.vocab, default_ner(), **config) | ||||
|     assert [token.ent_iob_ for token in doc1] == ["", "", "", "", ""] | ||||
|     assert [token.ent_type_ for token in doc1] == ["", "", "", "", ""] | ||||
|  | @ -157,7 +162,12 @@ def test_accept_blocked_token(): | |||
|     # 2. test blocking behaviour | ||||
|     nlp2 = English() | ||||
|     doc2 = nlp2("I live in New York") | ||||
|     config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} | ||||
|     config = { | ||||
|         "learn_tokens": False, | ||||
|         "min_action_freq": 30, | ||||
|         "beam_width": 1, | ||||
|         "beam_update_prob": 1.0, | ||||
|     } | ||||
|     ner2 = EntityRecognizer(doc2.vocab, default_ner(), **config) | ||||
| 
 | ||||
|     # set "New York" to a blocked entity | ||||
|  | @ -215,7 +225,12 @@ def test_overwrite_token(): | |||
|     assert [token.ent_type_ for token in doc] == ["", "", "", "", ""] | ||||
| 
 | ||||
|     # Check that a new ner can overwrite O | ||||
|     config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} | ||||
|     config = { | ||||
|         "learn_tokens": False, | ||||
|         "min_action_freq": 30, | ||||
|         "beam_width": 1, | ||||
|         "beam_update_prob": 1.0, | ||||
|     } | ||||
|     ner2 = EntityRecognizer(doc.vocab, default_ner(), **config) | ||||
|     ner2.moves.add_action(5, "") | ||||
|     ner2.add_label("GPE") | ||||
|  |  | |||
|  | @ -28,7 +28,12 @@ def tok2vec(): | |||
| 
 | ||||
| @pytest.fixture | ||||
| def parser(vocab, arc_eager): | ||||
|     config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} | ||||
|     config = { | ||||
|         "learn_tokens": False, | ||||
|         "min_action_freq": 30, | ||||
|         "beam_width": 1, | ||||
|         "beam_update_prob": 1.0, | ||||
|     } | ||||
|     return Parser(vocab, model=default_parser(), moves=arc_eager, **config) | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -94,7 +94,12 @@ def test_beam_advance_too_few_scores(beam, scores): | |||
| 
 | ||||
| def test_beam_parse(): | ||||
|     nlp = Language() | ||||
|     config = {"learn_tokens": False, "min_action_freq": 30, "beam_width":  1, "beam_update_prob": 1.0} | ||||
|     config = { | ||||
|         "learn_tokens": False, | ||||
|         "min_action_freq": 30, | ||||
|         "beam_width": 1, | ||||
|         "beam_update_prob": 1.0, | ||||
|     } | ||||
|     nlp.add_pipe(DependencyParser(nlp.vocab, default_parser(), **config), name="parser") | ||||
|     nlp.parser.add_label("nsubj") | ||||
|     nlp.parser.begin_training([], token_vector_width=8, hidden_width=8) | ||||
|  |  | |||
|  | @ -16,7 +16,12 @@ def vocab(): | |||
| 
 | ||||
| @pytest.fixture | ||||
| def parser(vocab): | ||||
|     config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} | ||||
|     config = { | ||||
|         "learn_tokens": False, | ||||
|         "min_action_freq": 30, | ||||
|         "beam_width": 1, | ||||
|         "beam_update_prob": 1.0, | ||||
|     } | ||||
|     parser = DependencyParser(vocab, default_parser(), **config) | ||||
|     parser.cfg["token_vector_width"] = 4 | ||||
|     parser.cfg["hidden_width"] = 32 | ||||
|  |  | |||
|  | @ -264,11 +264,13 @@ GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"] | |||
| def test_overfitting_IO(): | ||||
|     # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly | ||||
|     nlp = English() | ||||
|     nlp.add_pipe(nlp.create_pipe('sentencizer')) | ||||
|     nlp.add_pipe(nlp.create_pipe("sentencizer")) | ||||
| 
 | ||||
|     # Add a custom component to recognize "Russ Cochran" as an entity for the example training data | ||||
|     ruler = EntityRuler(nlp) | ||||
|     patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}] | ||||
|     patterns = [ | ||||
|         {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]} | ||||
|     ] | ||||
|     ruler.add_patterns(patterns) | ||||
|     nlp.add_pipe(ruler) | ||||
| 
 | ||||
|  | @ -285,7 +287,11 @@ def test_overfitting_IO(): | |||
|     mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3) | ||||
|     mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) | ||||
|     mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7]) | ||||
|     mykb.add_alias(alias="Russ Cochran", entities=["Q2146908", "Q7381115"], probabilities=[0.5, 0.5]) | ||||
|     mykb.add_alias( | ||||
|         alias="Russ Cochran", | ||||
|         entities=["Q2146908", "Q7381115"], | ||||
|         probabilities=[0.5, 0.5], | ||||
|     ) | ||||
| 
 | ||||
|     # Create the Entity Linker component and add it to the pipeline | ||||
|     entity_linker = nlp.create_pipe("entity_linker", config={"kb": mykb}) | ||||
|  |  | |||
|  | @ -15,8 +15,17 @@ def test_label_types(): | |||
| 
 | ||||
| 
 | ||||
| TRAIN_DATA = [ | ||||
|     ("I like green eggs", {"morphs": ["Feat=N", "Feat=V", "Feat=J", "Feat=N"], "pos": ["NOUN", "VERB", "ADJ", "NOUN"]}), | ||||
|     ("Eat blue ham", {"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]}), | ||||
|     ( | ||||
|         "I like green eggs", | ||||
|         { | ||||
|             "morphs": ["Feat=N", "Feat=V", "Feat=J", "Feat=N"], | ||||
|             "pos": ["NOUN", "VERB", "ADJ", "NOUN"], | ||||
|         }, | ||||
|     ), | ||||
|     ( | ||||
|         "Eat blue ham", | ||||
|         {"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]}, | ||||
|     ), | ||||
| ] | ||||
| 
 | ||||
| 
 | ||||
|  | @ -38,7 +47,12 @@ def test_overfitting_IO(): | |||
|     # test the trained model | ||||
|     test_text = "I like blue eggs" | ||||
|     doc = nlp(test_text) | ||||
|     gold_morphs = ["Feat=N|POS=NOUN", "Feat=V|POS=VERB", "Feat=J|POS=ADJ", "Feat=N|POS=NOUN"] | ||||
|     gold_morphs = [ | ||||
|         "Feat=N|POS=NOUN", | ||||
|         "Feat=V|POS=VERB", | ||||
|         "Feat=J|POS=ADJ", | ||||
|         "Feat=N|POS=NOUN", | ||||
|     ] | ||||
|     assert gold_morphs == [t.morph_ for t in doc] | ||||
| 
 | ||||
|     # Also test the results are still the same after IO | ||||
|  |  | |||
|  | @ -1,30 +1,31 @@ | |||
| import pytest | ||||
| from collections import namedtuple | ||||
| 
 | ||||
| from thinc.api import NumpyOps | ||||
| from spacy.ml._biluo import BILUO, _get_transition_table | ||||
| from spacy.pipeline.simple_ner import SimpleNER | ||||
| import spacy | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture(params=[ | ||||
|     ["PER", "ORG", "LOC", "MISC"], | ||||
|     ["GPE", "PERSON", "NUMBER", "CURRENCY", "EVENT"] | ||||
| ]) | ||||
| @pytest.fixture( | ||||
|     params=[ | ||||
|         ["PER", "ORG", "LOC", "MISC"], | ||||
|         ["GPE", "PERSON", "NUMBER", "CURRENCY", "EVENT"], | ||||
|     ] | ||||
| ) | ||||
| def labels(request): | ||||
|     return request.param | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def ops(): | ||||
|     return NumpyOps() | ||||
| 
 | ||||
| 
 | ||||
| def _get_actions(labels): | ||||
|     action_names = ( | ||||
|         [f"B{label}" for label in labels] + \ | ||||
|         [f"I{label}" for label in labels] + \ | ||||
|         [f"L{label}" for label in labels] + \ | ||||
|         [f"U{label}" for label in labels] + \ | ||||
|         ["O"] | ||||
|         [f"B{label}" for label in labels] | ||||
|         + [f"I{label}" for label in labels] | ||||
|         + [f"L{label}" for label in labels] | ||||
|         + [f"U{label}" for label in labels] | ||||
|         + ["O"] | ||||
|     ) | ||||
|     A = namedtuple("actions", action_names) | ||||
|     return A(**{name: i for i, name in enumerate(action_names)}) | ||||
|  | @ -228,7 +229,7 @@ def test_transition_table(ops): | |||
|     assert table[0, a.O, a.Uloc] == 1 | ||||
|     assert table[0, a.O, a.Uorg] == 1 | ||||
|     assert table[0, a.O, a.O] == 1 | ||||
|      | ||||
| 
 | ||||
|     # Last token, prev action was B | ||||
|     assert table[1, a.Bper, a.Bper] == 0 | ||||
|     assert table[1, a.Bper, a.Bloc] == 0 | ||||
|  |  | |||
|  | @ -270,7 +270,12 @@ def test_issue1963(en_tokenizer): | |||
| 
 | ||||
| @pytest.mark.parametrize("label", ["U-JOB-NAME"]) | ||||
| def test_issue1967(label): | ||||
|     config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} | ||||
|     config = { | ||||
|         "learn_tokens": False, | ||||
|         "min_action_freq": 30, | ||||
|         "beam_width": 1, | ||||
|         "beam_update_prob": 1.0, | ||||
|     } | ||||
|     ner = EntityRecognizer(Vocab(), default_ner(), **config) | ||||
|     example = Example(doc=None) | ||||
|     example.set_token_annotation( | ||||
|  |  | |||
|  | @ -196,7 +196,12 @@ def test_issue3345(): | |||
|     doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"]) | ||||
|     doc[4].is_sent_start = True | ||||
|     ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}]) | ||||
|     config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} | ||||
|     config = { | ||||
|         "learn_tokens": False, | ||||
|         "min_action_freq": 30, | ||||
|         "beam_width": 1, | ||||
|         "beam_update_prob": 1.0, | ||||
|     } | ||||
|     ner = EntityRecognizer(doc.vocab, default_ner(), **config) | ||||
|     # Add the OUT action. I wouldn't have thought this would be necessary... | ||||
|     ner.moves.add_action(5, "") | ||||
|  |  | |||
|  | @ -6,7 +6,12 @@ from spacy.pipeline.defaults import default_parser | |||
| 
 | ||||
| def test_issue3830_no_subtok(): | ||||
|     """Test that the parser doesn't have subtok label if not learn_tokens""" | ||||
|     config = {"learn_tokens": False, "min_action_freq": 30, "beam_width":  1, "beam_update_prob": 1.0} | ||||
|     config = { | ||||
|         "learn_tokens": False, | ||||
|         "min_action_freq": 30, | ||||
|         "beam_width": 1, | ||||
|         "beam_update_prob": 1.0, | ||||
|     } | ||||
|     parser = DependencyParser(Vocab(), default_parser(), **config) | ||||
|     parser.add_label("nsubj") | ||||
|     assert "subtok" not in parser.labels | ||||
|  | @ -16,7 +21,12 @@ def test_issue3830_no_subtok(): | |||
| 
 | ||||
| def test_issue3830_with_subtok(): | ||||
|     """Test that the parser does have subtok label if learn_tokens=True.""" | ||||
|     config = {"learn_tokens": True, "min_action_freq": 30, "beam_width":  1, "beam_update_prob": 1.0} | ||||
|     config = { | ||||
|         "learn_tokens": True, | ||||
|         "min_action_freq": 30, | ||||
|         "beam_width": 1, | ||||
|         "beam_update_prob": 1.0, | ||||
|     } | ||||
|     parser = DependencyParser(Vocab(), default_parser(), **config) | ||||
|     parser.add_label("nsubj") | ||||
|     assert "subtok" not in parser.labels | ||||
|  |  | |||
|  | @ -74,7 +74,12 @@ def test_issue4042_bug2(): | |||
|             output_dir.mkdir() | ||||
|         ner1.to_disk(output_dir) | ||||
| 
 | ||||
|         config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} | ||||
|         config = { | ||||
|             "learn_tokens": False, | ||||
|             "min_action_freq": 30, | ||||
|             "beam_width": 1, | ||||
|             "beam_update_prob": 1.0, | ||||
|         } | ||||
|         ner2 = EntityRecognizer(vocab, default_ner(), **config) | ||||
|         ner2.from_disk(output_dir) | ||||
|         assert len(ner2.labels) == 2 | ||||
|  |  | |||
|  | @ -12,7 +12,12 @@ def test_issue4313(): | |||
|     beam_width = 16 | ||||
|     beam_density = 0.0001 | ||||
|     nlp = English() | ||||
|     config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} | ||||
|     config = { | ||||
|         "learn_tokens": False, | ||||
|         "min_action_freq": 30, | ||||
|         "beam_width": 1, | ||||
|         "beam_update_prob": 1.0, | ||||
|     } | ||||
|     ner = EntityRecognizer(nlp.vocab, default_ner(), **config) | ||||
|     ner.add_label("SOME_LABEL") | ||||
|     ner.begin_training([]) | ||||
|  |  | |||
|  | @ -1,4 +1,3 @@ | |||
| import pytest | ||||
| from spacy.language import Language | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -12,7 +12,12 @@ test_parsers = [DependencyParser, EntityRecognizer] | |||
| 
 | ||||
| @pytest.fixture | ||||
| def parser(en_vocab): | ||||
|     config = {"learn_tokens": False, "min_action_freq": 30, "beam_width":  1, "beam_update_prob": 1.0} | ||||
|     config = { | ||||
|         "learn_tokens": False, | ||||
|         "min_action_freq": 30, | ||||
|         "beam_width": 1, | ||||
|         "beam_update_prob": 1.0, | ||||
|     } | ||||
|     parser = DependencyParser(en_vocab, default_parser(), **config) | ||||
|     parser.add_label("nsubj") | ||||
|     return parser | ||||
|  |  | |||
|  | @ -35,8 +35,10 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2): | |||
|     assert vocab1.to_bytes() == vocab1_b | ||||
|     new_vocab1 = Vocab().from_bytes(vocab1_b) | ||||
|     assert new_vocab1.to_bytes() == vocab1_b | ||||
|     assert len(new_vocab1.strings) == len(strings1) + 2 # adds _SP and POS=SPACE | ||||
|     assert sorted([s for s in new_vocab1.strings]) == sorted(strings1 + list(default_strings)) | ||||
|     assert len(new_vocab1.strings) == len(strings1) + 2  # adds _SP and POS=SPACE | ||||
|     assert sorted([s for s in new_vocab1.strings]) == sorted( | ||||
|         strings1 + list(default_strings) | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize("strings1,strings2", test_strings) | ||||
|  |  | |||
|  | @ -40,6 +40,7 @@ test_ner_apple = [ | |||
|     ] | ||||
| ] | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def tagged_doc(): | ||||
|     text = "Sarah's sister flew to Silicon Valley via London." | ||||
|  | @ -184,7 +185,7 @@ def test_tag_score(tagged_doc): | |||
|         tagged_doc, | ||||
|         tags=[t.tag_ for t in tagged_doc], | ||||
|         pos=[t.pos_ for t in tagged_doc], | ||||
|         morphs=[t.morph_ for t in tagged_doc] | ||||
|         morphs=[t.morph_ for t in tagged_doc], | ||||
|     ) | ||||
|     scorer.score((tagged_doc, gold)) | ||||
|     results = scorer.scores | ||||
|  |  | |||
|  | @ -13,7 +13,7 @@ from spacy.util import minibatch_by_words | |||
|         ([400, 400, 199, 3], [4]), | ||||
|         ([400, 400, 199, 3, 200], [3, 2]), | ||||
|         ([400, 400, 199, 3, 1], [5]), | ||||
|         ([400, 400, 199, 3, 1, 1500], [5]),    # 1500 will be discarded | ||||
|         ([400, 400, 199, 3, 1, 1500], [5]),  # 1500 will be discarded | ||||
|         ([400, 400, 199, 3, 1, 200], [3, 3]), | ||||
|         ([400, 400, 199, 3, 1, 999], [3, 3]), | ||||
|         ([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]), | ||||
|  | @ -28,7 +28,11 @@ def test_util_minibatch(doc_sizes, expected_batches): | |||
|     examples = [Example(doc=doc) for doc in docs] | ||||
|     tol = 0.2 | ||||
|     batch_size = 1000 | ||||
|     batches = list(minibatch_by_words(examples=examples, size=batch_size, tolerance=tol, discard_oversize=True)) | ||||
|     batches = list( | ||||
|         minibatch_by_words( | ||||
|             examples=examples, size=batch_size, tolerance=tol, discard_oversize=True | ||||
|         ) | ||||
|     ) | ||||
|     assert [len(batch) for batch in batches] == expected_batches | ||||
| 
 | ||||
|     max_size = batch_size + batch_size * tol | ||||
|  | @ -53,7 +57,9 @@ def test_util_minibatch_oversize(doc_sizes, expected_batches): | |||
|     examples = [Example(doc=doc) for doc in docs] | ||||
|     tol = 0.2 | ||||
|     batch_size = 1000 | ||||
|     batches = list(minibatch_by_words(examples=examples, size=batch_size, tolerance=tol, discard_oversize=False)) | ||||
|     batches = list( | ||||
|         minibatch_by_words( | ||||
|             examples=examples, size=batch_size, tolerance=tol, discard_oversize=False | ||||
|         ) | ||||
|     ) | ||||
|     assert [len(batch) for batch in batches] == expected_batches | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -697,7 +697,9 @@ def decaying(start, stop, decay): | |||
|         curr -= decay | ||||
| 
 | ||||
| 
 | ||||
| def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_oversize=False): | ||||
| def minibatch_by_words( | ||||
|     examples, size, count_words=len, tolerance=0.2, discard_oversize=False | ||||
| ): | ||||
|     """Create minibatches of roughly a given number of words. If any examples | ||||
|     are longer than the specified batch length, they will appear in a batch by | ||||
|     themselves, or be discarded if discard_oversize=True.""" | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user