mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-24 20:51:30 +03:00 
			
		
		
		
	Merge branch 'develop' into nightly.spacy.io
This commit is contained in:
		
						commit
						1075b7ebb7
					
				|  | @ -8,7 +8,6 @@ requires = [ | |||
|     "murmurhash>=0.28.0,<1.1.0", | ||||
|     "thinc>=8.0.0rc0,<8.1.0", | ||||
|     "blis>=0.4.0,<0.8.0", | ||||
|     "pytokenizations", | ||||
|     "pathy" | ||||
| ] | ||||
| build-backend = "setuptools.build_meta" | ||||
|  |  | |||
|  | @ -14,8 +14,7 @@ pathy | |||
| numpy>=1.15.0 | ||||
| requests>=2.13.0,<3.0.0 | ||||
| tqdm>=4.38.0,<5.0.0 | ||||
| pydantic>=1.5.0,<2.0.0 | ||||
| pytokenizations | ||||
| pydantic>=1.5.0,<1.7.0 | ||||
| # Official Python utilities | ||||
| setuptools | ||||
| packaging>=20.0 | ||||
|  |  | |||
|  | @ -51,8 +51,8 @@ install_requires = | |||
|     tqdm>=4.38.0,<5.0.0 | ||||
|     numpy>=1.15.0 | ||||
|     requests>=2.13.0,<3.0.0 | ||||
|     pydantic>=1.5.0,<2.0.0 | ||||
|     pytokenizations | ||||
|     pydantic>=1.5.0,<1.7.0 | ||||
|     jinja2 | ||||
|     # Official Python utilities | ||||
|     setuptools | ||||
|     packaging>=20.0 | ||||
|  |  | |||
							
								
								
									
										1
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										1
									
								
								setup.py
									
									
									
									
									
								
							|  | @ -49,6 +49,7 @@ MOD_NAMES = [ | |||
|     "spacy.pipeline._parser_internals.stateclass", | ||||
|     "spacy.pipeline._parser_internals.transition_system", | ||||
|     "spacy.tokenizer", | ||||
|     "spacy.training.align", | ||||
|     "spacy.training.gold_io", | ||||
|     "spacy.tokens.doc", | ||||
|     "spacy.tokens.span", | ||||
|  |  | |||
|  | @ -1,6 +1,6 @@ | |||
| # fmt: off | ||||
| __title__ = "spacy-nightly" | ||||
| __version__ = "3.0.0rc1" | ||||
| __version__ = "3.0.0rc2" | ||||
| __download_url__ = "https://github.com/explosion/spacy-models/releases/download" | ||||
| __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" | ||||
| __projects__ = "https://github.com/explosion/projects" | ||||
|  |  | |||
|  | @ -93,27 +93,42 @@ def evaluate( | |||
|         "SPEED": "speed", | ||||
|     } | ||||
|     results = {} | ||||
|     data = {} | ||||
|     for metric, key in metrics.items(): | ||||
|         if key in scores: | ||||
|             if key == "cats_score": | ||||
|                 metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")" | ||||
|             if key == "speed": | ||||
|                 results[metric] = f"{scores[key]:.0f}" | ||||
|             if isinstance(scores[key], (int, float)): | ||||
|                 if key == "speed": | ||||
|                     results[metric] = f"{scores[key]:.0f}" | ||||
|                 else: | ||||
|                     results[metric] = f"{scores[key]*100:.2f}" | ||||
|             else: | ||||
|                 results[metric] = f"{scores[key]*100:.2f}" | ||||
|     data = {re.sub(r"[\s/]", "_", k.lower()): v for k, v in results.items()} | ||||
|                 results[metric] = "-" | ||||
|             data[re.sub(r"[\s/]", "_", key.lower())] = scores[key] | ||||
| 
 | ||||
|     msg.table(results, title="Results") | ||||
| 
 | ||||
|     if "morph_per_feat" in scores: | ||||
|         if scores["morph_per_feat"]: | ||||
|             print_prf_per_type(msg, scores["morph_per_feat"], "MORPH", "feat") | ||||
|             data["morph_per_feat"] = scores["morph_per_feat"] | ||||
|     if "dep_las_per_type" in scores: | ||||
|         if scores["dep_las_per_type"]: | ||||
|             print_prf_per_type(msg, scores["dep_las_per_type"], "LAS", "type") | ||||
|             data["dep_las_per_type"] = scores["dep_las_per_type"] | ||||
|     if "ents_per_type" in scores: | ||||
|         if scores["ents_per_type"]: | ||||
|             print_ents_per_type(msg, scores["ents_per_type"]) | ||||
|             print_prf_per_type(msg, scores["ents_per_type"], "NER", "type") | ||||
|             data["ents_per_type"] = scores["ents_per_type"] | ||||
|     if "cats_f_per_type" in scores: | ||||
|         if scores["cats_f_per_type"]: | ||||
|             print_textcats_f_per_cat(msg, scores["cats_f_per_type"]) | ||||
|             print_prf_per_type(msg, scores["cats_f_per_type"], "Textcat F", "label") | ||||
|             data["cats_f_per_type"] = scores["cats_f_per_type"] | ||||
|     if "cats_auc_per_type" in scores: | ||||
|         if scores["cats_auc_per_type"]: | ||||
|             print_textcats_auc_per_cat(msg, scores["cats_auc_per_type"]) | ||||
|             data["cats_auc_per_type"] = scores["cats_auc_per_type"] | ||||
| 
 | ||||
|     if displacy_path: | ||||
|         factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names] | ||||
|  | @ -157,7 +172,7 @@ def render_parses( | |||
|             file_.write(html) | ||||
| 
 | ||||
| 
 | ||||
| def print_ents_per_type(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None: | ||||
| def print_prf_per_type(msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str) -> None: | ||||
|     data = [ | ||||
|         (k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}") | ||||
|         for k, v in scores.items() | ||||
|  | @ -166,20 +181,7 @@ def print_ents_per_type(msg: Printer, scores: Dict[str, Dict[str, float]]) -> No | |||
|         data, | ||||
|         header=("", "P", "R", "F"), | ||||
|         aligns=("l", "r", "r", "r"), | ||||
|         title="NER (per type)", | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| def print_textcats_f_per_cat(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None: | ||||
|     data = [ | ||||
|         (k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}") | ||||
|         for k, v in scores.items() | ||||
|     ] | ||||
|     msg.table( | ||||
|         data, | ||||
|         header=("", "P", "R", "F"), | ||||
|         aligns=("l", "r", "r", "r"), | ||||
|         title="Textcat F (per label)", | ||||
|         title=f"{name} (per {type})", | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -39,7 +39,7 @@ def init_vectors_cli( | |||
|     nlp.to_disk(output_dir) | ||||
|     msg.good( | ||||
|         "Saved nlp object with vectors to output directory. You can now use the " | ||||
|         "path to it in your config as the 'vectors' setting in [initialize.vocab].", | ||||
|         "path to it in your config as the 'vectors' setting in [initialize].", | ||||
|         output_dir.resolve(), | ||||
|     ) | ||||
| 
 | ||||
|  | @ -100,7 +100,7 @@ def init_labels_cli( | |||
|     extract the labels.""" | ||||
|     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) | ||||
|     if not output_path.exists(): | ||||
|         output_path.mkdir() | ||||
|         output_path.mkdir(parents=True) | ||||
|     overrides = parse_config_overrides(ctx.args) | ||||
|     import_code(code_path) | ||||
|     setup_gpu(use_gpu) | ||||
|  |  | |||
|  | @ -136,15 +136,19 @@ factory = "textcat" | |||
| 
 | ||||
| {% if optimize == "accuracy" %} | ||||
| [components.textcat.model] | ||||
| @architectures = "spacy.TextCatEnsemble.v1" | ||||
| exclusive_classes = false | ||||
| width = 64 | ||||
| conv_depth = 2 | ||||
| embed_size = 2000 | ||||
| window_size = 1 | ||||
| ngram_size = 1 | ||||
| @architectures = "spacy.TextCatEnsemble.v2" | ||||
| nO = null | ||||
| 
 | ||||
| [components.textcat.model.tok2vec] | ||||
| @architectures = "spacy-transformers.TransformerListener.v1" | ||||
| grad_factor = 1.0 | ||||
| 
 | ||||
| [components.textcat.model.linear_model] | ||||
| @architectures = "spacy.TextCatBOW.v1" | ||||
| exclusive_classes = false | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
| 
 | ||||
| {% else -%} | ||||
| [components.textcat.model] | ||||
| @architectures = "spacy.TextCatBOW.v1" | ||||
|  | @ -271,15 +275,19 @@ factory = "textcat" | |||
| 
 | ||||
| {% if optimize == "accuracy" %} | ||||
| [components.textcat.model] | ||||
| @architectures = "spacy.TextCatEnsemble.v1" | ||||
| exclusive_classes = false | ||||
| width = 64 | ||||
| conv_depth = 2 | ||||
| embed_size = 2000 | ||||
| window_size = 1 | ||||
| ngram_size = 1 | ||||
| @architectures = "spacy.TextCatEnsemble.v2" | ||||
| nO = null | ||||
| 
 | ||||
| [components.textcat.model.tok2vec] | ||||
| @architectures = "spacy.Tok2VecListener.v1" | ||||
| width = ${components.tok2vec.model.encode.width} | ||||
| 
 | ||||
| [components.textcat.model.linear_model] | ||||
| @architectures = "spacy.TextCatBOW.v1" | ||||
| exclusive_classes = false | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
| 
 | ||||
| {% else -%} | ||||
| [components.textcat.model] | ||||
| @architectures = "spacy.TextCatBOW.v1" | ||||
|  |  | |||
|  | @ -44,7 +44,7 @@ def train_cli( | |||
|     if not config_path or not config_path.exists(): | ||||
|         msg.fail("Config file not found", config_path, exits=1) | ||||
|     if output_path is not None and not output_path.exists(): | ||||
|         output_path.mkdir() | ||||
|         output_path.mkdir(parents=True) | ||||
|         msg.good(f"Created output directory: {output_path}") | ||||
|     overrides = parse_config_overrides(ctx.args) | ||||
|     import_code(code_path) | ||||
|  |  | |||
|  | @ -398,8 +398,8 @@ class Errors: | |||
|     E163 = ("cumsum was found to be unstable: its last element does not " | ||||
|             "correspond to sum") | ||||
|     E164 = ("x is neither increasing nor decreasing: {x}.") | ||||
|     E165 = ("Only one class present in y_true. ROC AUC score is not defined in " | ||||
|             "that case.") | ||||
|     E165 = ("Only one class present in the gold labels: {label}. " | ||||
|             "ROC AUC score is not defined in that case.") | ||||
|     E166 = ("Can only merge DocBins with the same value for '{param}'.\n" | ||||
|             "Current DocBin: {current}\nOther DocBin: {other}") | ||||
|     E169 = ("Can't find module: {module}") | ||||
|  | @ -456,6 +456,8 @@ class Errors: | |||
|             "issue tracker: http://github.com/explosion/spaCy/issues") | ||||
| 
 | ||||
|     # TODO: fix numbering after merging develop into master | ||||
|     E897 = ("Field '{field}' should be a dot-notation string referring to the " | ||||
|             "relevant section in the config, but found type {type} instead.") | ||||
|     E898 = ("Can't serialize trainable pipe '{name}': the `model` attribute " | ||||
|             "is not set or None. If you've implemented a custom component, make " | ||||
|             "sure to store the component model as `self.model` in your " | ||||
|  | @ -562,7 +564,10 @@ class Errors: | |||
|             "a string value from {expected} but got: '{arg}'") | ||||
|     E948 = ("`Matcher.add` received invalid 'patterns' argument: expected " | ||||
|             "a list, but got: {arg_type}") | ||||
|     E949 = ("Can only create an alignment when the texts are the same.") | ||||
|     E949 = ("Unable to align tokens for the predicted and reference docs. It " | ||||
|             "is only possible to align the docs when both texts are the same " | ||||
|             "except for whitespace and capitalization. The predicted tokens " | ||||
|             "start with: {x}. The reference tokens start with: {y}.") | ||||
|     E952 = ("The section '{name}' is not a valid section in the provided config.") | ||||
|     E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}") | ||||
|     E954 = ("The Tok2Vec listener did not receive any valid input from an upstream " | ||||
|  |  | |||
|  | @ -286,10 +286,10 @@ cdef class DependencyMatcher: | |||
|                 self.recurse(_tree, id_to_position, _node_operator_map, 0, [], matched_trees) | ||||
|                 for matched_tree in matched_trees: | ||||
|                     matched_key_trees.append((key, matched_tree)) | ||||
|             for i, (match_id, nodes) in enumerate(matched_key_trees): | ||||
|                 on_match = self._callbacks.get(match_id) | ||||
|                 if on_match is not None: | ||||
|                     on_match(self, doc, i, matched_key_trees) | ||||
|         for i, (match_id, nodes) in enumerate(matched_key_trees): | ||||
|             on_match = self._callbacks.get(match_id) | ||||
|             if on_match is not None: | ||||
|                 on_match(self, doc, i, matched_key_trees) | ||||
|         return matched_key_trees | ||||
| 
 | ||||
|     def recurse(self, tree, id_to_position, _node_operator_map, int patternLength, visited_nodes, matched_trees): | ||||
|  |  | |||
|  | @ -1,4 +1,6 @@ | |||
| from typing import Optional | ||||
| from typing import Optional, List | ||||
| 
 | ||||
| from thinc.types import Floats2d | ||||
| from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic | ||||
| from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention | ||||
| from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum | ||||
|  | @ -10,12 +12,13 @@ from ...util import registry | |||
| from ..extract_ngrams import extract_ngrams | ||||
| from ..staticvectors import StaticVectors | ||||
| from ..featureextractor import FeatureExtractor | ||||
| from ...tokens import Doc | ||||
| 
 | ||||
| 
 | ||||
| @registry.architectures.register("spacy.TextCatCNN.v1") | ||||
| def build_simple_cnn_text_classifier( | ||||
|     tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None | ||||
| ) -> Model: | ||||
| ) -> Model[List[Doc], Floats2d]: | ||||
|     """ | ||||
|     Build a simple CNN text classifier, given a token-to-vector model as inputs. | ||||
|     If exclusive_classes=True, a softmax non-linearity is applied, so that the | ||||
|  | @ -23,15 +26,14 @@ def build_simple_cnn_text_classifier( | |||
|     is applied instead, so that outputs are in the range [0, 1]. | ||||
|     """ | ||||
|     with Model.define_operators({">>": chain}): | ||||
|         cnn = tok2vec >> list2ragged() >> reduce_mean() | ||||
|         if exclusive_classes: | ||||
|             output_layer = Softmax(nO=nO, nI=tok2vec.maybe_get_dim("nO")) | ||||
|             model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer | ||||
|             model = cnn >> output_layer | ||||
|             model.set_ref("output_layer", output_layer) | ||||
|         else: | ||||
|             linear_layer = Linear(nO=nO, nI=tok2vec.maybe_get_dim("nO")) | ||||
|             model = ( | ||||
|                 tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic() | ||||
|             ) | ||||
|             model = cnn >> linear_layer >> Logistic() | ||||
|             model.set_ref("output_layer", linear_layer) | ||||
|     model.set_ref("tok2vec", tok2vec) | ||||
|     model.set_dim("nO", nO) | ||||
|  | @ -45,8 +47,7 @@ def build_bow_text_classifier( | |||
|     ngram_size: int, | ||||
|     no_output_layer: bool, | ||||
|     nO: Optional[int] = None, | ||||
| ) -> Model: | ||||
|     # Don't document this yet, I'm not sure it's right. | ||||
| ) -> Model[List[Doc], Floats2d]: | ||||
|     with Model.define_operators({">>": chain}): | ||||
|         sparse_linear = SparseLinear(nO) | ||||
|         model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear | ||||
|  | @ -59,6 +60,39 @@ def build_bow_text_classifier( | |||
|     return model | ||||
| 
 | ||||
| 
 | ||||
| @registry.architectures.register("spacy.TextCatEnsemble.v2") | ||||
| def build_text_classifier( | ||||
|     tok2vec: Model[List[Doc], List[Floats2d]], | ||||
|     linear_model: Model[List[Doc], Floats2d], | ||||
|     nO: Optional[int] = None, | ||||
| ) -> Model[List[Doc], Floats2d]: | ||||
|     exclusive_classes = not linear_model.attrs["multi_label"] | ||||
|     with Model.define_operators({">>": chain, "|": concatenate}): | ||||
|         width = tok2vec.get_dim("nO") | ||||
|         cnn_model = ( | ||||
|                 tok2vec | ||||
|                 >> list2ragged() | ||||
|                 >> ParametricAttention(width)   # TODO: benchmark performance difference of this layer | ||||
|                 >> reduce_sum() | ||||
|                 >> residual(Maxout(nO=width, nI=width)) | ||||
|                 >> Linear(nO=nO, nI=width) | ||||
|                 >> Dropout(0.0) | ||||
|         ) | ||||
| 
 | ||||
|         nO_double = nO * 2 if nO else None | ||||
|         if exclusive_classes: | ||||
|             output_layer = Softmax(nO=nO, nI=nO_double) | ||||
|         else: | ||||
|             output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic() | ||||
|         model = (linear_model | cnn_model) >> output_layer | ||||
|         model.set_ref("tok2vec", tok2vec) | ||||
|     if model.has_dim("nO") is not False: | ||||
|         model.set_dim("nO", nO) | ||||
|     model.set_ref("output_layer", linear_model.get_ref("output_layer")) | ||||
|     model.attrs["multi_label"] = not exclusive_classes | ||||
|     return model | ||||
| 
 | ||||
| # TODO: move to legacy | ||||
| @registry.architectures.register("spacy.TextCatEnsemble.v1") | ||||
| def build_text_classifier( | ||||
|     width: int, | ||||
|  | @ -158,11 +192,8 @@ def build_text_classifier( | |||
| 
 | ||||
| @registry.architectures.register("spacy.TextCatLowData.v1") | ||||
| def build_text_classifier_lowdata( | ||||
|     width: int, | ||||
|     pretrained_vectors: Optional[bool], | ||||
|     dropout: Optional[float], | ||||
|     nO: Optional[int] = None, | ||||
| ) -> Model: | ||||
|     width: int, dropout: Optional[float], nO: Optional[int] = None | ||||
| ) -> Model[List[Doc], Floats2d]: | ||||
|     # Don't document this yet, I'm not sure it's right. | ||||
|     # Note, before v.3, this was the default if setting "low_data" and "pretrained_dims" | ||||
|     with Model.define_operators({">>": chain, "**": clone}): | ||||
|  |  | |||
|  | @ -106,7 +106,7 @@ def MultiHashEmbed( | |||
| ) -> Model[List[Doc], List[Floats2d]]: | ||||
|     """Construct an embedding layer that separately embeds a number of lexical | ||||
|     attributes using hash embedding, concatenates the results, and passes it | ||||
|     through a feed-forward subnetwork to build a mixed representations. | ||||
|     through a feed-forward subnetwork to build a mixed representation. | ||||
| 
 | ||||
|     The features used can be configured with the 'attrs' argument. The suggested | ||||
|     attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into | ||||
|  |  | |||
|  | @ -226,6 +226,9 @@ class AttributeRuler(Pipe): | |||
| 
 | ||||
|         DOCS: https://nightly.spacy.io/api/tagger#score | ||||
|         """ | ||||
|         def morph_key_getter(token, attr): | ||||
|             return getattr(token, attr).key | ||||
| 
 | ||||
|         validate_examples(examples, "AttributeRuler.score") | ||||
|         results = {} | ||||
|         attrs = set() | ||||
|  | @ -237,7 +240,8 @@ class AttributeRuler(Pipe): | |||
|             elif attr == POS: | ||||
|                 results.update(Scorer.score_token_attr(examples, "pos", **kwargs)) | ||||
|             elif attr == MORPH: | ||||
|                 results.update(Scorer.score_token_attr(examples, "morph", **kwargs)) | ||||
|                 results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs)) | ||||
|                 results.update(Scorer.score_token_attr_per_feat(examples, "morph", getter=morph_key_getter, **kwargs)) | ||||
|             elif attr == LEMMA: | ||||
|                 results.update(Scorer.score_token_attr(examples, "lemma", **kwargs)) | ||||
|         return results | ||||
|  |  | |||
|  | @ -155,13 +155,16 @@ cdef class DependencyParser(Parser): | |||
| 
 | ||||
|         DOCS: https://nightly.spacy.io/api/dependencyparser#score | ||||
|         """ | ||||
|         def has_sents(doc): | ||||
|             return doc.has_annotation("SENT_START") | ||||
| 
 | ||||
|         validate_examples(examples, "DependencyParser.score") | ||||
|         def dep_getter(token, attr): | ||||
|             dep = getattr(token, attr) | ||||
|             dep = token.vocab.strings.as_string(dep).lower() | ||||
|             return dep | ||||
|         results = {} | ||||
|         results.update(Scorer.score_spans(examples, "sents", **kwargs)) | ||||
|         results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)) | ||||
|         kwargs.setdefault("getter", dep_getter) | ||||
|         kwargs.setdefault("ignore_labels", ("p", "punct")) | ||||
|         results.update(Scorer.score_deps(examples, "dep", **kwargs)) | ||||
|  |  | |||
|  | @ -10,7 +10,7 @@ from ..errors import Errors | |||
| from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList | ||||
| from ..tokens import Doc, Span | ||||
| from ..matcher import Matcher, PhraseMatcher | ||||
| from ..scorer import Scorer | ||||
| from ..scorer import get_ner_prf | ||||
| from ..training import validate_examples | ||||
| 
 | ||||
| 
 | ||||
|  | @ -340,7 +340,7 @@ class EntityRuler(Pipe): | |||
| 
 | ||||
|     def score(self, examples, **kwargs): | ||||
|         validate_examples(examples, "EntityRuler.score") | ||||
|         return Scorer.score_spans(examples, "ents", **kwargs) | ||||
|         return get_ner_prf(examples) | ||||
| 
 | ||||
|     def from_bytes( | ||||
|         self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList() | ||||
|  |  | |||
|  | @ -251,10 +251,13 @@ class Morphologizer(Tagger): | |||
| 
 | ||||
|         DOCS: https://nightly.spacy.io/api/morphologizer#score | ||||
|         """ | ||||
|         def morph_key_getter(token, attr): | ||||
|             return getattr(token, attr).key | ||||
| 
 | ||||
|         validate_examples(examples, "Morphologizer.score") | ||||
|         results = {} | ||||
|         results.update(Scorer.score_token_attr(examples, "pos", **kwargs)) | ||||
|         results.update(Scorer.score_token_attr(examples, "morph", **kwargs)) | ||||
|         results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs)) | ||||
|         results.update(Scorer.score_token_attr_per_feat(examples, | ||||
|             "morph", **kwargs)) | ||||
|             "morph", getter=morph_key_getter, **kwargs)) | ||||
|         return results | ||||
|  |  | |||
|  | @ -122,13 +122,4 @@ cdef class EntityRecognizer(Parser): | |||
|         DOCS: https://nightly.spacy.io/api/entityrecognizer#score | ||||
|         """ | ||||
|         validate_examples(examples, "EntityRecognizer.score") | ||||
|         score_per_type = get_ner_prf(examples) | ||||
|         totals = PRFScore() | ||||
|         for prf in score_per_type.values(): | ||||
|             totals += prf | ||||
|         return { | ||||
|             "ents_p": totals.precision, | ||||
|             "ents_r": totals.recall, | ||||
|             "ents_f": totals.fscore, | ||||
|             "ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()}, | ||||
|         } | ||||
|         return get_ner_prf(examples) | ||||
|  |  | |||
|  | @ -155,8 +155,11 @@ class Sentencizer(Pipe): | |||
| 
 | ||||
|         DOCS: https://nightly.spacy.io/api/sentencizer#score | ||||
|         """ | ||||
|         def has_sents(doc): | ||||
|             return doc.has_annotation("SENT_START") | ||||
| 
 | ||||
|         validate_examples(examples, "Sentencizer.score") | ||||
|         results = Scorer.score_spans(examples, "sents", **kwargs) | ||||
|         results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs) | ||||
|         del results["sents_per_type"] | ||||
|         return results | ||||
| 
 | ||||
|  |  | |||
|  | @ -160,7 +160,10 @@ class SentenceRecognizer(Tagger): | |||
|         RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans. | ||||
|         DOCS: https://nightly.spacy.io/api/sentencerecognizer#score | ||||
|         """ | ||||
|         def has_sents(doc): | ||||
|             return doc.has_annotation("SENT_START") | ||||
| 
 | ||||
|         validate_examples(examples, "SentenceRecognizer.score") | ||||
|         results = Scorer.score_spans(examples, "sents", **kwargs) | ||||
|         results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs) | ||||
|         del results["sents_per_type"] | ||||
|         return results | ||||
|  |  | |||
|  | @ -16,15 +16,30 @@ from ..vocab import Vocab | |||
| 
 | ||||
| default_model_config = """ | ||||
| [model] | ||||
| @architectures = "spacy.TextCatEnsemble.v1" | ||||
| exclusive_classes = false | ||||
| pretrained_vectors = null | ||||
| @architectures = "spacy.TextCatEnsemble.v2" | ||||
| 
 | ||||
| [model.tok2vec] | ||||
| @architectures = "spacy.Tok2Vec.v1" | ||||
| 
 | ||||
| [model.tok2vec.embed] | ||||
| @architectures = "spacy.MultiHashEmbed.v1" | ||||
| width = 64 | ||||
| conv_depth = 2 | ||||
| embed_size = 2000 | ||||
| rows = [2000, 2000, 1000, 1000, 1000, 1000] | ||||
| attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"] | ||||
| include_static_vectors = false | ||||
| 
 | ||||
| [model.tok2vec.encode] | ||||
| @architectures = "spacy.MaxoutWindowEncoder.v1" | ||||
| width = ${model.tok2vec.embed.width} | ||||
| window_size = 1 | ||||
| maxout_pieces = 3 | ||||
| depth = 2 | ||||
| 
 | ||||
| [model.linear_model] | ||||
| @architectures = "spacy.TextCatBOW.v1" | ||||
| exclusive_classes = false | ||||
| ngram_size = 1 | ||||
| dropout = null | ||||
| no_output_layer = false | ||||
| """ | ||||
| DEFAULT_TEXTCAT_MODEL = Config().from_str(default_model_config)["model"] | ||||
| 
 | ||||
|  | @ -60,9 +75,11 @@ subword_features = true | |||
|     default_score_weights={ | ||||
|         "cats_score": 1.0, | ||||
|         "cats_score_desc": None, | ||||
|         "cats_p": None, | ||||
|         "cats_r": None, | ||||
|         "cats_f": None, | ||||
|         "cats_micro_p": None, | ||||
|         "cats_micro_r": None, | ||||
|         "cats_micro_f": None, | ||||
|         "cats_macro_p": None, | ||||
|         "cats_macro_r": None, | ||||
|         "cats_macro_f": None, | ||||
|         "cats_macro_auc": None, | ||||
|         "cats_f_per_type": None, | ||||
|  |  | |||
							
								
								
									
										287
									
								
								spacy/scorer.py
									
									
									
									
									
								
							
							
						
						
									
										287
									
								
								spacy/scorer.py
									
									
									
									
									
								
							|  | @ -1,9 +1,9 @@ | |||
| from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING | ||||
| from typing import Optional, Iterable, Dict, Set, Any, Callable, TYPE_CHECKING | ||||
| import numpy as np | ||||
| from collections import defaultdict | ||||
| 
 | ||||
| from .training import Example | ||||
| from .tokens import Token, Doc, Span | ||||
| from .tokens import Token, Doc, Span, MorphAnalysis | ||||
| from .errors import Errors | ||||
| from .util import get_lang_class, SimpleFrozenList | ||||
| from .morphology import Morphology | ||||
|  | @ -13,7 +13,8 @@ if TYPE_CHECKING: | |||
|     from .language import Language  # noqa: F401 | ||||
| 
 | ||||
| 
 | ||||
| DEFAULT_PIPELINE = ["senter", "tagger", "morphologizer", "parser", "ner", "textcat"] | ||||
| DEFAULT_PIPELINE = ("senter", "tagger", "morphologizer", "parser", "ner", "textcat") | ||||
| MISSING_VALUES = frozenset([None, 0, ""]) | ||||
| 
 | ||||
| 
 | ||||
| class PRFScore: | ||||
|  | @ -24,6 +25,9 @@ class PRFScore: | |||
|         self.fp = 0 | ||||
|         self.fn = 0 | ||||
| 
 | ||||
|     def __len__(self) -> int: | ||||
|         return self.tp + self.fp + self.fn | ||||
| 
 | ||||
|     def __iadd__(self, other): | ||||
|         self.tp += other.tp | ||||
|         self.fp += other.fp | ||||
|  | @ -59,7 +63,9 @@ class PRFScore: | |||
| 
 | ||||
| 
 | ||||
| class ROCAUCScore: | ||||
|     """An AUC ROC score.""" | ||||
|     """An AUC ROC score. This is only defined for binary classification. | ||||
|     Use the method is_binary before calculating the score, otherwise it | ||||
|     may throw an error.""" | ||||
| 
 | ||||
|     def __init__(self) -> None: | ||||
|         self.golds = [] | ||||
|  | @ -71,16 +77,16 @@ class ROCAUCScore: | |||
|         self.cands.append(cand) | ||||
|         self.golds.append(gold) | ||||
| 
 | ||||
|     def is_binary(self): | ||||
|         return len(np.unique(self.golds)) == 2 | ||||
| 
 | ||||
|     @property | ||||
|     def score(self): | ||||
|         if not self.is_binary(): | ||||
|             raise ValueError(Errors.E165.format(label=set(self.golds))) | ||||
|         if len(self.golds) == self.saved_score_at_len: | ||||
|             return self.saved_score | ||||
|         try: | ||||
|             self.saved_score = _roc_auc_score(self.golds, self.cands) | ||||
|         # catch ValueError: Only one class present in y_true. | ||||
|         # ROC AUC score is not defined in that case. | ||||
|         except ValueError: | ||||
|             self.saved_score = -float("inf") | ||||
|         self.saved_score = _roc_auc_score(self.golds, self.cands) | ||||
|         self.saved_score_at_len = len(self.golds) | ||||
|         return self.saved_score | ||||
| 
 | ||||
|  | @ -92,7 +98,7 @@ class Scorer: | |||
|         self, | ||||
|         nlp: Optional["Language"] = None, | ||||
|         default_lang: str = "xx", | ||||
|         default_pipeline=DEFAULT_PIPELINE, | ||||
|         default_pipeline: Iterable[str] = DEFAULT_PIPELINE, | ||||
|         **cfg, | ||||
|     ) -> None: | ||||
|         """Initialize the Scorer. | ||||
|  | @ -124,13 +130,13 @@ class Scorer: | |||
|         return scores | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, float]: | ||||
|     def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, Any]: | ||||
|         """Returns accuracy and PRF scores for tokenization. | ||||
|         * token_acc: # correct tokens / # gold tokens | ||||
|         * token_p/r/f: PRF for token character spans | ||||
| 
 | ||||
|         examples (Iterable[Example]): Examples to score | ||||
|         RETURNS (Dict[str, float]): A dictionary containing the scores | ||||
|         RETURNS (Dict[str, Any]): A dictionary containing the scores | ||||
|             token_acc/p/r/f. | ||||
| 
 | ||||
|         DOCS: https://nightly.spacy.io/api/scorer#score_tokenization | ||||
|  | @ -140,6 +146,8 @@ class Scorer: | |||
|         for example in examples: | ||||
|             gold_doc = example.reference | ||||
|             pred_doc = example.predicted | ||||
|             if gold_doc.has_unknown_spaces: | ||||
|                 continue | ||||
|             align = example.alignment | ||||
|             gold_spans = set() | ||||
|             pred_spans = set() | ||||
|  | @ -156,12 +164,20 @@ class Scorer: | |||
|                 else: | ||||
|                     acc_score.tp += 1 | ||||
|             prf_score.score_set(pred_spans, gold_spans) | ||||
|         return { | ||||
|             "token_acc": acc_score.fscore, | ||||
|             "token_p": prf_score.precision, | ||||
|             "token_r": prf_score.recall, | ||||
|             "token_f": prf_score.fscore, | ||||
|         } | ||||
|         if len(acc_score) > 0: | ||||
|             return { | ||||
|                 "token_acc": acc_score.fscore, | ||||
|                 "token_p": prf_score.precision, | ||||
|                 "token_r": prf_score.recall, | ||||
|                 "token_f": prf_score.fscore, | ||||
|             } | ||||
|         else: | ||||
|             return { | ||||
|                 "token_acc": None, | ||||
|                 "token_p": None, | ||||
|                 "token_r": None, | ||||
|                 "token_f": None | ||||
|             } | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def score_token_attr( | ||||
|  | @ -169,8 +185,9 @@ class Scorer: | |||
|         attr: str, | ||||
|         *, | ||||
|         getter: Callable[[Token, str], Any] = getattr, | ||||
|         missing_values: Set[Any] = MISSING_VALUES, | ||||
|         **cfg, | ||||
|     ) -> Dict[str, float]: | ||||
|     ) -> Dict[str, Any]: | ||||
|         """Returns an accuracy score for a token-level attribute. | ||||
| 
 | ||||
|         examples (Iterable[Example]): Examples to score | ||||
|  | @ -178,7 +195,7 @@ class Scorer: | |||
|         getter (Callable[[Token, str], Any]): Defaults to getattr. If provided, | ||||
|             getter(token, attr) should return the value of the attribute for an | ||||
|             individual token. | ||||
|         RETURNS (Dict[str, float]): A dictionary containing the accuracy score | ||||
|         RETURNS (Dict[str, Any]): A dictionary containing the accuracy score | ||||
|             under the key attr_acc. | ||||
| 
 | ||||
|         DOCS: https://nightly.spacy.io/api/scorer#score_token_attr | ||||
|  | @ -189,17 +206,27 @@ class Scorer: | |||
|             pred_doc = example.predicted | ||||
|             align = example.alignment | ||||
|             gold_tags = set() | ||||
|             missing_indices = set() | ||||
|             for gold_i, token in enumerate(gold_doc): | ||||
|                 gold_tags.add((gold_i, getter(token, attr))) | ||||
|                 value = getter(token, attr) | ||||
|                 if value not in missing_values: | ||||
|                     gold_tags.add((gold_i, getter(token, attr))) | ||||
|                 else: | ||||
|                     missing_indices.add(gold_i) | ||||
|             pred_tags = set() | ||||
|             for token in pred_doc: | ||||
|                 if token.orth_.isspace(): | ||||
|                     continue | ||||
|                 if align.x2y.lengths[token.i] == 1: | ||||
|                     gold_i = align.x2y[token.i].dataXd[0, 0] | ||||
|                     pred_tags.add((gold_i, getter(token, attr))) | ||||
|                     if gold_i not in missing_indices: | ||||
|                         pred_tags.add((gold_i, getter(token, attr))) | ||||
|             tag_score.score_set(pred_tags, gold_tags) | ||||
|         return {f"{attr}_acc": tag_score.fscore} | ||||
|         score_key = f"{attr}_acc" | ||||
|         if len(tag_score) == 0: | ||||
|             return {score_key: None} | ||||
|         else: | ||||
|             return {score_key: tag_score.fscore} | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def score_token_attr_per_feat( | ||||
|  | @ -207,8 +234,9 @@ class Scorer: | |||
|         attr: str, | ||||
|         *, | ||||
|         getter: Callable[[Token, str], Any] = getattr, | ||||
|         missing_values: Set[Any] = MISSING_VALUES, | ||||
|         **cfg, | ||||
|     ): | ||||
|     ) -> Dict[str, Any]: | ||||
|         """Return PRF scores per feat for a token attribute in UFEATS format. | ||||
| 
 | ||||
|         examples (Iterable[Example]): Examples to score | ||||
|  | @ -216,7 +244,7 @@ class Scorer: | |||
|         getter (Callable[[Token, str], Any]): Defaults to getattr. If provided, | ||||
|             getter(token, attr) should return the value of the attribute for an | ||||
|             individual token. | ||||
|         RETURNS (dict): A dictionary containing the per-feat PRF scores unders | ||||
|         RETURNS (dict): A dictionary containing the per-feat PRF scores under | ||||
|             the key attr_per_feat. | ||||
|         """ | ||||
|         per_feat = {} | ||||
|  | @ -225,9 +253,11 @@ class Scorer: | |||
|             gold_doc = example.reference | ||||
|             align = example.alignment | ||||
|             gold_per_feat = {} | ||||
|             missing_indices = set() | ||||
|             for gold_i, token in enumerate(gold_doc): | ||||
|                 morph = str(getter(token, attr)) | ||||
|                 if morph: | ||||
|                 value = getter(token, attr) | ||||
|                 morph = gold_doc.vocab.strings[value] | ||||
|                 if value not in missing_values and morph != Morphology.EMPTY_MORPH: | ||||
|                     for feat in morph.split(Morphology.FEATURE_SEP): | ||||
|                         field, values = feat.split(Morphology.FIELD_SEP) | ||||
|                         if field not in per_feat: | ||||
|  | @ -235,27 +265,35 @@ class Scorer: | |||
|                         if field not in gold_per_feat: | ||||
|                             gold_per_feat[field] = set() | ||||
|                         gold_per_feat[field].add((gold_i, feat)) | ||||
|                 else: | ||||
|                     missing_indices.add(gold_i) | ||||
|             pred_per_feat = {} | ||||
|             for token in pred_doc: | ||||
|                 if token.orth_.isspace(): | ||||
|                     continue | ||||
|                 if align.x2y.lengths[token.i] == 1: | ||||
|                     gold_i = align.x2y[token.i].dataXd[0, 0] | ||||
|                     morph = str(getter(token, attr)) | ||||
|                     if morph: | ||||
|                         for feat in morph.split("|"): | ||||
|                             field, values = feat.split("=") | ||||
|                             if field not in per_feat: | ||||
|                                 per_feat[field] = PRFScore() | ||||
|                             if field not in pred_per_feat: | ||||
|                                 pred_per_feat[field] = set() | ||||
|                             pred_per_feat[field].add((gold_i, feat)) | ||||
|                     if gold_i not in missing_indices: | ||||
|                         value = getter(token, attr) | ||||
|                         morph = gold_doc.vocab.strings[value] | ||||
|                         if value not in missing_values and morph != Morphology.EMPTY_MORPH: | ||||
|                             for feat in morph.split(Morphology.FEATURE_SEP): | ||||
|                                 field, values = feat.split(Morphology.FIELD_SEP) | ||||
|                                 if field not in per_feat: | ||||
|                                     per_feat[field] = PRFScore() | ||||
|                                 if field not in pred_per_feat: | ||||
|                                     pred_per_feat[field] = set() | ||||
|                                 pred_per_feat[field].add((gold_i, feat)) | ||||
|             for field in per_feat: | ||||
|                 per_feat[field].score_set( | ||||
|                     pred_per_feat.get(field, set()), gold_per_feat.get(field, set()) | ||||
|                 ) | ||||
|         result = {k: v.to_dict() for k, v in per_feat.items()} | ||||
|         return {f"{attr}_per_feat": result} | ||||
|         score_key = f"{attr}_per_feat" | ||||
|         if any([len(v) for v in per_feat.values()]): | ||||
|             result = {k: v.to_dict() for k, v in per_feat.items()} | ||||
|             return {score_key: result} | ||||
|         else: | ||||
|             return {score_key: None} | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def score_spans( | ||||
|  | @ -263,6 +301,7 @@ class Scorer: | |||
|         attr: str, | ||||
|         *, | ||||
|         getter: Callable[[Doc, str], Iterable[Span]] = getattr, | ||||
|         has_annotation: Optional[Callable[[Doc], bool]] = None, | ||||
|         **cfg, | ||||
|     ) -> Dict[str, Any]: | ||||
|         """Returns PRF scores for labeled spans. | ||||
|  | @ -282,18 +321,10 @@ class Scorer: | |||
|         for example in examples: | ||||
|             pred_doc = example.predicted | ||||
|             gold_doc = example.reference | ||||
|             # TODO | ||||
|             # This is a temporary hack to work around the problem that the scorer | ||||
|             # fails if you have examples that are not fully annotated for all | ||||
|             # the tasks in your pipeline. For instance, you might have a corpus | ||||
|             # of NER annotations that does not set sentence boundaries, but the | ||||
|             # pipeline includes a parser or senter, and then the score_weights | ||||
|             # are used to evaluate that component. When the scorer attempts | ||||
|             # to read the sentences from the gold document, it fails. | ||||
|             try: | ||||
|                 list(getter(gold_doc, attr)) | ||||
|             except ValueError: | ||||
|                 continue | ||||
|             # Option to handle docs without sents | ||||
|             if has_annotation is not None: | ||||
|                 if not has_annotation(gold_doc): | ||||
|                     continue | ||||
|             # Find all labels in gold and doc | ||||
|             labels = set( | ||||
|                 [k.label_ for k in getter(gold_doc, attr)] | ||||
|  | @ -321,13 +352,21 @@ class Scorer: | |||
|                     v.score_set(pred_per_type[k], gold_per_type[k]) | ||||
|             # Score for all labels | ||||
|             score.score_set(pred_spans, gold_spans) | ||||
|         results = { | ||||
|             f"{attr}_p": score.precision, | ||||
|             f"{attr}_r": score.recall, | ||||
|             f"{attr}_f": score.fscore, | ||||
|             f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()}, | ||||
|         } | ||||
|         return results | ||||
|         if len(score) > 0: | ||||
|             return { | ||||
|                 f"{attr}_p": score.precision, | ||||
|                 f"{attr}_r": score.recall, | ||||
|                 f"{attr}_f": score.fscore, | ||||
|                 f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()}, | ||||
|             } | ||||
|         else: | ||||
|             return { | ||||
|                 f"{attr}_p": None, | ||||
|                 f"{attr}_r": None, | ||||
|                 f"{attr}_f": None, | ||||
|                 f"{attr}_per_type": None, | ||||
|             } | ||||
| 
 | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def score_cats( | ||||
|  | @ -362,9 +401,13 @@ class Scorer: | |||
|             for all: | ||||
|                 attr_score (one of attr_micro_f / attr_macro_f / attr_macro_auc), | ||||
|                 attr_score_desc (text description of the overall score), | ||||
|                 attr_micro_p, | ||||
|                 attr_micro_r, | ||||
|                 attr_micro_f, | ||||
|                 attr_macro_p, | ||||
|                 attr_macro_r, | ||||
|                 attr_macro_f, | ||||
|                 attr_auc, | ||||
|                 attr_macro_auc, | ||||
|                 attr_f_per_type, | ||||
|                 attr_auc_per_type | ||||
| 
 | ||||
|  | @ -384,9 +427,6 @@ class Scorer: | |||
|             pred_cats = getter(example.predicted, attr) | ||||
|             gold_cats = getter(example.reference, attr) | ||||
| 
 | ||||
|             # I think the AUC metric is applicable regardless of whether we're | ||||
|             # doing multi-label classification? Unsure. If not, move this into | ||||
|             # the elif pred_cats and gold_cats block below. | ||||
|             for label in labels: | ||||
|                 pred_score = pred_cats.get(label, 0.0) | ||||
|                 gold_score = gold_cats.get(label, 0.0) | ||||
|  | @ -431,7 +471,9 @@ class Scorer: | |||
|         macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats | ||||
|         macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats | ||||
|         macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_cats | ||||
|         macro_auc = sum(auc.score for auc in auc_per_type.values()) / n_cats | ||||
|         # Limit macro_auc to those labels with gold annotations, | ||||
|         # but still divide by all cats to avoid artificial boosting of datasets with missing labels | ||||
|         macro_auc = sum(auc.score if auc.is_binary() else 0.0 for auc in auc_per_type.values()) / n_cats | ||||
|         results = { | ||||
|             f"{attr}_score": None, | ||||
|             f"{attr}_score_desc": None, | ||||
|  | @ -443,7 +485,7 @@ class Scorer: | |||
|             f"{attr}_macro_f": macro_f, | ||||
|             f"{attr}_macro_auc": macro_auc, | ||||
|             f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()}, | ||||
|             f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()}, | ||||
|             f"{attr}_auc_per_type": {k: v.score if v.is_binary() else None for k, v in auc_per_type.items()}, | ||||
|         } | ||||
|         if len(labels) == 2 and not multi_label and positive_label: | ||||
|             positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"] | ||||
|  | @ -534,6 +576,7 @@ class Scorer: | |||
|         head_attr: str = "head", | ||||
|         head_getter: Callable[[Token, str], Token] = getattr, | ||||
|         ignore_labels: Iterable[str] = SimpleFrozenList(), | ||||
|         missing_values: Set[Any] = MISSING_VALUES, | ||||
|         **cfg, | ||||
|     ) -> Dict[str, Any]: | ||||
|         """Returns the UAS, LAS, and LAS per type scores for dependency | ||||
|  | @ -558,6 +601,7 @@ class Scorer: | |||
|         unlabelled = PRFScore() | ||||
|         labelled = PRFScore() | ||||
|         labelled_per_dep = dict() | ||||
|         missing_indices = set() | ||||
|         for example in examples: | ||||
|             gold_doc = example.reference | ||||
|             pred_doc = example.predicted | ||||
|  | @ -567,13 +611,16 @@ class Scorer: | |||
|             for gold_i, token in enumerate(gold_doc): | ||||
|                 dep = getter(token, attr) | ||||
|                 head = head_getter(token, head_attr) | ||||
|                 if dep not in ignore_labels: | ||||
|                     gold_deps.add((gold_i, head.i, dep)) | ||||
|                     if dep not in labelled_per_dep: | ||||
|                         labelled_per_dep[dep] = PRFScore() | ||||
|                     if dep not in gold_deps_per_dep: | ||||
|                         gold_deps_per_dep[dep] = set() | ||||
|                     gold_deps_per_dep[dep].add((gold_i, head.i, dep)) | ||||
|                 if dep not in missing_values: | ||||
|                     if dep not in ignore_labels: | ||||
|                         gold_deps.add((gold_i, head.i, dep)) | ||||
|                         if dep not in labelled_per_dep: | ||||
|                             labelled_per_dep[dep] = PRFScore() | ||||
|                         if dep not in gold_deps_per_dep: | ||||
|                             gold_deps_per_dep[dep] = set() | ||||
|                         gold_deps_per_dep[dep].add((gold_i, head.i, dep)) | ||||
|                 else: | ||||
|                     missing_indices.add(gold_i) | ||||
|             pred_deps = set() | ||||
|             pred_deps_per_dep = {} | ||||
|             for token in pred_doc: | ||||
|  | @ -583,25 +630,26 @@ class Scorer: | |||
|                     gold_i = None | ||||
|                 else: | ||||
|                     gold_i = align.x2y[token.i].dataXd[0, 0] | ||||
|                 dep = getter(token, attr) | ||||
|                 head = head_getter(token, head_attr) | ||||
|                 if dep not in ignore_labels and token.orth_.strip(): | ||||
|                     if align.x2y.lengths[head.i] == 1: | ||||
|                         gold_head = align.x2y[head.i].dataXd[0, 0] | ||||
|                     else: | ||||
|                         gold_head = None | ||||
|                     # None is indistinct, so we can't just add it to the set | ||||
|                     # Multiple (None, None) deps are possible | ||||
|                     if gold_i is None or gold_head is None: | ||||
|                         unlabelled.fp += 1 | ||||
|                         labelled.fp += 1 | ||||
|                     else: | ||||
|                         pred_deps.add((gold_i, gold_head, dep)) | ||||
|                         if dep not in labelled_per_dep: | ||||
|                             labelled_per_dep[dep] = PRFScore() | ||||
|                         if dep not in pred_deps_per_dep: | ||||
|                             pred_deps_per_dep[dep] = set() | ||||
|                         pred_deps_per_dep[dep].add((gold_i, gold_head, dep)) | ||||
|                 if gold_i not in missing_indices: | ||||
|                     dep = getter(token, attr) | ||||
|                     head = head_getter(token, head_attr) | ||||
|                     if dep not in ignore_labels and token.orth_.strip(): | ||||
|                         if align.x2y.lengths[head.i] == 1: | ||||
|                             gold_head = align.x2y[head.i].dataXd[0, 0] | ||||
|                         else: | ||||
|                             gold_head = None | ||||
|                         # None is indistinct, so we can't just add it to the set | ||||
|                         # Multiple (None, None) deps are possible | ||||
|                         if gold_i is None or gold_head is None: | ||||
|                             unlabelled.fp += 1 | ||||
|                             labelled.fp += 1 | ||||
|                         else: | ||||
|                             pred_deps.add((gold_i, gold_head, dep)) | ||||
|                             if dep not in labelled_per_dep: | ||||
|                                 labelled_per_dep[dep] = PRFScore() | ||||
|                             if dep not in pred_deps_per_dep: | ||||
|                                 pred_deps_per_dep[dep] = set() | ||||
|                             pred_deps_per_dep[dep].add((gold_i, gold_head, dep)) | ||||
|             labelled.score_set(pred_deps, gold_deps) | ||||
|             for dep in labelled_per_dep: | ||||
|                 labelled_per_dep[dep].score_set( | ||||
|  | @ -610,29 +658,34 @@ class Scorer: | |||
|             unlabelled.score_set( | ||||
|                 set(item[:2] for item in pred_deps), set(item[:2] for item in gold_deps) | ||||
|             ) | ||||
|         return { | ||||
|             f"{attr}_uas": unlabelled.fscore, | ||||
|             f"{attr}_las": labelled.fscore, | ||||
|             f"{attr}_las_per_type": { | ||||
|                 k: v.to_dict() for k, v in labelled_per_dep.items() | ||||
|             }, | ||||
|         } | ||||
|         if len(unlabelled) > 0: | ||||
|             return { | ||||
|                 f"{attr}_uas": unlabelled.fscore, | ||||
|                 f"{attr}_las": labelled.fscore, | ||||
|                 f"{attr}_las_per_type": { | ||||
|                     k: v.to_dict() for k, v in labelled_per_dep.items() | ||||
|                 }, | ||||
|             } | ||||
|         else: | ||||
|             return { | ||||
|                 f"{attr}_uas": None, | ||||
|                 f"{attr}_las": None, | ||||
|                 f"{attr}_las_per_type": None, | ||||
|             } | ||||
| 
 | ||||
| 
 | ||||
| def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]: | ||||
|     """Compute per-entity PRFScore objects for a sequence of examples. The | ||||
|     results are returned as a dictionary keyed by the entity type. You can | ||||
|     add the PRFScore objects to get micro-averaged total. | ||||
| def get_ner_prf(examples: Iterable[Example]) -> Dict[str, Any]: | ||||
|     """Compute micro-PRF and per-entity PRF scores for a sequence of examples. | ||||
|     """ | ||||
|     scores = defaultdict(PRFScore) | ||||
|     score_per_type = defaultdict(PRFScore) | ||||
|     for eg in examples: | ||||
|         if not eg.y.has_annotation("ENT_IOB"): | ||||
|             continue | ||||
|         golds = {(e.label_, e.start, e.end) for e in eg.y.ents} | ||||
|         align_x2y = eg.alignment.x2y | ||||
|         for pred_ent in eg.x.ents: | ||||
|             if pred_ent.label_ not in scores: | ||||
|                 scores[pred_ent.label_] = PRFScore() | ||||
|             if pred_ent.label_ not in score_per_type: | ||||
|                 score_per_type[pred_ent.label_] = PRFScore() | ||||
|             indices = align_x2y[pred_ent.start : pred_ent.end].dataXd.ravel() | ||||
|             if len(indices): | ||||
|                 g_span = eg.y[indices[0] : indices[-1] + 1] | ||||
|  | @ -642,13 +695,29 @@ def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]: | |||
|                 if all(token.ent_iob != 0 for token in g_span): | ||||
|                     key = (pred_ent.label_, indices[0], indices[-1] + 1) | ||||
|                     if key in golds: | ||||
|                         scores[pred_ent.label_].tp += 1 | ||||
|                         score_per_type[pred_ent.label_].tp += 1 | ||||
|                         golds.remove(key) | ||||
|                     else: | ||||
|                         scores[pred_ent.label_].fp += 1 | ||||
|                         score_per_type[pred_ent.label_].fp += 1 | ||||
|         for label, start, end in golds: | ||||
|             scores[label].fn += 1 | ||||
|     return scores | ||||
|             score_per_type[label].fn += 1 | ||||
|     totals = PRFScore() | ||||
|     for prf in score_per_type.values(): | ||||
|         totals += prf | ||||
|     if len(totals) > 0: | ||||
|         return { | ||||
|             "ents_p": totals.precision, | ||||
|             "ents_r": totals.recall, | ||||
|             "ents_f": totals.fscore, | ||||
|             "ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()}, | ||||
|         } | ||||
|     else: | ||||
|         return { | ||||
|             "ents_p": None, | ||||
|             "ents_r": None, | ||||
|             "ents_f": None, | ||||
|             "ents_per_type": None, | ||||
|         } | ||||
| 
 | ||||
| 
 | ||||
| ############################################################################# | ||||
|  | @ -726,7 +795,7 @@ def _roc_auc_score(y_true, y_score): | |||
|             <https://www.ncbi.nlm.nih.gov/pubmed/2668680>`_ | ||||
|     """ | ||||
|     if len(np.unique(y_true)) != 2: | ||||
|         raise ValueError(Errors.E165) | ||||
|         raise ValueError(Errors.E165.format(label=np.unique(y_true))) | ||||
|     fpr, tpr, _ = _roc_curve(y_true, y_score) | ||||
|     return _auc(fpr, tpr) | ||||
| 
 | ||||
|  |  | |||
|  | @ -218,11 +218,16 @@ def test_dependency_matcher_callback(en_vocab, doc): | |||
|     pattern = [ | ||||
|         {"RIGHT_ID": "quick", "RIGHT_ATTRS": {"ORTH": "quick"}}, | ||||
|     ] | ||||
|     nomatch_pattern = [ | ||||
|         {"RIGHT_ID": "quick", "RIGHT_ATTRS": {"ORTH": "NOMATCH"}}, | ||||
|     ] | ||||
| 
 | ||||
|     matcher = DependencyMatcher(en_vocab) | ||||
|     mock = Mock() | ||||
|     matcher.add("pattern", [pattern], on_match=mock) | ||||
|     matcher.add("nomatch_pattern", [nomatch_pattern], on_match=mock) | ||||
|     matches = matcher(doc) | ||||
|     assert len(matches) == 1 | ||||
|     mock.assert_called_once_with(matcher, doc, 0, matches) | ||||
| 
 | ||||
|     # check that matches with and without callback are the same (#4590) | ||||
|  |  | |||
|  | @ -160,8 +160,8 @@ def test_attributeruler_score(nlp, pattern_dicts): | |||
|     scores = nlp.evaluate(dev_examples) | ||||
|     # "cat" is the only correct lemma | ||||
|     assert scores["lemma_acc"] == pytest.approx(0.2) | ||||
|     # the empty morphs are correct | ||||
|     assert scores["morph_acc"] == pytest.approx(0.6) | ||||
|     # no morphs are set | ||||
|     assert scores["morph_acc"] == None | ||||
| 
 | ||||
| 
 | ||||
| def test_attributeruler_rule_order(nlp): | ||||
|  |  | |||
|  | @ -2,6 +2,7 @@ import pytest | |||
| from spacy.language import Language | ||||
| from spacy.lang.en import English | ||||
| from spacy.lang.de import German | ||||
| from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL | ||||
| from spacy.tokens import Doc | ||||
| from spacy.util import registry, SimpleFrozenDict, combine_score_weights | ||||
| from thinc.api import Model, Linear, ConfigValidationError | ||||
|  | @ -156,15 +157,10 @@ def test_pipe_class_component_model(): | |||
|     name = "test_class_component_model" | ||||
|     default_config = { | ||||
|         "model": { | ||||
|             "@architectures": "spacy.TextCatEnsemble.v1", | ||||
|             "exclusive_classes": False, | ||||
|             "pretrained_vectors": None, | ||||
|             "width": 64, | ||||
|             "embed_size": 2000, | ||||
|             "window_size": 1, | ||||
|             "conv_depth": 2, | ||||
|             "ngram_size": 1, | ||||
|             "dropout": None, | ||||
|             "@architectures": "spacy.TextCatEnsemble.v2", | ||||
|             "tok2vec": DEFAULT_TOK2VEC_MODEL, | ||||
|             "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, | ||||
|                       "no_output_layer": False}, | ||||
|         }, | ||||
|         "value1": 10, | ||||
|     } | ||||
|  |  | |||
|  | @ -140,7 +140,7 @@ def test_overfitting_IO(): | |||
|     nlp = English() | ||||
|     nlp.config["initialize"]["components"]["textcat"] = {"positive_label": "POSITIVE"} | ||||
|     # Set exclusive labels | ||||
|     config = {"model": {"exclusive_classes": True}} | ||||
|     config = {"model": {"linear_model": {"exclusive_classes": True}}} | ||||
|     textcat = nlp.add_pipe("textcat", config=config) | ||||
|     train_examples = [] | ||||
|     for text, annotations in TRAIN_DATA: | ||||
|  | @ -192,9 +192,8 @@ def test_overfitting_IO(): | |||
|         {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}, | ||||
|         {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}, | ||||
|         {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}, | ||||
|         {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": False, "ngram_size": 1, "pretrained_vectors": False, "width": 64, "conv_depth": 2, "embed_size": 2000, "window_size": 2, "dropout": None}, | ||||
|         {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": True, "ngram_size": 5, "pretrained_vectors": False, "width": 128, "conv_depth": 2, "embed_size": 2000, "window_size": 1, "dropout": None}, | ||||
|         {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": True, "ngram_size": 2, "pretrained_vectors": False, "width": 32, "conv_depth": 3, "embed_size": 500, "window_size": 3, "dropout": None}, | ||||
|         {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}, | ||||
|         {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}, | ||||
|         {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}, | ||||
|         {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}, | ||||
|     ], | ||||
|  |  | |||
|  | @ -4,32 +4,23 @@ from thinc.api import fix_random_seed, Adam, set_dropout_rate | |||
| from numpy.testing import assert_array_equal | ||||
| import numpy | ||||
| from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder | ||||
| from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier | ||||
| from spacy.ml.models import build_bow_text_classifier, build_simple_cnn_text_classifier | ||||
| from spacy.ml.staticvectors import StaticVectors | ||||
| from spacy.lang.en import English | ||||
| from spacy.lang.en.examples import sentences as EN_SENTENCES | ||||
| 
 | ||||
| 
 | ||||
| def get_textcat_kwargs(): | ||||
| def get_textcat_bow_kwargs(): | ||||
|     return { | ||||
|         "width": 64, | ||||
|         "embed_size": 2000, | ||||
|         "pretrained_vectors": None, | ||||
|         "exclusive_classes": False, | ||||
|         "exclusive_classes": True, | ||||
|         "ngram_size": 1, | ||||
|         "window_size": 1, | ||||
|         "conv_depth": 2, | ||||
|         "dropout": None, | ||||
|         "nO": 7, | ||||
|         "no_output_layer": False, | ||||
|         "nO": 34, | ||||
|     } | ||||
| 
 | ||||
| 
 | ||||
| def get_textcat_cnn_kwargs(): | ||||
|     return { | ||||
|         "tok2vec": test_tok2vec(), | ||||
|         "exclusive_classes": False, | ||||
|         "nO": 13, | ||||
|     } | ||||
|     return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13} | ||||
| 
 | ||||
| 
 | ||||
| def get_all_params(model): | ||||
|  | @ -105,7 +96,7 @@ def test_multi_hash_embed(): | |||
|     "seed,model_func,kwargs", | ||||
|     [ | ||||
|         (0, build_Tok2Vec_model, get_tok2vec_kwargs()), | ||||
|         (0, build_text_classifier, get_textcat_kwargs()), | ||||
|         (0, build_bow_text_classifier, get_textcat_bow_kwargs()), | ||||
|         (0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs()), | ||||
|     ], | ||||
| ) | ||||
|  | @ -125,7 +116,7 @@ def test_models_initialize_consistently(seed, model_func, kwargs): | |||
|     "seed,model_func,kwargs,get_X", | ||||
|     [ | ||||
|         (0, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs), | ||||
|         (0, build_text_classifier, get_textcat_kwargs(), get_docs), | ||||
|         (0, build_bow_text_classifier, get_textcat_bow_kwargs(), get_docs), | ||||
|         (0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs), | ||||
|     ], | ||||
| ) | ||||
|  | @ -160,7 +151,7 @@ def test_models_predict_consistently(seed, model_func, kwargs, get_X): | |||
|     "seed,dropout,model_func,kwargs,get_X", | ||||
|     [ | ||||
|         (0, 0.2, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs), | ||||
|         (0, 0.2, build_text_classifier, get_textcat_kwargs(), get_docs), | ||||
|         (0, 0.2, build_bow_text_classifier, get_textcat_bow_kwargs(), get_docs), | ||||
|         (0, 0.2, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs), | ||||
|     ], | ||||
| ) | ||||
|  |  | |||
|  | @ -277,6 +277,62 @@ def test_tag_score(tagged_doc): | |||
|     assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272) | ||||
| 
 | ||||
| 
 | ||||
def test_partial_annotation(en_tokenizer):
    """Score a prediction against unannotated and partially annotated references.

    The predicted doc only carries annotation on its first token. Three
    reference docs are scored against it: one without any annotation, one
    annotated on a different token, and one whose tag/pos annotation overlaps
    with the prediction.
    """
    pred_doc = en_tokenizer("a b c d e")
    pred_doc[0].tag_ = "A"
    pred_doc[0].pos_ = "X"
    pred_doc[0].set_morph("Feat=Val")
    pred_doc[0].dep_ = "dep"

    # unannotated reference
    ref_doc = en_tokenizer("a b c d e")
    ref_doc.has_unknown_spaces = True
    example = Example(pred_doc, ref_doc)
    scorer = Scorer()
    scores = scorer.score([example])
    for key in scores:
        # cats doesn't have an unset state
        if key.startswith("cats"):
            continue
        # identity check: an unset score must be None itself, not merely
        # something that compares equal to it
        assert scores[key] is None

    # partially annotated reference, not overlapping with predicted annotation
    ref_doc = en_tokenizer("a b c d e")
    ref_doc.has_unknown_spaces = True
    ref_doc[1].tag_ = "A"
    ref_doc[1].pos_ = "X"
    ref_doc[1].set_morph("Feat=Val")
    ref_doc[1].dep_ = "dep"
    example = Example(pred_doc, ref_doc)
    scorer = Scorer()
    scores = scorer.score([example])
    assert scores["token_acc"] is None
    assert scores["tag_acc"] == 0.0
    assert scores["pos_acc"] == 0.0
    assert scores["morph_acc"] == 0.0
    assert scores["dep_uas"] == 1.0
    assert scores["dep_las"] == 0.0
    assert scores["sents_f"] is None

    # partially annotated reference, overlapping with predicted annotation
    # (tag/pos on the same token as the prediction, morph/dep on another one)
    ref_doc = en_tokenizer("a b c d e")
    ref_doc.has_unknown_spaces = True
    ref_doc[0].tag_ = "A"
    ref_doc[0].pos_ = "X"
    ref_doc[1].set_morph("Feat=Val")
    ref_doc[1].dep_ = "dep"
    example = Example(pred_doc, ref_doc)
    scorer = Scorer()
    scores = scorer.score([example])
    assert scores["token_acc"] is None
    assert scores["tag_acc"] == 1.0
    assert scores["pos_acc"] == 1.0
    assert scores["morph_acc"] == 0.0
    assert scores["dep_uas"] == 1.0
    assert scores["dep_las"] == 0.0
    assert scores["sents_f"] is None
| 
 | ||||
| 
 | ||||
| def test_roc_auc_score(): | ||||
|     # Binary classification, toy tests from scikit-learn test suite | ||||
|     y_true = [0, 1] | ||||
|  | @ -334,7 +390,8 @@ def test_roc_auc_score(): | |||
|     score = ROCAUCScore() | ||||
|     score.score_set(0.25, 0) | ||||
|     score.score_set(0.75, 0) | ||||
|     assert score.score == -float("inf") | ||||
|     with pytest.raises(ValueError): | ||||
|         s = score.score | ||||
| 
 | ||||
|     y_true = [1, 1] | ||||
|     y_score = [0.25, 0.75] | ||||
|  | @ -344,4 +401,5 @@ def test_roc_auc_score(): | |||
|     score = ROCAUCScore() | ||||
|     score.score_set(0.25, 1) | ||||
|     score.score_set(0.75, 1) | ||||
|     assert score.score == -float("inf") | ||||
|     with pytest.raises(ValueError): | ||||
|         s = score.score | ||||
|  |  | |||
|  | @ -51,7 +51,7 @@ def test_readers(): | |||
|     for example in train_corpus(nlp): | ||||
|         nlp.update([example], sgd=optimizer) | ||||
|     scores = nlp.evaluate(list(dev_corpus(nlp))) | ||||
|     assert scores["cats_score"] | ||||
|     assert scores["cats_score"] == 0.0 | ||||
|     # ensure the pipeline runs | ||||
|     doc = nlp("Quick test") | ||||
|     assert doc.cats | ||||
|  |  | |||
|  | @ -2,6 +2,7 @@ import numpy | |||
| from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, Alignment | ||||
| from spacy.training import biluo_tags_to_spans, iob_to_biluo | ||||
| from spacy.training import Corpus, docs_to_json, Example | ||||
| from spacy.training.align import get_alignments | ||||
| from spacy.training.converters import json_to_docs | ||||
| from spacy.lang.en import English | ||||
| from spacy.tokens import Doc, DocBin | ||||
|  | @ -492,36 +493,35 @@ def test_roundtrip_docs_to_docbin(doc): | |||
|     assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"] | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.skip("Outdated") | ||||
| @pytest.mark.parametrize( | ||||
|     "tokens_a,tokens_b,expected", | ||||
|     [ | ||||
|         (["a", "b", "c"], ["ab", "c"], (3, [-1, -1, 1], [-1, 2], {0: 0, 1: 0}, {})), | ||||
|         (["a", "b", "c"], ["ab", "c"], ([[0], [0], [1]], [[0, 1], [2]])), | ||||
|         ( | ||||
|             ["a", "b", '"', "c"], | ||||
|             ['ab"', "c"], | ||||
|             (4, [-1, -1, -1, 1], [-1, 3], {0: 0, 1: 0, 2: 0}, {}), | ||||
|             ([[0], [0], [0], [1]], [[0, 1, 2], [3]]), | ||||
|         ), | ||||
|         (["a", "bc"], ["ab", "c"], (4, [-1, -1], [-1, -1], {0: 0}, {1: 1})), | ||||
|         (["a", "bc"], ["ab", "c"], ([[0], [0, 1]], [[0, 1], [1]])), | ||||
|         ( | ||||
|             ["ab", "c", "d"], | ||||
|             ["a", "b", "cd"], | ||||
|             (6, [-1, -1, -1], [-1, -1, -1], {1: 2, 2: 2}, {0: 0, 1: 0}), | ||||
|             ([[0, 1], [2], [2]], [[0], [0], [1, 2]]), | ||||
|         ), | ||||
|         ( | ||||
|             ["a", "b", "cd"], | ||||
|             ["a", "b", "c", "d"], | ||||
|             (3, [0, 1, -1], [0, 1, -1, -1], {}, {2: 2, 3: 2}), | ||||
|             ([[0], [1], [2, 3]], [[0], [1], [2], [2]]), | ||||
|         ), | ||||
|         ([" ", "a"], ["a"], (1, [-1, 0], [1], {}, {})), | ||||
|         ([" ", "a"], ["a"], ([[], [0]], [[1]])), | ||||
|     ], | ||||
| ) | ||||
| def test_align(tokens_a, tokens_b, expected):  # noqa | ||||
|     cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_a, tokens_b)  # noqa | ||||
|     assert (cost, list(a2b), list(b2a), a2b_multi, b2a_multi) == expected  # noqa | ||||
|     a2b, b2a = get_alignments(tokens_a, tokens_b) | ||||
|     assert (a2b, b2a) == expected  # noqa | ||||
|     # check symmetry | ||||
|     cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_b, tokens_a)  # noqa | ||||
|     assert (cost, list(b2a), list(a2b), b2a_multi, a2b_multi) == expected  # noqa | ||||
|     a2b, b2a = get_alignments(tokens_b, tokens_a)  # noqa | ||||
|     assert (b2a, a2b) == expected  # noqa | ||||
| 
 | ||||
| 
 | ||||
| def test_goldparse_startswith_space(en_tokenizer): | ||||
|  | @ -539,6 +539,21 @@ def test_goldparse_startswith_space(en_tokenizer): | |||
|     assert example.get_aligned("DEP", as_string=True) == [None, "ROOT"] | ||||
| 
 | ||||
| 
 | ||||
def test_goldparse_endswith_space(en_tokenizer):
    """Align gold annotation for "a" with a predicted doc made from "a\\n"."""
    doc = en_tokenizer("a\n")
    annotations = {
        "words": ["a"],
        "entities": ["U-DATE"],
        "deps": ["ROOT"],
        "heads": [0],
    }
    example = Example.from_dict(doc, annotations)
    # the second predicted token has no gold counterpart
    assert example.get_aligned_ner() == ["U-DATE", "O"]
    aligned_deps = example.get_aligned("DEP", as_string=True)
    assert aligned_deps == ["ROOT", None]
| 
 | ||||
| 
 | ||||
| def test_gold_constructor(): | ||||
|     """Test that the Example constructor works fine""" | ||||
|     nlp = English() | ||||
|  | @ -676,6 +691,87 @@ def test_alignment_different_texts(): | |||
|         Alignment.from_strings(other_tokens, spacy_tokens) | ||||
| 
 | ||||
| 
 | ||||
def test_alignment_spaces(en_vocab):
    """Check Alignment.from_strings on inputs that differ in whitespace tokens."""
    words = ["i listened to", "obama", "'", "s", "podcasts", "."]
    spacy_words = ["i", "listened", "to", "obama", "'s", "podcasts."]

    # Each case: (other tokens, spaCy tokens, expected x2y lengths,
    # expected x2y data, expected y2x lengths, expected y2x data)
    cases = [
        # single leading whitespace
        (
            [" "] + words,
            spacy_words,
            [0, 3, 1, 1, 1, 1, 1],
            [0, 1, 2, 3, 4, 4, 5, 5],
            [1, 1, 1, 1, 2, 2],
            [1, 1, 1, 2, 3, 4, 5, 6],
        ),
        # multiple leading whitespace tokens
        (
            [" ", " "] + words,
            spacy_words,
            [0, 0, 3, 1, 1, 1, 1, 1],
            [0, 1, 2, 3, 4, 4, 5, 5],
            [1, 1, 1, 1, 2, 2],
            [2, 2, 2, 3, 4, 5, 6, 7],
        ),
        # both with leading whitespace, not identical
        (
            [" ", " "] + words,
            [" "] + spacy_words,
            [1, 0, 3, 1, 1, 1, 1, 1],
            [0, 1, 2, 3, 4, 5, 5, 6, 6],
            [1, 1, 1, 1, 1, 2, 2],
            [0, 2, 2, 2, 3, 4, 5, 6, 7],
        ),
        # same leading whitespace, different tokenization
        (
            [" ", " "] + words,
            ["  "] + spacy_words,
            [1, 1, 3, 1, 1, 1, 1, 1],
            [0, 0, 1, 2, 3, 4, 5, 5, 6, 6],
            [2, 1, 1, 1, 1, 2, 2],
            [0, 1, 2, 2, 2, 3, 4, 5, 6, 7],
        ),
        # only one with trailing whitespace
        (
            words + [" "],
            spacy_words,
            [3, 1, 1, 1, 1, 1, 0],
            [0, 1, 2, 3, 4, 4, 5, 5],
            [1, 1, 1, 1, 2, 2],
            [0, 0, 0, 1, 2, 3, 4, 5],
        ),
        # different trailing whitespace
        (
            words + [" ", " "],
            spacy_words + [" "],
            [3, 1, 1, 1, 1, 1, 1, 0],
            [0, 1, 2, 3, 4, 4, 5, 5, 6],
            [1, 1, 1, 1, 2, 2, 1],
            [0, 0, 0, 1, 2, 3, 4, 5, 6],
        ),
        # same trailing whitespace, different tokenization
        (
            words + [" ", " "],
            spacy_words + ["  "],
            [3, 1, 1, 1, 1, 1, 1, 1],
            [0, 1, 2, 3, 4, 4, 5, 5, 6, 6],
            [1, 1, 1, 1, 2, 2, 2],
            [0, 0, 0, 1, 2, 3, 4, 5, 6, 7],
        ),
    ]
    for other_tokens, spacy_tokens, x2y_lens, x2y_data, y2x_lens, y2x_data in cases:
        align = Alignment.from_strings(other_tokens, spacy_tokens)
        assert list(align.x2y.lengths) == x2y_lens
        assert list(align.x2y.dataXd) == x2y_data
        assert list(align.y2x.lengths) == y2x_lens
        assert list(align.y2x.dataXd) == y2x_data

    # differing whitespace is allowed (only the aligned indices are checked)
    align = Alignment.from_strings(["a", " \n ", "b", "c"], ["a", "b", " ", "c"])
    assert list(align.x2y.dataXd) == [0, 1, 3]
    assert list(align.y2x.dataXd) == [0, 2, 3]

    # other differences in whitespace are allowed: these only need to run
    # without raising
    align = Alignment.from_strings([" ", "a"], ["  ", "a", " "])
    align = Alignment.from_strings(["a", " "], ["a", "  "])
| 
 | ||||
| 
 | ||||
| def test_retokenized_docs(doc): | ||||
|     a = doc.to_array(["TAG"]) | ||||
|     doc1 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a) | ||||
|  |  | |||
|  | @ -399,14 +399,13 @@ cdef class Doc: | |||
|             return True | ||||
|         cdef int i | ||||
|         cdef int range_start = 0 | ||||
|         if attr == "IS_SENT_START" or attr == self.vocab.strings["IS_SENT_START"]: | ||||
|             attr = SENT_START | ||||
|         attr = intify_attr(attr) | ||||
|         # adjust attributes | ||||
|         if attr == HEAD: | ||||
|             # HEAD does not have an unset state, so rely on DEP | ||||
|             attr = DEP | ||||
|         elif attr == self.vocab.strings["IS_SENT_START"]: | ||||
|             # as in Matcher, allow IS_SENT_START as an alias of SENT_START | ||||
|             attr = SENT_START | ||||
|         # special cases for sentence boundaries | ||||
|         if attr == SENT_START: | ||||
|             if "sents" in self.user_hooks: | ||||
|  |  | |||
|  | @ -1,6 +1,6 @@ | |||
| from .corpus import Corpus  # noqa: F401 | ||||
| from .example import Example, validate_examples, validate_get_examples  # noqa: F401 | ||||
| from .align import Alignment  # noqa: F401 | ||||
| from .alignment import Alignment  # noqa: F401 | ||||
| from .augment import dont_augment, orth_variants_augmenter  # noqa: F401 | ||||
| from .iob_utils import iob_to_biluo, biluo_to_iob  # noqa: F401 | ||||
| from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets  # noqa: F401 | ||||
|  |  | |||
							
								
								
									
										66
									
								
								spacy/training/align.pyx
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										66
									
								
								spacy/training/align.pyx
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,66 @@ | |||
| from typing import List, Tuple | ||||
| from itertools import chain | ||||
| import re | ||||
| 
 | ||||
| from ..errors import Errors | ||||
| 
 | ||||
| 
 | ||||
def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[List[int]]]:
    """Align two tokenizations of the same text on the character level.

    The two texts may differ only in whitespace and capitalization.

    A (List[str]): The first list of token texts.
    B (List[str]): The second list of token texts.
    RETURNS (Tuple[List[List[int]], List[List[int]]]): Two lists with one entry
        per token: for each token in A the sorted indices of the tokens in B
        it shares characters with, and the same for B against A. Tokens
        without a counterpart (unaligned whitespace, empty strings) get an
        empty list.
    RAISES (ValueError): If the texts differ in more than whitespace and
        capitalization.
    """
    # Lowercasing can change the length of a string (e.g. "İ" lowercases to
    # two code points), so all character offsets are measured on the
    # lowercased tokens to stay in sync with str_a / str_b below.
    lens_a = [len(x.lower()) for x in A]
    lens_b = [len(x.lower()) for x in B]
    # Create character-to-token mappings
    char_to_token_a = tuple(chain.from_iterable((i,) * n for i, n in enumerate(lens_a)))
    char_to_token_b = tuple(chain.from_iterable((i,) * n for i, n in enumerate(lens_b)))
    str_a = "".join(A).lower()
    str_b = "".join(B).lower()
    cdef int len_str_a = len(str_a)
    cdef int len_str_b = len(str_b)
    # Check that the two texts only differ in whitespace and capitalization
    if re.sub(r"\s+", "", str_a) != re.sub(r"\s+", "", str_b) or \
            len_str_a != len(char_to_token_a) or \
            len_str_b != len(char_to_token_b):
        raise ValueError(Errors.E949.format(x=str(A[:10]), y=str(B[:10])))
    cdef int char_idx_a = 0
    cdef int char_idx_b = 0
    cdef int token_idx_a = 0
    cdef int token_idx_b = 0
    # One set per token, addressed by token index. This guarantees exactly one
    # entry per input token, including empty tokens and trailing whitespace
    # that the loop below never reaches.
    a2b = [set() for _ in A]
    b2a = [set() for _ in B]
    while char_idx_a < len_str_a and char_idx_b < len_str_b:
        # Find the current token position from the character position
        token_idx_a = char_to_token_a[char_idx_a]
        token_idx_b = char_to_token_b[char_idx_b]
        # Process the alignment at the current position
        if A[token_idx_a] == B[token_idx_b] and \
                (char_idx_a == 0 or char_to_token_a[char_idx_a - 1] < token_idx_a) and \
                (char_idx_b == 0 or char_to_token_b[char_idx_b - 1] < token_idx_b):
            # Current tokens are identical and both offsets are at the start
            # of their token, so the whole token can be consumed at once.
            # Without the start-of-token check, jumping ahead from the middle
            # of a token would skip over characters of the following tokens.
            a2b[token_idx_a].add(token_idx_b)
            b2a[token_idx_b].add(token_idx_a)
            char_idx_a += lens_a[token_idx_a]
            char_idx_b += lens_b[token_idx_b]
        elif str_a[char_idx_a] == str_b[char_idx_b]:
            # Current chars are identical
            a2b[token_idx_a].add(token_idx_b)
            b2a[token_idx_b].add(token_idx_a)
            char_idx_a += 1
            char_idx_b += 1
        elif str_a[char_idx_a].isspace():
            # Skip unaligned whitespace char in A
            char_idx_a += 1
        elif str_b[char_idx_b].isspace():
            # Skip unaligned whitespace char in B
            char_idx_b += 1
        else:
            # This should never happen
            raise ValueError(Errors.E949.format(x=str(A[:10]), y=str(B[:10])))
    # Return values as sorted lists per token position
    return [sorted(x) for x in a2b], [sorted(x) for x in b2a]
|  | @ -2,9 +2,8 @@ from typing import List | |||
| import numpy | ||||
| from thinc.types import Ragged | ||||
| from dataclasses import dataclass | ||||
| import tokenizations | ||||
| 
 | ||||
| from ..errors import Errors | ||||
| from .align import get_alignments | ||||
| 
 | ||||
| 
 | ||||
| @dataclass | ||||
|  | @ -20,9 +19,7 @@ class Alignment: | |||
| 
 | ||||
|     @classmethod | ||||
|     def from_strings(cls, A: List[str], B: List[str]) -> "Alignment": | ||||
|         if "".join(A).replace(" ", "").lower() != "".join(B).replace(" ", "").lower(): | ||||
|             raise ValueError(Errors.E949) | ||||
|         x2y, y2x = tokenizations.get_alignments(A, B) | ||||
|         x2y, y2x = get_alignments(A, B) | ||||
|         return Alignment.from_indices(x2y=x2y, y2x=y2x) | ||||
| 
 | ||||
| 
 | ||||
|  | @ -7,7 +7,7 @@ from ..tokens.doc cimport Doc | |||
| from ..tokens.span cimport Span | ||||
| from ..tokens.span import Span | ||||
| from ..attrs import IDS | ||||
| from .align import Alignment | ||||
| from .alignment import Alignment | ||||
| from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags | ||||
| from .iob_utils import biluo_tags_to_spans | ||||
| from ..errors import Errors, Warnings | ||||
|  |  | |||
|  | @ -36,6 +36,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": | |||
|     # Resolve all training-relevant sections using the filled nlp config | ||||
|     T = registry.resolve(config["training"], schema=ConfigSchemaTraining) | ||||
|     dot_names = [T["train_corpus"], T["dev_corpus"]] | ||||
|     if not isinstance(T["train_corpus"], str): | ||||
|         raise ConfigValidationError(desc=Errors.E897.format(field="training.train_corpus", type=type(T["train_corpus"]))) | ||||
|     if not isinstance(T["dev_corpus"], str): | ||||
|         raise ConfigValidationError(desc=Errors.E897.format(field="training.dev_corpus", type=type(T["dev_corpus"]))) | ||||
|     train_corpus, dev_corpus = resolve_dot_names(config, dot_names) | ||||
|     optimizer = T["optimizer"] | ||||
|     # Components that shouldn't be updated during training | ||||
|  |  | |||
|  | @ -17,7 +17,7 @@ from ..ml.models.multi_task import build_cloze_multi_task_model | |||
| from ..ml.models.multi_task import build_cloze_characters_multi_task_model | ||||
| from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain | ||||
| from ..errors import Errors | ||||
| from ..util import registry, load_model_from_config, dot_to_object | ||||
| from ..util import registry, load_model_from_config, resolve_dot_names | ||||
| 
 | ||||
| 
 | ||||
| def pretrain( | ||||
|  | @ -38,7 +38,7 @@ def pretrain( | |||
|     _config = nlp.config.interpolate() | ||||
|     T = registry.resolve(_config["training"], schema=ConfigSchemaTraining) | ||||
|     P = registry.resolve(_config["pretraining"], schema=ConfigSchemaPretrain) | ||||
|     corpus = dot_to_object(T, P["corpus"]) | ||||
|     corpus = resolve_dot_names(_config, [P["corpus"]])[0] | ||||
|     batcher = P["batcher"] | ||||
|     model = create_pretraining_model(nlp, P) | ||||
|     optimizer = P["optimizer"] | ||||
|  |  | |||
|  | @ -143,7 +143,7 @@ argument that connects to the shared `tok2vec` component in the pipeline. | |||
| 
 | ||||
| Construct an embedding layer that separately embeds a number of lexical | ||||
| attributes using hash embedding, concatenates the results, and passes it through | ||||
| a feed-forward subnetwork to build a mixed representations. The features used | ||||
| a feed-forward subnetwork to build a mixed representation. The features used | ||||
| can be configured with the `attrs` argument. The suggested attributes are | ||||
| `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account | ||||
| some subword information, without constructing a fully character-based | ||||
|  | @ -516,26 +516,54 @@ several different built-in architectures. It is recommended to experiment with | |||
| different architectures and settings to determine what works best on your | ||||
| specific data and challenge. | ||||
| 
 | ||||
| ### spacy.TextCatEnsemble.v1 {#TextCatEnsemble} | ||||
| ### spacy.TextCatEnsemble.v2 {#TextCatEnsemble} | ||||
| 
 | ||||
| > #### Example Config | ||||
| > | ||||
| > ```ini | ||||
| > [model] | ||||
| > @architectures = "spacy.TextCatEnsemble.v1" | ||||
| > exclusive_classes = false | ||||
| > pretrained_vectors = null | ||||
| > width = 64 | ||||
| > embed_size = 2000 | ||||
| > conv_depth = 2 | ||||
| > window_size = 1 | ||||
| > ngram_size = 1 | ||||
| > dropout = null | ||||
| > @architectures = "spacy.TextCatEnsemble.v2" | ||||
| > nO = null | ||||
| > | ||||
| > [model.linear_model] | ||||
| > @architectures = "spacy.TextCatBOW.v1" | ||||
| > exclusive_classes = true | ||||
| > ngram_size = 1 | ||||
| > no_output_layer = false | ||||
| > | ||||
| > [model.tok2vec] | ||||
| > @architectures = "spacy.Tok2Vec.v1" | ||||
| > | ||||
| > [model.tok2vec.embed] | ||||
| > @architectures = "spacy.MultiHashEmbed.v1" | ||||
| > width = 64 | ||||
| > rows = [2000, 2000, 1000, 1000, 1000, 1000] | ||||
| > attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"] | ||||
| > include_static_vectors = false | ||||
| > | ||||
| > [model.tok2vec.encode] | ||||
| > @architectures = "spacy.MaxoutWindowEncoder.v1" | ||||
| > width = ${model.tok2vec.embed.width} | ||||
| > window_size = 1 | ||||
| > maxout_pieces = 3 | ||||
| > depth = 2 | ||||
| > ``` | ||||
| 
 | ||||
| Stacked ensemble of a bag-of-words model and a neural network model. The neural | ||||
| network has an internal CNN Tok2Vec layer and uses attention. | ||||
| Stacked ensemble of a linear bag-of-words model and a neural network model. The | ||||
| neural network is built upon a Tok2Vec layer and uses attention. The setting for | ||||
| whether or not this model should cater for multi-label classification is taken | ||||
| from the linear model, where it is stored in `model.attrs["multi_label"]`. | ||||
| 
 | ||||
| | Name           | Description                                                                                                                                                                                    | | ||||
| | -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `linear_model` | The linear bag-of-words model. ~~Model[List[Doc], Floats2d]~~                                                                                                                                  | | ||||
| | `tok2vec`      | The `tok2vec` layer to build the neural network upon. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                     | | ||||
| | `nO`           | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | ||||
| | **CREATES**    | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               | | ||||
| 
 | ||||
| <Accordion title="spacy.TextCatEnsemble.v1 definition" spaced> | ||||
| 
 | ||||
| Version 1 was functionally similar, but used an internal `tok2vec` instead of taking it as an argument. | ||||
| 
 | ||||
| | Name                 | Description                                                                                                                                                                                    | | ||||
| | -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
|  | @ -550,6 +578,8 @@ network has an internal CNN Tok2Vec layer and uses attention. | |||
| | `nO`                 | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | ||||
| | **CREATES**          | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               | | ||||
| 
 | ||||
| </Accordion> | ||||
| 
 | ||||
| ### spacy.TextCatCNN.v1 {#TextCatCNN} | ||||
| 
 | ||||
| > #### Example Config | ||||
|  |  | |||
|  | @ -683,6 +683,7 @@ The L2 norm of the document's vector representation. | |||
| | `user_hooks`                         | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~                                                   | | ||||
| | `user_token_hooks`                   | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~                                           | | ||||
| | `user_span_hooks`                    | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~                                            | | ||||
| | `has_unknown_spaces`                 | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~         | | ||||
| | `_`                                  | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~               | | ||||
| 
 | ||||
| ## Serialization fields {#serialization-fields} | ||||
|  |  | |||
|  | @ -68,6 +68,8 @@ Scores the tokenization: | |||
| - `token_p`, `token_r`, `token_f`: precision, recall and F-score for token | ||||
|   character spans | ||||
| 
 | ||||
| Docs with `has_unknown_spaces` are skipped during scoring. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
|  | @ -81,7 +83,8 @@ Scores the tokenization: | |||
| 
 | ||||
| ## Scorer.score_token_attr {#score_token_attr tag="staticmethod" new="3"} | ||||
| 
 | ||||
| Scores a single token attribute. | ||||
| Scores a single token attribute. Tokens with missing values in the reference doc | ||||
| are skipped during scoring. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
|  | @ -90,20 +93,22 @@ Scores a single token attribute. | |||
| > print(scores["pos_acc"]) | ||||
| > ``` | ||||
| 
 | ||||
| | Name           | Description                                                                                                                                                   | | ||||
| | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `examples`     | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~                                           | | ||||
| | `attr`         | The attribute to score. ~~str~~                                                                                                                               | | ||||
| | _keyword-only_ |                                                                                                                                                               | | ||||
| | `getter`       | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | | ||||
| | **RETURNS**    | A dictionary containing the score `{attr}_acc`. ~~Dict[str, float]~~                                                                                          | | ||||
| | Name             | Description                                                                                                                                                   | | ||||
| | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~                                           | | ||||
| | `attr`           | The attribute to score. ~~str~~                                                                                                                               | | ||||
| | _keyword-only_   |                                                                                                                                                               | | ||||
| | `getter`         | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | | ||||
| | `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~                                        | | ||||
| | **RETURNS**      | A dictionary containing the score `{attr}_acc`. ~~Dict[str, float]~~                                                                                          | | ||||
| 
 | ||||
| ## Scorer.score_token_attr_per_feat {#score_token_attr_per_feat tag="staticmethod" new="3"} | ||||
| 
 | ||||
| Scores a single token attribute per feature for a token attribute in the | ||||
| Universal Dependencies | ||||
| [FEATS](https://universaldependencies.org/format.html#morphological-annotation) | ||||
| format. | ||||
| format. Tokens with missing values in the reference doc are skipped during | ||||
| scoring. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
|  | @ -112,13 +117,14 @@ format. | |||
| > print(scores["morph_per_feat"]) | ||||
| > ``` | ||||
| 
 | ||||
| | Name           | Description                                                                                                                                                   | | ||||
| | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `examples`     | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~                                           | | ||||
| | `attr`         | The attribute to score. ~~str~~                                                                                                                               | | ||||
| | _keyword-only_ |                                                                                                                                                               | | ||||
| | `getter`       | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | | ||||
| | **RETURNS**    | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. ~~Dict[str, Dict[str, float]]~~                                           | | ||||
| | Name             | Description                                                                                                                                                   | | ||||
| | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~                                           | | ||||
| | `attr`           | The attribute to score. ~~str~~                                                                                                                               | | ||||
| | _keyword-only_   |                                                                                                                                                               | | ||||
| | `getter`         | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | | ||||
| | `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~                                        | | ||||
| | **RETURNS**      | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. ~~Dict[str, Dict[str, float]]~~                                           | | ||||
| 
 | ||||
| ## Scorer.score_spans {#score_spans tag="staticmethod" new="3"} | ||||
| 
 | ||||
|  | @ -131,17 +137,19 @@ Returns PRF scores for labeled or unlabeled spans. | |||
| > print(scores["ents_f"]) | ||||
| > ``` | ||||
| 
 | ||||
| | Name           | Description                                                                                                                                                                                 | | ||||
| | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `examples`     | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~                                                                         | | ||||
| | `attr`         | The attribute to score. ~~str~~                                                                                                                                                             | | ||||
| | _keyword-only_ |                                                                                                                                                                                             | | ||||
| | `getter`       | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. ~~Callable[[Doc, str], Iterable[Span]]~~                                  | | ||||
| | **RETURNS**    | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ | | ||||
| | Name             | Description                                                                                                                                                                                 | | ||||
| | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~                                                                         | | ||||
| | `attr`           | The attribute to score. ~~str~~                                                                                                                                                             | | ||||
| | _keyword-only_   |                                                                                                                                                                                             | | ||||
| | `getter`         | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. ~~Callable[[Doc, str], Iterable[Span]]~~                                  | | ||||
| | `has_annotation` | Defaults to `None`. If provided, `has_annotation(doc)` should return whether a `Doc` has annotation for this `attr`. Docs without annotation are skipped for scoring purposes. ~~Optional[Callable[[Doc], bool]]~~ | | ||||
| | **RETURNS**      | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ | | ||||
| 
 | ||||
| ## Scorer.score_deps {#score_deps tag="staticmethod" new="3"} | ||||
| 
 | ||||
| Calculate the UAS, LAS, and LAS per type scores for dependency parses. | ||||
| Calculate the UAS, LAS, and LAS per type scores for dependency parses. Tokens | ||||
| with missing values for the `attr` (typically `dep`) are skipped during scoring. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
|  | @ -160,29 +168,40 @@ Calculate the UAS, LAS, and LAS per type scores for dependency parses. | |||
| > print(scores["dep_uas"], scores["dep_las"]) | ||||
| > ``` | ||||
| 
 | ||||
| | Name            | Description                                                                                                                                                   | | ||||
| | --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `examples`      | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~                                           | | ||||
| | `attr`          | The attribute to score. ~~str~~                                                                                                                               | | ||||
| | _keyword-only_  |                                                                                                                                                               | | ||||
| | `getter`        | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | | ||||
| | `head_attr`     | The attribute containing the head token. ~~str~~                                                                                                              | | ||||
| | `head_getter`   | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. ~~Callable[[Doc, str], Token]~~              | | ||||
| | `ignore_labels` | Labels to ignore while scoring (e.g. `"punct"`). ~~Iterable[str]~~                                                                                            | | ||||
| | **RETURNS**     | A dictionary containing the scores: `{attr}_uas`, `{attr}_las`, and `{attr}_las_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~                      | | ||||
| | Name             | Description                                                                                                                                                   | | ||||
| | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~                                           | | ||||
| | `attr`           | The attribute to score. ~~str~~                                                                                                                               | | ||||
| | _keyword-only_   |                                                                                                                                                               | | ||||
| | `getter`         | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | | ||||
| | `head_attr`      | The attribute containing the head token. ~~str~~                                                                                                              | | ||||
| | `head_getter`    | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. ~~Callable[[Token, str], Token]~~            | | ||||
| | `ignore_labels`  | Labels to ignore while scoring (e.g. `"punct"`). ~~Iterable[str]~~                                                                                            | | ||||
| | `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~                                        | | ||||
| | **RETURNS**      | A dictionary containing the scores: `{attr}_uas`, `{attr}_las`, and `{attr}_las_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~                      | | ||||
| 
 | ||||
| ## Scorer.score_cats {#score_cats tag="staticmethod" new="3"} | ||||
| 
 | ||||
| Calculate PRF and ROC AUC scores for a doc-level attribute that is a dict | ||||
| containing scores for each label like `Doc.cats`. The reported overall score | ||||
| depends on the scorer settings: | ||||
| containing scores for each label like `Doc.cats`. The returned dictionary | ||||
| contains the following scores: | ||||
| 
 | ||||
| 1. **all:** `{attr}_score` (one of `{attr}_f` / `{attr}_macro_f` / | ||||
|    `{attr}_macro_auc`), `{attr}_score_desc` (text description of the overall | ||||
|    score), `{attr}_f_per_type`, `{attr}_auc_per_type` | ||||
| 2. **binary exclusive with positive label:** `{attr}_p`, `{attr}_r`, `{attr}_f` | ||||
| 3. **3+ exclusive classes**, macro-averaged F-score: `{attr}_macro_f`; | ||||
| 4. **multilabel**, macro-averaged AUC: `{attr}_macro_auc` | ||||
| - `{attr}_micro_p`, `{attr}_micro_r` and `{attr}_micro_f`: each instance across | ||||
|   each label is weighted equally | ||||
| - `{attr}_macro_p`, `{attr}_macro_r` and `{attr}_macro_f`: the average values | ||||
|   across evaluations per label | ||||
| - `{attr}_f_per_type` and `{attr}_auc_per_type`: each contains a dictionary of | ||||
|   scores, keyed by label | ||||
| - A final `{attr}_score` and corresponding `{attr}_score_desc` (text | ||||
|   description) | ||||
| 
 | ||||
| The reported `{attr}_score` depends on the classification properties: | ||||
| 
 | ||||
| - **binary exclusive with positive label:** `{attr}_score` is set to the F-score | ||||
|   of the positive label | ||||
| - **3+ exclusive classes**, macro-averaged F-score: | ||||
|   `{attr}_score = {attr}_macro_f` | ||||
| - **multilabel**, macro-averaged AUC: `{attr}_score = {attr}_macro_auc` | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
|  |  | |||
|  | @ -115,7 +115,7 @@ print(french_fries, "<->", burgers, french_fries.similarity(burgers)) | |||
| 
 | ||||
| Computing similarity scores can be helpful in many situations, but it's also | ||||
| important to maintain **realistic expectations** about what information it can | ||||
| provide. Words can be related to each over in many ways, so a single | ||||
| provide. Words can be related to each other in many ways, so a single | ||||
| "similarity" score will always be a **mix of different signals**, and vectors | ||||
| trained on different data can produce very different results that may not be | ||||
| useful for your purpose. Here are some important considerations to keep in mind: | ||||
|  |  | |||
|  | @ -130,16 +130,31 @@ factory = "textcat" | |||
| labels = [] | ||||
| 
 | ||||
| [components.textcat.model] | ||||
| @architectures = "spacy.TextCatEnsemble.v1" | ||||
| exclusive_classes = false | ||||
| pretrained_vectors = null | ||||
| width = 64 | ||||
| conv_depth = 2 | ||||
| embed_size = 2000 | ||||
| window_size = 1 | ||||
| ngram_size = 1 | ||||
| dropout = 0 | ||||
| @architectures = "spacy.TextCatEnsemble.v2" | ||||
| nO = null | ||||
| 
 | ||||
| [components.textcat.model.tok2vec] | ||||
| @architectures = "spacy.Tok2Vec.v1" | ||||
| 
 | ||||
| [components.textcat.model.tok2vec.embed] | ||||
| @architectures = "spacy.MultiHashEmbed.v1" | ||||
| width = 64 | ||||
| rows = [2000, 2000, 1000, 1000, 1000, 1000] | ||||
| attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"] | ||||
| include_static_vectors = false | ||||
| 
 | ||||
| [components.textcat.model.tok2vec.encode] | ||||
| @architectures = "spacy.MaxoutWindowEncoder.v1" | ||||
| width = ${components.textcat.model.tok2vec.embed.width} | ||||
| window_size = 1 | ||||
| maxout_pieces = 3 | ||||
| depth = 2 | ||||
| 
 | ||||
| [components.textcat.model.linear_model] | ||||
| @architectures = "spacy.TextCatBOW.v1" | ||||
| exclusive_classes = false | ||||
| ngram_size = 1 | ||||
| no_output_layer = false | ||||
| ``` | ||||
| 
 | ||||
| spaCy has two additional built-in `textcat` architectures, and you can easily | ||||
|  | @ -687,7 +702,7 @@ Before the model can be used, it needs to be | |||
| [initialized](/usage/training#initialization). This function receives a callback | ||||
| to access the full **training data set**, or a representative sample. This data | ||||
| set can be used to deduce all **relevant labels**. Alternatively, a list of | ||||
| labels can be provided to `initialize`, or you can call  | ||||
| labels can be provided to `initialize`, or you can call | ||||
| `RelationExtractor.add_label` directly. The number of labels defines the output | ||||
| dimensionality of the network, and will be used to do | ||||
| [shape inference](https://thinc.ai/docs/usage-models#validation) throughout the | ||||
|  |  | |||
|  | @ -1244,15 +1244,10 @@ labels = [] | |||
| # This function is created and then passed to the "textcat" component as | ||||
| # the argument "model" | ||||
| [components.textcat.model] | ||||
| @architectures = "spacy.TextCatEnsemble.v1" | ||||
| @architectures = "spacy.TextCatBOW.v1" | ||||
| exclusive_classes = false | ||||
| pretrained_vectors = null | ||||
| width = 64 | ||||
| conv_depth = 2 | ||||
| embed_size = 2000 | ||||
| window_size = 1 | ||||
| ngram_size = 1 | ||||
| dropout = null | ||||
| no_output_layer = false | ||||
| 
 | ||||
| [components.other_textcat] | ||||
| factory = "textcat" | ||||
|  |  | |||
|  | @ -1142,7 +1142,7 @@ pattern = [ | |||
|     { | ||||
|         "LEFT_ID": "anchor_founded", | ||||
|         "REL_OP": ">", | ||||
|         "RIGHT_ID": "subject", | ||||
|         "RIGHT_ID": "founded_subject", | ||||
|         "RIGHT_ATTRS": {"DEP": "nsubj"}, | ||||
|     } | ||||
|     # ... | ||||
|  | @ -1212,7 +1212,7 @@ pattern = [ | |||
|     { | ||||
|         "LEFT_ID": "anchor_founded", | ||||
|         "REL_OP": ">", | ||||
|         "RIGHT_ID": "subject", | ||||
|         "RIGHT_ID": "founded_subject", | ||||
|         "RIGHT_ATTRS": {"DEP": "nsubj"}, | ||||
|     }, | ||||
|     { | ||||
|  |  | |||
|  | @ -717,7 +717,7 @@ tabular results to a file: | |||
| ```python | ||||
| ### functions.py | ||||
| import sys | ||||
| from typing import IO, Tuple, Callable, Dict, Any | ||||
| from typing import IO, Tuple, Callable, Dict, Any, Optional | ||||
| import spacy | ||||
| from spacy import Language | ||||
| from pathlib import Path | ||||
|  | @ -729,7 +729,7 @@ def custom_logger(log_path): | |||
|         stdout: IO=sys.stdout, | ||||
|         stderr: IO=sys.stderr | ||||
|     ) -> Tuple[Callable, Callable]: | ||||
|         stdout.write(f"Logging to {log_path}\n") | ||||
|         stdout.write(f"Logging to {log_path}\\n") | ||||
|         log_file = Path(log_path).open("w", encoding="utf8") | ||||
|         log_file.write("step\\t") | ||||
|         log_file.write("score\\t") | ||||
|  |  | |||
|  | @ -433,14 +433,14 @@ The following methods, attributes and commands are new in spaCy v3.0. | |||
| | Name                                                                                                                            | Description                                                                                                                                                                                      | | ||||
| | ------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ||||
| | [`Token.lex`](/api/token#attributes)                                                                                            | Access a token's [`Lexeme`](/api/lexeme).                                                                                                                                                        | | ||||
| | [`Token.morph`](/api/token#attributes), [`Token.morph_`](/api/token#attributes)                                                 | Access a token's morphological analysis.                                                                                                                                                         | | ||||
| | [`Token.morph`](/api/token#attributes)                                                                                          | Access a token's morphological analysis.                                                                                                                                                         | | ||||
| | [`Doc.has_annotation`](/api/doc#has_annotation)                                                                                 | Check whether a doc has annotation on a token attribute.                                                                                                                                         | | ||||
| | [`Language.select_pipes`](/api/language#select_pipes)                                                                           | Context manager for enabling or disabling specific pipeline components for a block.                                                                                                              | | ||||
| | [`Language.disable_pipe`](/api/language#disable_pipe), [`Language.enable_pipe`](/api/language#enable_pipe)                      | Disable or enable a loaded pipeline component (but don't remove it).                                                                                                                             | | ||||
| | [`Language.analyze_pipes`](/api/language#analyze_pipes)                                                                         | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies.                                                                                                          | | ||||
| | [`Language.resume_training`](/api/language#resume_training)                                                                     | Experimental: continue training a trained pipeline and initialize "rehearsal" for components that implement a `rehearse` method to prevent catastrophic forgetting.                              | | ||||
| | [`@Language.factory`](/api/language#factory), [`@Language.component`](/api/language#component)                                  | Decorators for [registering](/usage/processing-pipelines#custom-components) pipeline component factories and simple stateless component functions.                                               | | ||||
| | [`Language.has_factory`](/api/language#has_factory)                                                                             | Check whether a component factory is registered on a language class.                                                                                                                            | | ||||
| | [`Language.has_factory`](/api/language#has_factory)                                                                             | Check whether a component factory is registered on a language class.                                                                                                                             | | ||||
| | [`Language.get_factory_meta`](/api/language#get_factory_meta), [`Language.get_pipe_meta`](/api/language#get_factory_meta)       | Get the [`FactoryMeta`](/api/language#factorymeta) with component metadata for a factory or instance name.                                                                                       | | ||||
| | [`Language.config`](/api/language#config)                                                                                       | The [config](/usage/training#config) used to create the current `nlp` object. An instance of [`Config`](https://thinc.ai/docs/api-config#config) that can be saved to disk and used for training. | | ||||
| | [`Language.components`](/api/language#attributes), [`Language.component_names`](/api/language#attributes)                       | All available components and component names, including disabled components that are not run as part of the pipeline.                                                                            | | ||||
|  | @ -1032,9 +1032,9 @@ change your names and imports: | |||
| Thanks to everyone who's been contributing to the spaCy ecosystem by developing | ||||
| and maintaining one of the many awesome [plugins and extensions](/universe). | ||||
| We've tried to make it as easy as possible for you to upgrade your packages for | ||||
| spaCy v3.0. The most common use case for plugins is providing pipeline components | ||||
| and extension attributes. When migrating your plugin, double-check the | ||||
| following: | ||||
| spaCy v3.0. The most common use case for plugins is providing pipeline | ||||
| components and extension attributes. When migrating your plugin, double-check | ||||
| the following: | ||||
| 
 | ||||
| - Use the [`@Language.factory`](/api/language#factory) decorator to register | ||||
|   your component and assign it a name. This allows users to refer to your | ||||
|  |  | |||
|  | @ -257,7 +257,7 @@ output_path.open("w", encoding="utf-8").write(svg) | |||
| Since each visualization is generated as a separate SVG, exporting `.svg` files | ||||
| only works if you're rendering **one single doc** at a time. (This makes sense – | ||||
| after all, each visualization should be a standalone graphic.) So instead of | ||||
| rendering all `Doc`s at one, loop over them and export them separately. | ||||
| rendering all `Doc`s at once, loop over them and export them separately. | ||||
| 
 | ||||
| </Infobox> | ||||
| 
 | ||||
|  |  | |||
|  | @ -120,7 +120,7 @@ function formatAccuracy(data) { | |||
|                 ? null | ||||
|                 : { | ||||
|                       label, | ||||
|                       value: value.toFixed(2), | ||||
|                       value: (value * 100).toFixed(2), | ||||
|                       help: MODEL_META[label], | ||||
|                   } | ||||
|         }) | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user