mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-24 20:51:30 +03:00 
			
		
		
		
	Merge branch 'develop' into nightly.spacy.io
This commit is contained in:
		
						commit
						1075b7ebb7
					
				|  | @ -8,7 +8,6 @@ requires = [ | ||||||
|     "murmurhash>=0.28.0,<1.1.0", |     "murmurhash>=0.28.0,<1.1.0", | ||||||
|     "thinc>=8.0.0rc0,<8.1.0", |     "thinc>=8.0.0rc0,<8.1.0", | ||||||
|     "blis>=0.4.0,<0.8.0", |     "blis>=0.4.0,<0.8.0", | ||||||
|     "pytokenizations", |  | ||||||
|     "pathy" |     "pathy" | ||||||
| ] | ] | ||||||
| build-backend = "setuptools.build_meta" | build-backend = "setuptools.build_meta" | ||||||
|  |  | ||||||
|  | @ -14,8 +14,7 @@ pathy | ||||||
| numpy>=1.15.0 | numpy>=1.15.0 | ||||||
| requests>=2.13.0,<3.0.0 | requests>=2.13.0,<3.0.0 | ||||||
| tqdm>=4.38.0,<5.0.0 | tqdm>=4.38.0,<5.0.0 | ||||||
| pydantic>=1.5.0,<2.0.0 | pydantic>=1.5.0,<1.7.0 | ||||||
| pytokenizations |  | ||||||
| # Official Python utilities | # Official Python utilities | ||||||
| setuptools | setuptools | ||||||
| packaging>=20.0 | packaging>=20.0 | ||||||
|  |  | ||||||
|  | @ -51,8 +51,8 @@ install_requires = | ||||||
|     tqdm>=4.38.0,<5.0.0 |     tqdm>=4.38.0,<5.0.0 | ||||||
|     numpy>=1.15.0 |     numpy>=1.15.0 | ||||||
|     requests>=2.13.0,<3.0.0 |     requests>=2.13.0,<3.0.0 | ||||||
|     pydantic>=1.5.0,<2.0.0 |     pydantic>=1.5.0,<1.7.0 | ||||||
|     pytokenizations |     jinja2 | ||||||
|     # Official Python utilities |     # Official Python utilities | ||||||
|     setuptools |     setuptools | ||||||
|     packaging>=20.0 |     packaging>=20.0 | ||||||
|  |  | ||||||
							
								
								
									
										1
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										1
									
								
								setup.py
									
									
									
									
									
								
							|  | @ -49,6 +49,7 @@ MOD_NAMES = [ | ||||||
|     "spacy.pipeline._parser_internals.stateclass", |     "spacy.pipeline._parser_internals.stateclass", | ||||||
|     "spacy.pipeline._parser_internals.transition_system", |     "spacy.pipeline._parser_internals.transition_system", | ||||||
|     "spacy.tokenizer", |     "spacy.tokenizer", | ||||||
|  |     "spacy.training.align", | ||||||
|     "spacy.training.gold_io", |     "spacy.training.gold_io", | ||||||
|     "spacy.tokens.doc", |     "spacy.tokens.doc", | ||||||
|     "spacy.tokens.span", |     "spacy.tokens.span", | ||||||
|  |  | ||||||
|  | @ -1,6 +1,6 @@ | ||||||
| # fmt: off | # fmt: off | ||||||
| __title__ = "spacy-nightly" | __title__ = "spacy-nightly" | ||||||
| __version__ = "3.0.0rc1" | __version__ = "3.0.0rc2" | ||||||
| __download_url__ = "https://github.com/explosion/spacy-models/releases/download" | __download_url__ = "https://github.com/explosion/spacy-models/releases/download" | ||||||
| __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" | __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" | ||||||
| __projects__ = "https://github.com/explosion/projects" | __projects__ = "https://github.com/explosion/projects" | ||||||
|  |  | ||||||
|  | @ -93,27 +93,42 @@ def evaluate( | ||||||
|         "SPEED": "speed", |         "SPEED": "speed", | ||||||
|     } |     } | ||||||
|     results = {} |     results = {} | ||||||
|  |     data = {} | ||||||
|     for metric, key in metrics.items(): |     for metric, key in metrics.items(): | ||||||
|         if key in scores: |         if key in scores: | ||||||
|             if key == "cats_score": |             if key == "cats_score": | ||||||
|                 metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")" |                 metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")" | ||||||
|             if key == "speed": |             if isinstance(scores[key], (int, float)): | ||||||
|                 results[metric] = f"{scores[key]:.0f}" |                 if key == "speed": | ||||||
|  |                     results[metric] = f"{scores[key]:.0f}" | ||||||
|  |                 else: | ||||||
|  |                     results[metric] = f"{scores[key]*100:.2f}" | ||||||
|             else: |             else: | ||||||
|                 results[metric] = f"{scores[key]*100:.2f}" |                 results[metric] = "-" | ||||||
|     data = {re.sub(r"[\s/]", "_", k.lower()): v for k, v in results.items()} |             data[re.sub(r"[\s/]", "_", key.lower())] = scores[key] | ||||||
| 
 | 
 | ||||||
|     msg.table(results, title="Results") |     msg.table(results, title="Results") | ||||||
| 
 | 
 | ||||||
|  |     if "morph_per_feat" in scores: | ||||||
|  |         if scores["morph_per_feat"]: | ||||||
|  |             print_prf_per_type(msg, scores["morph_per_feat"], "MORPH", "feat") | ||||||
|  |             data["morph_per_feat"] = scores["morph_per_feat"] | ||||||
|  |     if "dep_las_per_type" in scores: | ||||||
|  |         if scores["dep_las_per_type"]: | ||||||
|  |             print_prf_per_type(msg, scores["dep_las_per_type"], "LAS", "type") | ||||||
|  |             data["dep_las_per_type"] = scores["dep_las_per_type"] | ||||||
|     if "ents_per_type" in scores: |     if "ents_per_type" in scores: | ||||||
|         if scores["ents_per_type"]: |         if scores["ents_per_type"]: | ||||||
|             print_ents_per_type(msg, scores["ents_per_type"]) |             print_prf_per_type(msg, scores["ents_per_type"], "NER", "type") | ||||||
|  |             data["ents_per_type"] = scores["ents_per_type"] | ||||||
|     if "cats_f_per_type" in scores: |     if "cats_f_per_type" in scores: | ||||||
|         if scores["cats_f_per_type"]: |         if scores["cats_f_per_type"]: | ||||||
|             print_textcats_f_per_cat(msg, scores["cats_f_per_type"]) |             print_prf_per_type(msg, scores["cats_f_per_type"], "Textcat F", "label") | ||||||
|  |             data["cats_f_per_type"] = scores["cats_f_per_type"] | ||||||
|     if "cats_auc_per_type" in scores: |     if "cats_auc_per_type" in scores: | ||||||
|         if scores["cats_auc_per_type"]: |         if scores["cats_auc_per_type"]: | ||||||
|             print_textcats_auc_per_cat(msg, scores["cats_auc_per_type"]) |             print_textcats_auc_per_cat(msg, scores["cats_auc_per_type"]) | ||||||
|  |             data["cats_auc_per_type"] = scores["cats_auc_per_type"] | ||||||
| 
 | 
 | ||||||
|     if displacy_path: |     if displacy_path: | ||||||
|         factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names] |         factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names] | ||||||
|  | @ -157,7 +172,7 @@ def render_parses( | ||||||
|             file_.write(html) |             file_.write(html) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def print_ents_per_type(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None: | def print_prf_per_type(msg: Printer, scores: Dict[str, Dict[str, float]], name: str, type: str) -> None: | ||||||
|     data = [ |     data = [ | ||||||
|         (k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}") |         (k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}") | ||||||
|         for k, v in scores.items() |         for k, v in scores.items() | ||||||
|  | @ -166,20 +181,7 @@ def print_ents_per_type(msg: Printer, scores: Dict[str, Dict[str, float]]) -> No | ||||||
|         data, |         data, | ||||||
|         header=("", "P", "R", "F"), |         header=("", "P", "R", "F"), | ||||||
|         aligns=("l", "r", "r", "r"), |         aligns=("l", "r", "r", "r"), | ||||||
|         title="NER (per type)", |         title=f"{name} (per {type})", | ||||||
|     ) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def print_textcats_f_per_cat(msg: Printer, scores: Dict[str, Dict[str, float]]) -> None: |  | ||||||
|     data = [ |  | ||||||
|         (k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}") |  | ||||||
|         for k, v in scores.items() |  | ||||||
|     ] |  | ||||||
|     msg.table( |  | ||||||
|         data, |  | ||||||
|         header=("", "P", "R", "F"), |  | ||||||
|         aligns=("l", "r", "r", "r"), |  | ||||||
|         title="Textcat F (per label)", |  | ||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -39,7 +39,7 @@ def init_vectors_cli( | ||||||
|     nlp.to_disk(output_dir) |     nlp.to_disk(output_dir) | ||||||
|     msg.good( |     msg.good( | ||||||
|         "Saved nlp object with vectors to output directory. You can now use the " |         "Saved nlp object with vectors to output directory. You can now use the " | ||||||
|         "path to it in your config as the 'vectors' setting in [initialize.vocab].", |         "path to it in your config as the 'vectors' setting in [initialize].", | ||||||
|         output_dir.resolve(), |         output_dir.resolve(), | ||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
|  | @ -100,7 +100,7 @@ def init_labels_cli( | ||||||
|     extract the labels.""" |     extract the labels.""" | ||||||
|     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) |     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) | ||||||
|     if not output_path.exists(): |     if not output_path.exists(): | ||||||
|         output_path.mkdir() |         output_path.mkdir(parents=True) | ||||||
|     overrides = parse_config_overrides(ctx.args) |     overrides = parse_config_overrides(ctx.args) | ||||||
|     import_code(code_path) |     import_code(code_path) | ||||||
|     setup_gpu(use_gpu) |     setup_gpu(use_gpu) | ||||||
|  |  | ||||||
|  | @ -136,15 +136,19 @@ factory = "textcat" | ||||||
| 
 | 
 | ||||||
| {% if optimize == "accuracy" %} | {% if optimize == "accuracy" %} | ||||||
| [components.textcat.model] | [components.textcat.model] | ||||||
| @architectures = "spacy.TextCatEnsemble.v1" | @architectures = "spacy.TextCatEnsemble.v2" | ||||||
| exclusive_classes = false |  | ||||||
| width = 64 |  | ||||||
| conv_depth = 2 |  | ||||||
| embed_size = 2000 |  | ||||||
| window_size = 1 |  | ||||||
| ngram_size = 1 |  | ||||||
| nO = null | nO = null | ||||||
| 
 | 
 | ||||||
|  | [components.textcat.model.tok2vec] | ||||||
|  | @architectures = "spacy-transformers.TransformerListener.v1" | ||||||
|  | grad_factor = 1.0 | ||||||
|  | 
 | ||||||
|  | [components.textcat.model.linear_model] | ||||||
|  | @architectures = "spacy.TextCatBOW.v1" | ||||||
|  | exclusive_classes = false | ||||||
|  | ngram_size = 1 | ||||||
|  | no_output_layer = false | ||||||
|  | 
 | ||||||
| {% else -%} | {% else -%} | ||||||
| [components.textcat.model] | [components.textcat.model] | ||||||
| @architectures = "spacy.TextCatBOW.v1" | @architectures = "spacy.TextCatBOW.v1" | ||||||
|  | @ -271,15 +275,19 @@ factory = "textcat" | ||||||
| 
 | 
 | ||||||
| {% if optimize == "accuracy" %} | {% if optimize == "accuracy" %} | ||||||
| [components.textcat.model] | [components.textcat.model] | ||||||
| @architectures = "spacy.TextCatEnsemble.v1" | @architectures = "spacy.TextCatEnsemble.v2" | ||||||
| exclusive_classes = false |  | ||||||
| width = 64 |  | ||||||
| conv_depth = 2 |  | ||||||
| embed_size = 2000 |  | ||||||
| window_size = 1 |  | ||||||
| ngram_size = 1 |  | ||||||
| nO = null | nO = null | ||||||
| 
 | 
 | ||||||
|  | [components.textcat.model.tok2vec] | ||||||
|  | @architectures = "spacy.Tok2VecListener.v1" | ||||||
|  | width = ${components.tok2vec.model.encode.width} | ||||||
|  | 
 | ||||||
|  | [components.textcat.model.linear_model] | ||||||
|  | @architectures = "spacy.TextCatBOW.v1" | ||||||
|  | exclusive_classes = false | ||||||
|  | ngram_size = 1 | ||||||
|  | no_output_layer = false | ||||||
|  | 
 | ||||||
| {% else -%} | {% else -%} | ||||||
| [components.textcat.model] | [components.textcat.model] | ||||||
| @architectures = "spacy.TextCatBOW.v1" | @architectures = "spacy.TextCatBOW.v1" | ||||||
|  |  | ||||||
|  | @ -44,7 +44,7 @@ def train_cli( | ||||||
|     if not config_path or not config_path.exists(): |     if not config_path or not config_path.exists(): | ||||||
|         msg.fail("Config file not found", config_path, exits=1) |         msg.fail("Config file not found", config_path, exits=1) | ||||||
|     if output_path is not None and not output_path.exists(): |     if output_path is not None and not output_path.exists(): | ||||||
|         output_path.mkdir() |         output_path.mkdir(parents=True) | ||||||
|         msg.good(f"Created output directory: {output_path}") |         msg.good(f"Created output directory: {output_path}") | ||||||
|     overrides = parse_config_overrides(ctx.args) |     overrides = parse_config_overrides(ctx.args) | ||||||
|     import_code(code_path) |     import_code(code_path) | ||||||
|  |  | ||||||
|  | @ -398,8 +398,8 @@ class Errors: | ||||||
|     E163 = ("cumsum was found to be unstable: its last element does not " |     E163 = ("cumsum was found to be unstable: its last element does not " | ||||||
|             "correspond to sum") |             "correspond to sum") | ||||||
|     E164 = ("x is neither increasing nor decreasing: {x}.") |     E164 = ("x is neither increasing nor decreasing: {x}.") | ||||||
|     E165 = ("Only one class present in y_true. ROC AUC score is not defined in " |     E165 = ("Only one class present in the gold labels: {label}. " | ||||||
|             "that case.") |             "ROC AUC score is not defined in that case.") | ||||||
|     E166 = ("Can only merge DocBins with the same value for '{param}'.\n" |     E166 = ("Can only merge DocBins with the same value for '{param}'.\n" | ||||||
|             "Current DocBin: {current}\nOther DocBin: {other}") |             "Current DocBin: {current}\nOther DocBin: {other}") | ||||||
|     E169 = ("Can't find module: {module}") |     E169 = ("Can't find module: {module}") | ||||||
|  | @ -456,6 +456,8 @@ class Errors: | ||||||
|             "issue tracker: http://github.com/explosion/spaCy/issues") |             "issue tracker: http://github.com/explosion/spaCy/issues") | ||||||
| 
 | 
 | ||||||
|     # TODO: fix numbering after merging develop into master |     # TODO: fix numbering after merging develop into master | ||||||
|  |     E897 = ("Field '{field}' should be a dot-notation string referring to the " | ||||||
|  |             "relevant section in the config, but found type {type} instead.") | ||||||
|     E898 = ("Can't serialize trainable pipe '{name}': the `model` attribute " |     E898 = ("Can't serialize trainable pipe '{name}': the `model` attribute " | ||||||
|             "is not set or None. If you've implemented a custom component, make " |             "is not set or None. If you've implemented a custom component, make " | ||||||
|             "sure to store the component model as `self.model` in your " |             "sure to store the component model as `self.model` in your " | ||||||
|  | @ -562,7 +564,10 @@ class Errors: | ||||||
|             "a string value from {expected} but got: '{arg}'") |             "a string value from {expected} but got: '{arg}'") | ||||||
|     E948 = ("`Matcher.add` received invalid 'patterns' argument: expected " |     E948 = ("`Matcher.add` received invalid 'patterns' argument: expected " | ||||||
|             "a list, but got: {arg_type}") |             "a list, but got: {arg_type}") | ||||||
|     E949 = ("Can only create an alignment when the texts are the same.") |     E949 = ("Unable to align tokens for the predicted and reference docs. It " | ||||||
|  |             "is only possible to align the docs when both texts are the same " | ||||||
|  |             "except for whitespace and capitalization. The predicted tokens " | ||||||
|  |             "start with: {x}. The reference tokens start with: {y}.") | ||||||
|     E952 = ("The section '{name}' is not a valid section in the provided config.") |     E952 = ("The section '{name}' is not a valid section in the provided config.") | ||||||
|     E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}") |     E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}") | ||||||
|     E954 = ("The Tok2Vec listener did not receive any valid input from an upstream " |     E954 = ("The Tok2Vec listener did not receive any valid input from an upstream " | ||||||
|  |  | ||||||
|  | @ -286,10 +286,10 @@ cdef class DependencyMatcher: | ||||||
|                 self.recurse(_tree, id_to_position, _node_operator_map, 0, [], matched_trees) |                 self.recurse(_tree, id_to_position, _node_operator_map, 0, [], matched_trees) | ||||||
|                 for matched_tree in matched_trees: |                 for matched_tree in matched_trees: | ||||||
|                     matched_key_trees.append((key, matched_tree)) |                     matched_key_trees.append((key, matched_tree)) | ||||||
|             for i, (match_id, nodes) in enumerate(matched_key_trees): |         for i, (match_id, nodes) in enumerate(matched_key_trees): | ||||||
|                 on_match = self._callbacks.get(match_id) |             on_match = self._callbacks.get(match_id) | ||||||
|                 if on_match is not None: |             if on_match is not None: | ||||||
|                     on_match(self, doc, i, matched_key_trees) |                 on_match(self, doc, i, matched_key_trees) | ||||||
|         return matched_key_trees |         return matched_key_trees | ||||||
| 
 | 
 | ||||||
|     def recurse(self, tree, id_to_position, _node_operator_map, int patternLength, visited_nodes, matched_trees): |     def recurse(self, tree, id_to_position, _node_operator_map, int patternLength, visited_nodes, matched_trees): | ||||||
|  |  | ||||||
|  | @ -1,4 +1,6 @@ | ||||||
| from typing import Optional | from typing import Optional, List | ||||||
|  | 
 | ||||||
|  | from thinc.types import Floats2d | ||||||
| from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic | from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic | ||||||
| from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention | from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention | ||||||
| from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum | from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum | ||||||
|  | @ -10,12 +12,13 @@ from ...util import registry | ||||||
| from ..extract_ngrams import extract_ngrams | from ..extract_ngrams import extract_ngrams | ||||||
| from ..staticvectors import StaticVectors | from ..staticvectors import StaticVectors | ||||||
| from ..featureextractor import FeatureExtractor | from ..featureextractor import FeatureExtractor | ||||||
|  | from ...tokens import Doc | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @registry.architectures.register("spacy.TextCatCNN.v1") | @registry.architectures.register("spacy.TextCatCNN.v1") | ||||||
| def build_simple_cnn_text_classifier( | def build_simple_cnn_text_classifier( | ||||||
|     tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None |     tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None | ||||||
| ) -> Model: | ) -> Model[List[Doc], Floats2d]: | ||||||
|     """ |     """ | ||||||
|     Build a simple CNN text classifier, given a token-to-vector model as inputs. |     Build a simple CNN text classifier, given a token-to-vector model as inputs. | ||||||
|     If exclusive_classes=True, a softmax non-linearity is applied, so that the |     If exclusive_classes=True, a softmax non-linearity is applied, so that the | ||||||
|  | @ -23,15 +26,14 @@ def build_simple_cnn_text_classifier( | ||||||
|     is applied instead, so that outputs are in the range [0, 1]. |     is applied instead, so that outputs are in the range [0, 1]. | ||||||
|     """ |     """ | ||||||
|     with Model.define_operators({">>": chain}): |     with Model.define_operators({">>": chain}): | ||||||
|  |         cnn = tok2vec >> list2ragged() >> reduce_mean() | ||||||
|         if exclusive_classes: |         if exclusive_classes: | ||||||
|             output_layer = Softmax(nO=nO, nI=tok2vec.maybe_get_dim("nO")) |             output_layer = Softmax(nO=nO, nI=tok2vec.maybe_get_dim("nO")) | ||||||
|             model = tok2vec >> list2ragged() >> reduce_mean() >> output_layer |             model = cnn >> output_layer | ||||||
|             model.set_ref("output_layer", output_layer) |             model.set_ref("output_layer", output_layer) | ||||||
|         else: |         else: | ||||||
|             linear_layer = Linear(nO=nO, nI=tok2vec.maybe_get_dim("nO")) |             linear_layer = Linear(nO=nO, nI=tok2vec.maybe_get_dim("nO")) | ||||||
|             model = ( |             model = cnn >> linear_layer >> Logistic() | ||||||
|                 tok2vec >> list2ragged() >> reduce_mean() >> linear_layer >> Logistic() |  | ||||||
|             ) |  | ||||||
|             model.set_ref("output_layer", linear_layer) |             model.set_ref("output_layer", linear_layer) | ||||||
|     model.set_ref("tok2vec", tok2vec) |     model.set_ref("tok2vec", tok2vec) | ||||||
|     model.set_dim("nO", nO) |     model.set_dim("nO", nO) | ||||||
|  | @ -45,8 +47,7 @@ def build_bow_text_classifier( | ||||||
|     ngram_size: int, |     ngram_size: int, | ||||||
|     no_output_layer: bool, |     no_output_layer: bool, | ||||||
|     nO: Optional[int] = None, |     nO: Optional[int] = None, | ||||||
| ) -> Model: | ) -> Model[List[Doc], Floats2d]: | ||||||
|     # Don't document this yet, I'm not sure it's right. |  | ||||||
|     with Model.define_operators({">>": chain}): |     with Model.define_operators({">>": chain}): | ||||||
|         sparse_linear = SparseLinear(nO) |         sparse_linear = SparseLinear(nO) | ||||||
|         model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear |         model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear | ||||||
|  | @ -59,6 +60,39 @@ def build_bow_text_classifier( | ||||||
|     return model |     return model | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @registry.architectures.register("spacy.TextCatEnsemble.v2") | ||||||
|  | def build_text_classifier( | ||||||
|  |     tok2vec: Model[List[Doc], List[Floats2d]], | ||||||
|  |     linear_model: Model[List[Doc], Floats2d], | ||||||
|  |     nO: Optional[int] = None, | ||||||
|  | ) -> Model[List[Doc], Floats2d]: | ||||||
|  |     exclusive_classes = not linear_model.attrs["multi_label"] | ||||||
|  |     with Model.define_operators({">>": chain, "|": concatenate}): | ||||||
|  |         width = tok2vec.get_dim("nO") | ||||||
|  |         cnn_model = ( | ||||||
|  |                 tok2vec | ||||||
|  |                 >> list2ragged() | ||||||
|  |                 >> ParametricAttention(width)   # TODO: benchmark performance difference of this layer | ||||||
|  |                 >> reduce_sum() | ||||||
|  |                 >> residual(Maxout(nO=width, nI=width)) | ||||||
|  |                 >> Linear(nO=nO, nI=width) | ||||||
|  |                 >> Dropout(0.0) | ||||||
|  |         ) | ||||||
|  | 
 | ||||||
|  |         nO_double = nO * 2 if nO else None | ||||||
|  |         if exclusive_classes: | ||||||
|  |             output_layer = Softmax(nO=nO, nI=nO_double) | ||||||
|  |         else: | ||||||
|  |             output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic() | ||||||
|  |         model = (linear_model | cnn_model) >> output_layer | ||||||
|  |         model.set_ref("tok2vec", tok2vec) | ||||||
|  |     if model.has_dim("nO") is not False: | ||||||
|  |         model.set_dim("nO", nO) | ||||||
|  |     model.set_ref("output_layer", linear_model.get_ref("output_layer")) | ||||||
|  |     model.attrs["multi_label"] = not exclusive_classes | ||||||
|  |     return model | ||||||
|  | 
 | ||||||
|  | # TODO: move to legacy | ||||||
| @registry.architectures.register("spacy.TextCatEnsemble.v1") | @registry.architectures.register("spacy.TextCatEnsemble.v1") | ||||||
| def build_text_classifier( | def build_text_classifier( | ||||||
|     width: int, |     width: int, | ||||||
|  | @ -158,11 +192,8 @@ def build_text_classifier( | ||||||
| 
 | 
 | ||||||
| @registry.architectures.register("spacy.TextCatLowData.v1") | @registry.architectures.register("spacy.TextCatLowData.v1") | ||||||
| def build_text_classifier_lowdata( | def build_text_classifier_lowdata( | ||||||
|     width: int, |     width: int, dropout: Optional[float], nO: Optional[int] = None | ||||||
|     pretrained_vectors: Optional[bool], | ) -> Model[List[Doc], Floats2d]: | ||||||
|     dropout: Optional[float], |  | ||||||
|     nO: Optional[int] = None, |  | ||||||
| ) -> Model: |  | ||||||
|     # Don't document this yet, I'm not sure it's right. |     # Don't document this yet, I'm not sure it's right. | ||||||
|     # Note, before v.3, this was the default if setting "low_data" and "pretrained_dims" |     # Note, before v.3, this was the default if setting "low_data" and "pretrained_dims" | ||||||
|     with Model.define_operators({">>": chain, "**": clone}): |     with Model.define_operators({">>": chain, "**": clone}): | ||||||
|  |  | ||||||
|  | @ -106,7 +106,7 @@ def MultiHashEmbed( | ||||||
| ) -> Model[List[Doc], List[Floats2d]]: | ) -> Model[List[Doc], List[Floats2d]]: | ||||||
|     """Construct an embedding layer that separately embeds a number of lexical |     """Construct an embedding layer that separately embeds a number of lexical | ||||||
|     attributes using hash embedding, concatenates the results, and passes it |     attributes using hash embedding, concatenates the results, and passes it | ||||||
|     through a feed-forward subnetwork to build a mixed representations. |     through a feed-forward subnetwork to build a mixed representation. | ||||||
| 
 | 
 | ||||||
|     The features used can be configured with the 'attrs' argument. The suggested |     The features used can be configured with the 'attrs' argument. The suggested | ||||||
|     attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into |     attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into | ||||||
|  |  | ||||||
|  | @ -226,6 +226,9 @@ class AttributeRuler(Pipe): | ||||||
| 
 | 
 | ||||||
|         DOCS: https://nightly.spacy.io/api/tagger#score |         DOCS: https://nightly.spacy.io/api/tagger#score | ||||||
|         """ |         """ | ||||||
|  |         def morph_key_getter(token, attr): | ||||||
|  |             return getattr(token, attr).key | ||||||
|  | 
 | ||||||
|         validate_examples(examples, "AttributeRuler.score") |         validate_examples(examples, "AttributeRuler.score") | ||||||
|         results = {} |         results = {} | ||||||
|         attrs = set() |         attrs = set() | ||||||
|  | @ -237,7 +240,8 @@ class AttributeRuler(Pipe): | ||||||
|             elif attr == POS: |             elif attr == POS: | ||||||
|                 results.update(Scorer.score_token_attr(examples, "pos", **kwargs)) |                 results.update(Scorer.score_token_attr(examples, "pos", **kwargs)) | ||||||
|             elif attr == MORPH: |             elif attr == MORPH: | ||||||
|                 results.update(Scorer.score_token_attr(examples, "morph", **kwargs)) |                 results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs)) | ||||||
|  |                 results.update(Scorer.score_token_attr_per_feat(examples, "morph", getter=morph_key_getter, **kwargs)) | ||||||
|             elif attr == LEMMA: |             elif attr == LEMMA: | ||||||
|                 results.update(Scorer.score_token_attr(examples, "lemma", **kwargs)) |                 results.update(Scorer.score_token_attr(examples, "lemma", **kwargs)) | ||||||
|         return results |         return results | ||||||
|  |  | ||||||
|  | @ -155,13 +155,16 @@ cdef class DependencyParser(Parser): | ||||||
| 
 | 
 | ||||||
|         DOCS: https://nightly.spacy.io/api/dependencyparser#score |         DOCS: https://nightly.spacy.io/api/dependencyparser#score | ||||||
|         """ |         """ | ||||||
|  |         def has_sents(doc): | ||||||
|  |             return doc.has_annotation("SENT_START") | ||||||
|  | 
 | ||||||
|         validate_examples(examples, "DependencyParser.score") |         validate_examples(examples, "DependencyParser.score") | ||||||
|         def dep_getter(token, attr): |         def dep_getter(token, attr): | ||||||
|             dep = getattr(token, attr) |             dep = getattr(token, attr) | ||||||
|             dep = token.vocab.strings.as_string(dep).lower() |             dep = token.vocab.strings.as_string(dep).lower() | ||||||
|             return dep |             return dep | ||||||
|         results = {} |         results = {} | ||||||
|         results.update(Scorer.score_spans(examples, "sents", **kwargs)) |         results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)) | ||||||
|         kwargs.setdefault("getter", dep_getter) |         kwargs.setdefault("getter", dep_getter) | ||||||
|         kwargs.setdefault("ignore_labels", ("p", "punct")) |         kwargs.setdefault("ignore_labels", ("p", "punct")) | ||||||
|         results.update(Scorer.score_deps(examples, "dep", **kwargs)) |         results.update(Scorer.score_deps(examples, "dep", **kwargs)) | ||||||
|  |  | ||||||
|  | @ -10,7 +10,7 @@ from ..errors import Errors | ||||||
| from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList | from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList | ||||||
| from ..tokens import Doc, Span | from ..tokens import Doc, Span | ||||||
| from ..matcher import Matcher, PhraseMatcher | from ..matcher import Matcher, PhraseMatcher | ||||||
| from ..scorer import Scorer | from ..scorer import get_ner_prf | ||||||
| from ..training import validate_examples | from ..training import validate_examples | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -340,7 +340,7 @@ class EntityRuler(Pipe): | ||||||
| 
 | 
 | ||||||
|     def score(self, examples, **kwargs): |     def score(self, examples, **kwargs): | ||||||
|         validate_examples(examples, "EntityRuler.score") |         validate_examples(examples, "EntityRuler.score") | ||||||
|         return Scorer.score_spans(examples, "ents", **kwargs) |         return get_ner_prf(examples) | ||||||
| 
 | 
 | ||||||
|     def from_bytes( |     def from_bytes( | ||||||
|         self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList() |         self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList() | ||||||
|  |  | ||||||
|  | @ -251,10 +251,13 @@ class Morphologizer(Tagger): | ||||||
| 
 | 
 | ||||||
|         DOCS: https://nightly.spacy.io/api/morphologizer#score |         DOCS: https://nightly.spacy.io/api/morphologizer#score | ||||||
|         """ |         """ | ||||||
|  |         def morph_key_getter(token, attr): | ||||||
|  |             return getattr(token, attr).key | ||||||
|  | 
 | ||||||
|         validate_examples(examples, "Morphologizer.score") |         validate_examples(examples, "Morphologizer.score") | ||||||
|         results = {} |         results = {} | ||||||
|         results.update(Scorer.score_token_attr(examples, "pos", **kwargs)) |         results.update(Scorer.score_token_attr(examples, "pos", **kwargs)) | ||||||
|         results.update(Scorer.score_token_attr(examples, "morph", **kwargs)) |         results.update(Scorer.score_token_attr(examples, "morph", getter=morph_key_getter, **kwargs)) | ||||||
|         results.update(Scorer.score_token_attr_per_feat(examples, |         results.update(Scorer.score_token_attr_per_feat(examples, | ||||||
|             "morph", **kwargs)) |             "morph", getter=morph_key_getter, **kwargs)) | ||||||
|         return results |         return results | ||||||
|  |  | ||||||
|  | @ -122,13 +122,4 @@ cdef class EntityRecognizer(Parser): | ||||||
|         DOCS: https://nightly.spacy.io/api/entityrecognizer#score |         DOCS: https://nightly.spacy.io/api/entityrecognizer#score | ||||||
|         """ |         """ | ||||||
|         validate_examples(examples, "EntityRecognizer.score") |         validate_examples(examples, "EntityRecognizer.score") | ||||||
|         score_per_type = get_ner_prf(examples) |         return get_ner_prf(examples) | ||||||
|         totals = PRFScore() |  | ||||||
|         for prf in score_per_type.values(): |  | ||||||
|             totals += prf |  | ||||||
|         return { |  | ||||||
|             "ents_p": totals.precision, |  | ||||||
|             "ents_r": totals.recall, |  | ||||||
|             "ents_f": totals.fscore, |  | ||||||
|             "ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()}, |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|  | @ -155,8 +155,11 @@ class Sentencizer(Pipe): | ||||||
| 
 | 
 | ||||||
|         DOCS: https://nightly.spacy.io/api/sentencizer#score |         DOCS: https://nightly.spacy.io/api/sentencizer#score | ||||||
|         """ |         """ | ||||||
|  |         def has_sents(doc): | ||||||
|  |             return doc.has_annotation("SENT_START") | ||||||
|  | 
 | ||||||
|         validate_examples(examples, "Sentencizer.score") |         validate_examples(examples, "Sentencizer.score") | ||||||
|         results = Scorer.score_spans(examples, "sents", **kwargs) |         results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs) | ||||||
|         del results["sents_per_type"] |         del results["sents_per_type"] | ||||||
|         return results |         return results | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -160,7 +160,10 @@ class SentenceRecognizer(Tagger): | ||||||
|         RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans. |         RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans. | ||||||
|         DOCS: https://nightly.spacy.io/api/sentencerecognizer#score |         DOCS: https://nightly.spacy.io/api/sentencerecognizer#score | ||||||
|         """ |         """ | ||||||
|  |         def has_sents(doc): | ||||||
|  |             return doc.has_annotation("SENT_START") | ||||||
|  | 
 | ||||||
|         validate_examples(examples, "SentenceRecognizer.score") |         validate_examples(examples, "SentenceRecognizer.score") | ||||||
|         results = Scorer.score_spans(examples, "sents", **kwargs) |         results = Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs) | ||||||
|         del results["sents_per_type"] |         del results["sents_per_type"] | ||||||
|         return results |         return results | ||||||
|  |  | ||||||
|  | @ -16,15 +16,30 @@ from ..vocab import Vocab | ||||||
| 
 | 
 | ||||||
| default_model_config = """ | default_model_config = """ | ||||||
| [model] | [model] | ||||||
| @architectures = "spacy.TextCatEnsemble.v1" | @architectures = "spacy.TextCatEnsemble.v2" | ||||||
| exclusive_classes = false | 
 | ||||||
| pretrained_vectors = null | [model.tok2vec] | ||||||
|  | @architectures = "spacy.Tok2Vec.v1" | ||||||
|  | 
 | ||||||
|  | [model.tok2vec.embed] | ||||||
|  | @architectures = "spacy.MultiHashEmbed.v1" | ||||||
| width = 64 | width = 64 | ||||||
| conv_depth = 2 | rows = [2000, 2000, 1000, 1000, 1000, 1000] | ||||||
| embed_size = 2000 | attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"] | ||||||
|  | include_static_vectors = false | ||||||
|  | 
 | ||||||
|  | [model.tok2vec.encode] | ||||||
|  | @architectures = "spacy.MaxoutWindowEncoder.v1" | ||||||
|  | width = ${model.tok2vec.embed.width} | ||||||
| window_size = 1 | window_size = 1 | ||||||
|  | maxout_pieces = 3 | ||||||
|  | depth = 2 | ||||||
|  | 
 | ||||||
|  | [model.linear_model] | ||||||
|  | @architectures = "spacy.TextCatBOW.v1" | ||||||
|  | exclusive_classes = false | ||||||
| ngram_size = 1 | ngram_size = 1 | ||||||
| dropout = null | no_output_layer = false | ||||||
| """ | """ | ||||||
| DEFAULT_TEXTCAT_MODEL = Config().from_str(default_model_config)["model"] | DEFAULT_TEXTCAT_MODEL = Config().from_str(default_model_config)["model"] | ||||||
| 
 | 
 | ||||||
|  | @ -60,9 +75,11 @@ subword_features = true | ||||||
|     default_score_weights={ |     default_score_weights={ | ||||||
|         "cats_score": 1.0, |         "cats_score": 1.0, | ||||||
|         "cats_score_desc": None, |         "cats_score_desc": None, | ||||||
|         "cats_p": None, |         "cats_micro_p": None, | ||||||
|         "cats_r": None, |         "cats_micro_r": None, | ||||||
|         "cats_f": None, |         "cats_micro_f": None, | ||||||
|  |         "cats_macro_p": None, | ||||||
|  |         "cats_macro_r": None, | ||||||
|         "cats_macro_f": None, |         "cats_macro_f": None, | ||||||
|         "cats_macro_auc": None, |         "cats_macro_auc": None, | ||||||
|         "cats_f_per_type": None, |         "cats_f_per_type": None, | ||||||
|  |  | ||||||
							
								
								
									
										287
									
								
								spacy/scorer.py
									
									
									
									
									
								
							
							
						
						
									
										287
									
								
								spacy/scorer.py
									
									
									
									
									
								
							|  | @ -1,9 +1,9 @@ | ||||||
| from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING | from typing import Optional, Iterable, Dict, Set, Any, Callable, TYPE_CHECKING | ||||||
| import numpy as np | import numpy as np | ||||||
| from collections import defaultdict | from collections import defaultdict | ||||||
| 
 | 
 | ||||||
| from .training import Example | from .training import Example | ||||||
| from .tokens import Token, Doc, Span | from .tokens import Token, Doc, Span, MorphAnalysis | ||||||
| from .errors import Errors | from .errors import Errors | ||||||
| from .util import get_lang_class, SimpleFrozenList | from .util import get_lang_class, SimpleFrozenList | ||||||
| from .morphology import Morphology | from .morphology import Morphology | ||||||
|  | @ -13,7 +13,8 @@ if TYPE_CHECKING: | ||||||
|     from .language import Language  # noqa: F401 |     from .language import Language  # noqa: F401 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| DEFAULT_PIPELINE = ["senter", "tagger", "morphologizer", "parser", "ner", "textcat"] | DEFAULT_PIPELINE = ("senter", "tagger", "morphologizer", "parser", "ner", "textcat") | ||||||
|  | MISSING_VALUES = frozenset([None, 0, ""]) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class PRFScore: | class PRFScore: | ||||||
|  | @ -24,6 +25,9 @@ class PRFScore: | ||||||
|         self.fp = 0 |         self.fp = 0 | ||||||
|         self.fn = 0 |         self.fn = 0 | ||||||
| 
 | 
 | ||||||
|  |     def __len__(self) -> int: | ||||||
|  |         return self.tp + self.fp + self.fn | ||||||
|  | 
 | ||||||
|     def __iadd__(self, other): |     def __iadd__(self, other): | ||||||
|         self.tp += other.tp |         self.tp += other.tp | ||||||
|         self.fp += other.fp |         self.fp += other.fp | ||||||
|  | @ -59,7 +63,9 @@ class PRFScore: | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class ROCAUCScore: | class ROCAUCScore: | ||||||
|     """An AUC ROC score.""" |     """An AUC ROC score. This is only defined for binary classification. | ||||||
|  |     Use the method is_binary before calculating the score, otherwise it | ||||||
|  |     may throw an error.""" | ||||||
| 
 | 
 | ||||||
|     def __init__(self) -> None: |     def __init__(self) -> None: | ||||||
|         self.golds = [] |         self.golds = [] | ||||||
|  | @ -71,16 +77,16 @@ class ROCAUCScore: | ||||||
|         self.cands.append(cand) |         self.cands.append(cand) | ||||||
|         self.golds.append(gold) |         self.golds.append(gold) | ||||||
| 
 | 
 | ||||||
|  |     def is_binary(self): | ||||||
|  |         return len(np.unique(self.golds)) == 2 | ||||||
|  | 
 | ||||||
|     @property |     @property | ||||||
|     def score(self): |     def score(self): | ||||||
|  |         if not self.is_binary(): | ||||||
|  |             raise ValueError(Errors.E165.format(label=set(self.golds))) | ||||||
|         if len(self.golds) == self.saved_score_at_len: |         if len(self.golds) == self.saved_score_at_len: | ||||||
|             return self.saved_score |             return self.saved_score | ||||||
|         try: |         self.saved_score = _roc_auc_score(self.golds, self.cands) | ||||||
|             self.saved_score = _roc_auc_score(self.golds, self.cands) |  | ||||||
|         # catch ValueError: Only one class present in y_true. |  | ||||||
|         # ROC AUC score is not defined in that case. |  | ||||||
|         except ValueError: |  | ||||||
|             self.saved_score = -float("inf") |  | ||||||
|         self.saved_score_at_len = len(self.golds) |         self.saved_score_at_len = len(self.golds) | ||||||
|         return self.saved_score |         return self.saved_score | ||||||
| 
 | 
 | ||||||
|  | @ -92,7 +98,7 @@ class Scorer: | ||||||
|         self, |         self, | ||||||
|         nlp: Optional["Language"] = None, |         nlp: Optional["Language"] = None, | ||||||
|         default_lang: str = "xx", |         default_lang: str = "xx", | ||||||
|         default_pipeline=DEFAULT_PIPELINE, |         default_pipeline: Iterable[str] = DEFAULT_PIPELINE, | ||||||
|         **cfg, |         **cfg, | ||||||
|     ) -> None: |     ) -> None: | ||||||
|         """Initialize the Scorer. |         """Initialize the Scorer. | ||||||
|  | @ -124,13 +130,13 @@ class Scorer: | ||||||
|         return scores |         return scores | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, float]: |     def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, Any]: | ||||||
|         """Returns accuracy and PRF scores for tokenization. |         """Returns accuracy and PRF scores for tokenization. | ||||||
|         * token_acc: # correct tokens / # gold tokens |         * token_acc: # correct tokens / # gold tokens | ||||||
|         * token_p/r/f: PRF for token character spans |         * token_p/r/f: PRF for token character spans | ||||||
| 
 | 
 | ||||||
|         examples (Iterable[Example]): Examples to score |         examples (Iterable[Example]): Examples to score | ||||||
|         RETURNS (Dict[str, float]): A dictionary containing the scores |         RETURNS (Dict[str, Any]): A dictionary containing the scores | ||||||
|             token_acc/p/r/f. |             token_acc/p/r/f. | ||||||
| 
 | 
 | ||||||
|         DOCS: https://nightly.spacy.io/api/scorer#score_tokenization |         DOCS: https://nightly.spacy.io/api/scorer#score_tokenization | ||||||
|  | @ -140,6 +146,8 @@ class Scorer: | ||||||
|         for example in examples: |         for example in examples: | ||||||
|             gold_doc = example.reference |             gold_doc = example.reference | ||||||
|             pred_doc = example.predicted |             pred_doc = example.predicted | ||||||
|  |             if gold_doc.has_unknown_spaces: | ||||||
|  |                 continue | ||||||
|             align = example.alignment |             align = example.alignment | ||||||
|             gold_spans = set() |             gold_spans = set() | ||||||
|             pred_spans = set() |             pred_spans = set() | ||||||
|  | @ -156,12 +164,20 @@ class Scorer: | ||||||
|                 else: |                 else: | ||||||
|                     acc_score.tp += 1 |                     acc_score.tp += 1 | ||||||
|             prf_score.score_set(pred_spans, gold_spans) |             prf_score.score_set(pred_spans, gold_spans) | ||||||
|         return { |         if len(acc_score) > 0: | ||||||
|             "token_acc": acc_score.fscore, |             return { | ||||||
|             "token_p": prf_score.precision, |                 "token_acc": acc_score.fscore, | ||||||
|             "token_r": prf_score.recall, |                 "token_p": prf_score.precision, | ||||||
|             "token_f": prf_score.fscore, |                 "token_r": prf_score.recall, | ||||||
|         } |                 "token_f": prf_score.fscore, | ||||||
|  |             } | ||||||
|  |         else: | ||||||
|  |             return { | ||||||
|  |                 "token_acc": None, | ||||||
|  |                 "token_p": None, | ||||||
|  |                 "token_r": None, | ||||||
|  |                 "token_f": None | ||||||
|  |             } | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     def score_token_attr( |     def score_token_attr( | ||||||
|  | @ -169,8 +185,9 @@ class Scorer: | ||||||
|         attr: str, |         attr: str, | ||||||
|         *, |         *, | ||||||
|         getter: Callable[[Token, str], Any] = getattr, |         getter: Callable[[Token, str], Any] = getattr, | ||||||
|  |         missing_values: Set[Any] = MISSING_VALUES, | ||||||
|         **cfg, |         **cfg, | ||||||
|     ) -> Dict[str, float]: |     ) -> Dict[str, Any]: | ||||||
|         """Returns an accuracy score for a token-level attribute. |         """Returns an accuracy score for a token-level attribute. | ||||||
| 
 | 
 | ||||||
|         examples (Iterable[Example]): Examples to score |         examples (Iterable[Example]): Examples to score | ||||||
|  | @ -178,7 +195,7 @@ class Scorer: | ||||||
|         getter (Callable[[Token, str], Any]): Defaults to getattr. If provided, |         getter (Callable[[Token, str], Any]): Defaults to getattr. If provided, | ||||||
|             getter(token, attr) should return the value of the attribute for an |             getter(token, attr) should return the value of the attribute for an | ||||||
|             individual token. |             individual token. | ||||||
|         RETURNS (Dict[str, float]): A dictionary containing the accuracy score |         RETURNS (Dict[str, Any]): A dictionary containing the accuracy score | ||||||
|             under the key attr_acc. |             under the key attr_acc. | ||||||
| 
 | 
 | ||||||
|         DOCS: https://nightly.spacy.io/api/scorer#score_token_attr |         DOCS: https://nightly.spacy.io/api/scorer#score_token_attr | ||||||
|  | @ -189,17 +206,27 @@ class Scorer: | ||||||
|             pred_doc = example.predicted |             pred_doc = example.predicted | ||||||
|             align = example.alignment |             align = example.alignment | ||||||
|             gold_tags = set() |             gold_tags = set() | ||||||
|  |             missing_indices = set() | ||||||
|             for gold_i, token in enumerate(gold_doc): |             for gold_i, token in enumerate(gold_doc): | ||||||
|                 gold_tags.add((gold_i, getter(token, attr))) |                 value = getter(token, attr) | ||||||
|  |                 if value not in missing_values: | ||||||
|  |                     gold_tags.add((gold_i, getter(token, attr))) | ||||||
|  |                 else: | ||||||
|  |                     missing_indices.add(gold_i) | ||||||
|             pred_tags = set() |             pred_tags = set() | ||||||
|             for token in pred_doc: |             for token in pred_doc: | ||||||
|                 if token.orth_.isspace(): |                 if token.orth_.isspace(): | ||||||
|                     continue |                     continue | ||||||
|                 if align.x2y.lengths[token.i] == 1: |                 if align.x2y.lengths[token.i] == 1: | ||||||
|                     gold_i = align.x2y[token.i].dataXd[0, 0] |                     gold_i = align.x2y[token.i].dataXd[0, 0] | ||||||
|                     pred_tags.add((gold_i, getter(token, attr))) |                     if gold_i not in missing_indices: | ||||||
|  |                         pred_tags.add((gold_i, getter(token, attr))) | ||||||
|             tag_score.score_set(pred_tags, gold_tags) |             tag_score.score_set(pred_tags, gold_tags) | ||||||
|         return {f"{attr}_acc": tag_score.fscore} |         score_key = f"{attr}_acc" | ||||||
|  |         if len(tag_score) == 0: | ||||||
|  |             return {score_key: None} | ||||||
|  |         else: | ||||||
|  |             return {score_key: tag_score.fscore} | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     def score_token_attr_per_feat( |     def score_token_attr_per_feat( | ||||||
|  | @ -207,8 +234,9 @@ class Scorer: | ||||||
|         attr: str, |         attr: str, | ||||||
|         *, |         *, | ||||||
|         getter: Callable[[Token, str], Any] = getattr, |         getter: Callable[[Token, str], Any] = getattr, | ||||||
|  |         missing_values: Set[Any] = MISSING_VALUES, | ||||||
|         **cfg, |         **cfg, | ||||||
|     ): |     ) -> Dict[str, Any]: | ||||||
|         """Return PRF scores per feat for a token attribute in UFEATS format. |         """Return PRF scores per feat for a token attribute in UFEATS format. | ||||||
| 
 | 
 | ||||||
|         examples (Iterable[Example]): Examples to score |         examples (Iterable[Example]): Examples to score | ||||||
|  | @ -216,7 +244,7 @@ class Scorer: | ||||||
|         getter (Callable[[Token, str], Any]): Defaults to getattr. If provided, |         getter (Callable[[Token, str], Any]): Defaults to getattr. If provided, | ||||||
|             getter(token, attr) should return the value of the attribute for an |             getter(token, attr) should return the value of the attribute for an | ||||||
|             individual token. |             individual token. | ||||||
|         RETURNS (dict): A dictionary containing the per-feat PRF scores unders |         RETURNS (dict): A dictionary containing the per-feat PRF scores under | ||||||
|             the key attr_per_feat. |             the key attr_per_feat. | ||||||
|         """ |         """ | ||||||
|         per_feat = {} |         per_feat = {} | ||||||
|  | @ -225,9 +253,11 @@ class Scorer: | ||||||
|             gold_doc = example.reference |             gold_doc = example.reference | ||||||
|             align = example.alignment |             align = example.alignment | ||||||
|             gold_per_feat = {} |             gold_per_feat = {} | ||||||
|  |             missing_indices = set() | ||||||
|             for gold_i, token in enumerate(gold_doc): |             for gold_i, token in enumerate(gold_doc): | ||||||
|                 morph = str(getter(token, attr)) |                 value = getter(token, attr) | ||||||
|                 if morph: |                 morph = gold_doc.vocab.strings[value] | ||||||
|  |                 if value not in missing_values and morph != Morphology.EMPTY_MORPH: | ||||||
|                     for feat in morph.split(Morphology.FEATURE_SEP): |                     for feat in morph.split(Morphology.FEATURE_SEP): | ||||||
|                         field, values = feat.split(Morphology.FIELD_SEP) |                         field, values = feat.split(Morphology.FIELD_SEP) | ||||||
|                         if field not in per_feat: |                         if field not in per_feat: | ||||||
|  | @ -235,27 +265,35 @@ class Scorer: | ||||||
|                         if field not in gold_per_feat: |                         if field not in gold_per_feat: | ||||||
|                             gold_per_feat[field] = set() |                             gold_per_feat[field] = set() | ||||||
|                         gold_per_feat[field].add((gold_i, feat)) |                         gold_per_feat[field].add((gold_i, feat)) | ||||||
|  |                 else: | ||||||
|  |                     missing_indices.add(gold_i) | ||||||
|             pred_per_feat = {} |             pred_per_feat = {} | ||||||
|             for token in pred_doc: |             for token in pred_doc: | ||||||
|                 if token.orth_.isspace(): |                 if token.orth_.isspace(): | ||||||
|                     continue |                     continue | ||||||
|                 if align.x2y.lengths[token.i] == 1: |                 if align.x2y.lengths[token.i] == 1: | ||||||
|                     gold_i = align.x2y[token.i].dataXd[0, 0] |                     gold_i = align.x2y[token.i].dataXd[0, 0] | ||||||
|                     morph = str(getter(token, attr)) |                     if gold_i not in missing_indices: | ||||||
|                     if morph: |                         value = getter(token, attr) | ||||||
|                         for feat in morph.split("|"): |                         morph = gold_doc.vocab.strings[value] | ||||||
|                             field, values = feat.split("=") |                         if value not in missing_values and morph != Morphology.EMPTY_MORPH: | ||||||
|                             if field not in per_feat: |                             for feat in morph.split(Morphology.FEATURE_SEP): | ||||||
|                                 per_feat[field] = PRFScore() |                                 field, values = feat.split(Morphology.FIELD_SEP) | ||||||
|                             if field not in pred_per_feat: |                                 if field not in per_feat: | ||||||
|                                 pred_per_feat[field] = set() |                                     per_feat[field] = PRFScore() | ||||||
|                             pred_per_feat[field].add((gold_i, feat)) |                                 if field not in pred_per_feat: | ||||||
|  |                                     pred_per_feat[field] = set() | ||||||
|  |                                 pred_per_feat[field].add((gold_i, feat)) | ||||||
|             for field in per_feat: |             for field in per_feat: | ||||||
|                 per_feat[field].score_set( |                 per_feat[field].score_set( | ||||||
|                     pred_per_feat.get(field, set()), gold_per_feat.get(field, set()) |                     pred_per_feat.get(field, set()), gold_per_feat.get(field, set()) | ||||||
|                 ) |                 ) | ||||||
|         result = {k: v.to_dict() for k, v in per_feat.items()} |         score_key = f"{attr}_per_feat" | ||||||
|         return {f"{attr}_per_feat": result} |         if any([len(v) for v in per_feat.values()]): | ||||||
|  |             result = {k: v.to_dict() for k, v in per_feat.items()} | ||||||
|  |             return {score_key: result} | ||||||
|  |         else: | ||||||
|  |             return {score_key: None} | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     def score_spans( |     def score_spans( | ||||||
|  | @ -263,6 +301,7 @@ class Scorer: | ||||||
|         attr: str, |         attr: str, | ||||||
|         *, |         *, | ||||||
|         getter: Callable[[Doc, str], Iterable[Span]] = getattr, |         getter: Callable[[Doc, str], Iterable[Span]] = getattr, | ||||||
|  |         has_annotation: Optional[Callable[[Doc], bool]] = None, | ||||||
|         **cfg, |         **cfg, | ||||||
|     ) -> Dict[str, Any]: |     ) -> Dict[str, Any]: | ||||||
|         """Returns PRF scores for labeled spans. |         """Returns PRF scores for labeled spans. | ||||||
|  | @ -282,18 +321,10 @@ class Scorer: | ||||||
|         for example in examples: |         for example in examples: | ||||||
|             pred_doc = example.predicted |             pred_doc = example.predicted | ||||||
|             gold_doc = example.reference |             gold_doc = example.reference | ||||||
|             # TODO |             # Option to handle docs without sents | ||||||
|             # This is a temporary hack to work around the problem that the scorer |             if has_annotation is not None: | ||||||
|             # fails if you have examples that are not fully annotated for all |                 if not has_annotation(gold_doc): | ||||||
|             # the tasks in your pipeline. For instance, you might have a corpus |                     continue | ||||||
|             # of NER annotations that does not set sentence boundaries, but the |  | ||||||
|             # pipeline includes a parser or senter, and then the score_weights |  | ||||||
|             # are used to evaluate that component. When the scorer attempts |  | ||||||
|             # to read the sentences from the gold document, it fails. |  | ||||||
|             try: |  | ||||||
|                 list(getter(gold_doc, attr)) |  | ||||||
|             except ValueError: |  | ||||||
|                 continue |  | ||||||
|             # Find all labels in gold and doc |             # Find all labels in gold and doc | ||||||
|             labels = set( |             labels = set( | ||||||
|                 [k.label_ for k in getter(gold_doc, attr)] |                 [k.label_ for k in getter(gold_doc, attr)] | ||||||
|  | @ -321,13 +352,21 @@ class Scorer: | ||||||
|                     v.score_set(pred_per_type[k], gold_per_type[k]) |                     v.score_set(pred_per_type[k], gold_per_type[k]) | ||||||
|             # Score for all labels |             # Score for all labels | ||||||
|             score.score_set(pred_spans, gold_spans) |             score.score_set(pred_spans, gold_spans) | ||||||
|         results = { |         if len(score) > 0: | ||||||
|             f"{attr}_p": score.precision, |             return { | ||||||
|             f"{attr}_r": score.recall, |                 f"{attr}_p": score.precision, | ||||||
|             f"{attr}_f": score.fscore, |                 f"{attr}_r": score.recall, | ||||||
|             f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()}, |                 f"{attr}_f": score.fscore, | ||||||
|         } |                 f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()}, | ||||||
|         return results |             } | ||||||
|  |         else: | ||||||
|  |             return { | ||||||
|  |                 f"{attr}_p": None, | ||||||
|  |                 f"{attr}_r": None, | ||||||
|  |                 f"{attr}_f": None, | ||||||
|  |                 f"{attr}_per_type": None, | ||||||
|  |             } | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     def score_cats( |     def score_cats( | ||||||
|  | @ -362,9 +401,13 @@ class Scorer: | ||||||
|             for all: |             for all: | ||||||
|                 attr_score (one of attr_micro_f / attr_macro_f / attr_macro_auc), |                 attr_score (one of attr_micro_f / attr_macro_f / attr_macro_auc), | ||||||
|                 attr_score_desc (text description of the overall score), |                 attr_score_desc (text description of the overall score), | ||||||
|  |                 attr_micro_p, | ||||||
|  |                 attr_micro_r, | ||||||
|                 attr_micro_f, |                 attr_micro_f, | ||||||
|  |                 attr_macro_p, | ||||||
|  |                 attr_macro_r, | ||||||
|                 attr_macro_f, |                 attr_macro_f, | ||||||
|                 attr_auc, |                 attr_macro_auc, | ||||||
|                 attr_f_per_type, |                 attr_f_per_type, | ||||||
|                 attr_auc_per_type |                 attr_auc_per_type | ||||||
| 
 | 
 | ||||||
|  | @ -384,9 +427,6 @@ class Scorer: | ||||||
|             pred_cats = getter(example.predicted, attr) |             pred_cats = getter(example.predicted, attr) | ||||||
|             gold_cats = getter(example.reference, attr) |             gold_cats = getter(example.reference, attr) | ||||||
| 
 | 
 | ||||||
|             # I think the AUC metric is applicable regardless of whether we're |  | ||||||
|             # doing multi-label classification? Unsure. If not, move this into |  | ||||||
|             # the elif pred_cats and gold_cats block below. |  | ||||||
|             for label in labels: |             for label in labels: | ||||||
|                 pred_score = pred_cats.get(label, 0.0) |                 pred_score = pred_cats.get(label, 0.0) | ||||||
|                 gold_score = gold_cats.get(label, 0.0) |                 gold_score = gold_cats.get(label, 0.0) | ||||||
|  | @ -431,7 +471,9 @@ class Scorer: | ||||||
|         macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats |         macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats | ||||||
|         macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats |         macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats | ||||||
|         macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_cats |         macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_cats | ||||||
|         macro_auc = sum(auc.score for auc in auc_per_type.values()) / n_cats |         # Limit macro_auc to those labels with gold annotations, | ||||||
|  |         # but still divide by all cats to avoid artificial boosting of datasets with missing labels | ||||||
|  |         macro_auc = sum(auc.score if auc.is_binary() else 0.0 for auc in auc_per_type.values()) / n_cats | ||||||
|         results = { |         results = { | ||||||
|             f"{attr}_score": None, |             f"{attr}_score": None, | ||||||
|             f"{attr}_score_desc": None, |             f"{attr}_score_desc": None, | ||||||
|  | @ -443,7 +485,7 @@ class Scorer: | ||||||
|             f"{attr}_macro_f": macro_f, |             f"{attr}_macro_f": macro_f, | ||||||
|             f"{attr}_macro_auc": macro_auc, |             f"{attr}_macro_auc": macro_auc, | ||||||
|             f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()}, |             f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()}, | ||||||
|             f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()}, |             f"{attr}_auc_per_type": {k: v.score if v.is_binary() else None for k, v in auc_per_type.items()}, | ||||||
|         } |         } | ||||||
|         if len(labels) == 2 and not multi_label and positive_label: |         if len(labels) == 2 and not multi_label and positive_label: | ||||||
|             positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"] |             positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"] | ||||||
|  | @ -534,6 +576,7 @@ class Scorer: | ||||||
|         head_attr: str = "head", |         head_attr: str = "head", | ||||||
|         head_getter: Callable[[Token, str], Token] = getattr, |         head_getter: Callable[[Token, str], Token] = getattr, | ||||||
|         ignore_labels: Iterable[str] = SimpleFrozenList(), |         ignore_labels: Iterable[str] = SimpleFrozenList(), | ||||||
|  |         missing_values: Set[Any] = MISSING_VALUES, | ||||||
|         **cfg, |         **cfg, | ||||||
|     ) -> Dict[str, Any]: |     ) -> Dict[str, Any]: | ||||||
|         """Returns the UAS, LAS, and LAS per type scores for dependency |         """Returns the UAS, LAS, and LAS per type scores for dependency | ||||||
|  | @ -558,6 +601,7 @@ class Scorer: | ||||||
|         unlabelled = PRFScore() |         unlabelled = PRFScore() | ||||||
|         labelled = PRFScore() |         labelled = PRFScore() | ||||||
|         labelled_per_dep = dict() |         labelled_per_dep = dict() | ||||||
|  |         missing_indices = set() | ||||||
|         for example in examples: |         for example in examples: | ||||||
|             gold_doc = example.reference |             gold_doc = example.reference | ||||||
|             pred_doc = example.predicted |             pred_doc = example.predicted | ||||||
|  | @ -567,13 +611,16 @@ class Scorer: | ||||||
|             for gold_i, token in enumerate(gold_doc): |             for gold_i, token in enumerate(gold_doc): | ||||||
|                 dep = getter(token, attr) |                 dep = getter(token, attr) | ||||||
|                 head = head_getter(token, head_attr) |                 head = head_getter(token, head_attr) | ||||||
|                 if dep not in ignore_labels: |                 if dep not in missing_values: | ||||||
|                     gold_deps.add((gold_i, head.i, dep)) |                     if dep not in ignore_labels: | ||||||
|                     if dep not in labelled_per_dep: |                         gold_deps.add((gold_i, head.i, dep)) | ||||||
|                         labelled_per_dep[dep] = PRFScore() |                         if dep not in labelled_per_dep: | ||||||
|                     if dep not in gold_deps_per_dep: |                             labelled_per_dep[dep] = PRFScore() | ||||||
|                         gold_deps_per_dep[dep] = set() |                         if dep not in gold_deps_per_dep: | ||||||
|                     gold_deps_per_dep[dep].add((gold_i, head.i, dep)) |                             gold_deps_per_dep[dep] = set() | ||||||
|  |                         gold_deps_per_dep[dep].add((gold_i, head.i, dep)) | ||||||
|  |                 else: | ||||||
|  |                     missing_indices.add(gold_i) | ||||||
|             pred_deps = set() |             pred_deps = set() | ||||||
|             pred_deps_per_dep = {} |             pred_deps_per_dep = {} | ||||||
|             for token in pred_doc: |             for token in pred_doc: | ||||||
|  | @ -583,25 +630,26 @@ class Scorer: | ||||||
|                     gold_i = None |                     gold_i = None | ||||||
|                 else: |                 else: | ||||||
|                     gold_i = align.x2y[token.i].dataXd[0, 0] |                     gold_i = align.x2y[token.i].dataXd[0, 0] | ||||||
|                 dep = getter(token, attr) |                 if gold_i not in missing_indices: | ||||||
|                 head = head_getter(token, head_attr) |                     dep = getter(token, attr) | ||||||
|                 if dep not in ignore_labels and token.orth_.strip(): |                     head = head_getter(token, head_attr) | ||||||
|                     if align.x2y.lengths[head.i] == 1: |                     if dep not in ignore_labels and token.orth_.strip(): | ||||||
|                         gold_head = align.x2y[head.i].dataXd[0, 0] |                         if align.x2y.lengths[head.i] == 1: | ||||||
|                     else: |                             gold_head = align.x2y[head.i].dataXd[0, 0] | ||||||
|                         gold_head = None |                         else: | ||||||
|                     # None is indistinct, so we can't just add it to the set |                             gold_head = None | ||||||
|                     # Multiple (None, None) deps are possible |                         # None is indistinct, so we can't just add it to the set | ||||||
|                     if gold_i is None or gold_head is None: |                         # Multiple (None, None) deps are possible | ||||||
|                         unlabelled.fp += 1 |                         if gold_i is None or gold_head is None: | ||||||
|                         labelled.fp += 1 |                             unlabelled.fp += 1 | ||||||
|                     else: |                             labelled.fp += 1 | ||||||
|                         pred_deps.add((gold_i, gold_head, dep)) |                         else: | ||||||
|                         if dep not in labelled_per_dep: |                             pred_deps.add((gold_i, gold_head, dep)) | ||||||
|                             labelled_per_dep[dep] = PRFScore() |                             if dep not in labelled_per_dep: | ||||||
|                         if dep not in pred_deps_per_dep: |                                 labelled_per_dep[dep] = PRFScore() | ||||||
|                             pred_deps_per_dep[dep] = set() |                             if dep not in pred_deps_per_dep: | ||||||
|                         pred_deps_per_dep[dep].add((gold_i, gold_head, dep)) |                                 pred_deps_per_dep[dep] = set() | ||||||
|  |                             pred_deps_per_dep[dep].add((gold_i, gold_head, dep)) | ||||||
|             labelled.score_set(pred_deps, gold_deps) |             labelled.score_set(pred_deps, gold_deps) | ||||||
|             for dep in labelled_per_dep: |             for dep in labelled_per_dep: | ||||||
|                 labelled_per_dep[dep].score_set( |                 labelled_per_dep[dep].score_set( | ||||||
|  | @ -610,29 +658,34 @@ class Scorer: | ||||||
|             unlabelled.score_set( |             unlabelled.score_set( | ||||||
|                 set(item[:2] for item in pred_deps), set(item[:2] for item in gold_deps) |                 set(item[:2] for item in pred_deps), set(item[:2] for item in gold_deps) | ||||||
|             ) |             ) | ||||||
|         return { |         if len(unlabelled) > 0: | ||||||
|             f"{attr}_uas": unlabelled.fscore, |             return { | ||||||
|             f"{attr}_las": labelled.fscore, |                 f"{attr}_uas": unlabelled.fscore, | ||||||
|             f"{attr}_las_per_type": { |                 f"{attr}_las": labelled.fscore, | ||||||
|                 k: v.to_dict() for k, v in labelled_per_dep.items() |                 f"{attr}_las_per_type": { | ||||||
|             }, |                     k: v.to_dict() for k, v in labelled_per_dep.items() | ||||||
|         } |                 }, | ||||||
|  |             } | ||||||
|  |         else: | ||||||
|  |             return { | ||||||
|  |                 f"{attr}_uas": None, | ||||||
|  |                 f"{attr}_las": None, | ||||||
|  |                 f"{attr}_las_per_type": None, | ||||||
|  |             } | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]: | def get_ner_prf(examples: Iterable[Example]) -> Dict[str, Any]: | ||||||
|     """Compute per-entity PRFScore objects for a sequence of examples. The |     """Compute micro-PRF and per-entity PRF scores for a sequence of examples. | ||||||
|     results are returned as a dictionary keyed by the entity type. You can |  | ||||||
|     add the PRFScore objects to get micro-averaged total. |  | ||||||
|     """ |     """ | ||||||
|     scores = defaultdict(PRFScore) |     score_per_type = defaultdict(PRFScore) | ||||||
|     for eg in examples: |     for eg in examples: | ||||||
|         if not eg.y.has_annotation("ENT_IOB"): |         if not eg.y.has_annotation("ENT_IOB"): | ||||||
|             continue |             continue | ||||||
|         golds = {(e.label_, e.start, e.end) for e in eg.y.ents} |         golds = {(e.label_, e.start, e.end) for e in eg.y.ents} | ||||||
|         align_x2y = eg.alignment.x2y |         align_x2y = eg.alignment.x2y | ||||||
|         for pred_ent in eg.x.ents: |         for pred_ent in eg.x.ents: | ||||||
|             if pred_ent.label_ not in scores: |             if pred_ent.label_ not in score_per_type: | ||||||
|                 scores[pred_ent.label_] = PRFScore() |                 score_per_type[pred_ent.label_] = PRFScore() | ||||||
|             indices = align_x2y[pred_ent.start : pred_ent.end].dataXd.ravel() |             indices = align_x2y[pred_ent.start : pred_ent.end].dataXd.ravel() | ||||||
|             if len(indices): |             if len(indices): | ||||||
|                 g_span = eg.y[indices[0] : indices[-1] + 1] |                 g_span = eg.y[indices[0] : indices[-1] + 1] | ||||||
|  | @ -642,13 +695,29 @@ def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]: | ||||||
|                 if all(token.ent_iob != 0 for token in g_span): |                 if all(token.ent_iob != 0 for token in g_span): | ||||||
|                     key = (pred_ent.label_, indices[0], indices[-1] + 1) |                     key = (pred_ent.label_, indices[0], indices[-1] + 1) | ||||||
|                     if key in golds: |                     if key in golds: | ||||||
|                         scores[pred_ent.label_].tp += 1 |                         score_per_type[pred_ent.label_].tp += 1 | ||||||
|                         golds.remove(key) |                         golds.remove(key) | ||||||
|                     else: |                     else: | ||||||
|                         scores[pred_ent.label_].fp += 1 |                         score_per_type[pred_ent.label_].fp += 1 | ||||||
|         for label, start, end in golds: |         for label, start, end in golds: | ||||||
|             scores[label].fn += 1 |             score_per_type[label].fn += 1 | ||||||
|     return scores |     totals = PRFScore() | ||||||
|  |     for prf in score_per_type.values(): | ||||||
|  |         totals += prf | ||||||
|  |     if len(totals) > 0: | ||||||
|  |         return { | ||||||
|  |             "ents_p": totals.precision, | ||||||
|  |             "ents_r": totals.recall, | ||||||
|  |             "ents_f": totals.fscore, | ||||||
|  |             "ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()}, | ||||||
|  |         } | ||||||
|  |     else: | ||||||
|  |         return { | ||||||
|  |             "ents_p": None, | ||||||
|  |             "ents_r": None, | ||||||
|  |             "ents_f": None, | ||||||
|  |             "ents_per_type": None, | ||||||
|  |         } | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| ############################################################################# | ############################################################################# | ||||||
|  | @ -726,7 +795,7 @@ def _roc_auc_score(y_true, y_score): | ||||||
|             <https://www.ncbi.nlm.nih.gov/pubmed/2668680>`_ |             <https://www.ncbi.nlm.nih.gov/pubmed/2668680>`_ | ||||||
|     """ |     """ | ||||||
|     if len(np.unique(y_true)) != 2: |     if len(np.unique(y_true)) != 2: | ||||||
|         raise ValueError(Errors.E165) |         raise ValueError(Errors.E165.format(label=np.unique(y_true))) | ||||||
|     fpr, tpr, _ = _roc_curve(y_true, y_score) |     fpr, tpr, _ = _roc_curve(y_true, y_score) | ||||||
|     return _auc(fpr, tpr) |     return _auc(fpr, tpr) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -218,11 +218,16 @@ def test_dependency_matcher_callback(en_vocab, doc): | ||||||
|     pattern = [ |     pattern = [ | ||||||
|         {"RIGHT_ID": "quick", "RIGHT_ATTRS": {"ORTH": "quick"}}, |         {"RIGHT_ID": "quick", "RIGHT_ATTRS": {"ORTH": "quick"}}, | ||||||
|     ] |     ] | ||||||
|  |     nomatch_pattern = [ | ||||||
|  |         {"RIGHT_ID": "quick", "RIGHT_ATTRS": {"ORTH": "NOMATCH"}}, | ||||||
|  |     ] | ||||||
| 
 | 
 | ||||||
|     matcher = DependencyMatcher(en_vocab) |     matcher = DependencyMatcher(en_vocab) | ||||||
|     mock = Mock() |     mock = Mock() | ||||||
|     matcher.add("pattern", [pattern], on_match=mock) |     matcher.add("pattern", [pattern], on_match=mock) | ||||||
|  |     matcher.add("nomatch_pattern", [nomatch_pattern], on_match=mock) | ||||||
|     matches = matcher(doc) |     matches = matcher(doc) | ||||||
|  |     assert len(matches) == 1 | ||||||
|     mock.assert_called_once_with(matcher, doc, 0, matches) |     mock.assert_called_once_with(matcher, doc, 0, matches) | ||||||
| 
 | 
 | ||||||
|     # check that matches with and without callback are the same (#4590) |     # check that matches with and without callback are the same (#4590) | ||||||
|  |  | ||||||
|  | @ -160,8 +160,8 @@ def test_attributeruler_score(nlp, pattern_dicts): | ||||||
|     scores = nlp.evaluate(dev_examples) |     scores = nlp.evaluate(dev_examples) | ||||||
|     # "cat" is the only correct lemma |     # "cat" is the only correct lemma | ||||||
|     assert scores["lemma_acc"] == pytest.approx(0.2) |     assert scores["lemma_acc"] == pytest.approx(0.2) | ||||||
|     # the empty morphs are correct |     # no morphs are set | ||||||
|     assert scores["morph_acc"] == pytest.approx(0.6) |     assert scores["morph_acc"] == None | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_attributeruler_rule_order(nlp): | def test_attributeruler_rule_order(nlp): | ||||||
|  |  | ||||||
|  | @ -2,6 +2,7 @@ import pytest | ||||||
| from spacy.language import Language | from spacy.language import Language | ||||||
| from spacy.lang.en import English | from spacy.lang.en import English | ||||||
| from spacy.lang.de import German | from spacy.lang.de import German | ||||||
|  | from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL | ||||||
| from spacy.tokens import Doc | from spacy.tokens import Doc | ||||||
| from spacy.util import registry, SimpleFrozenDict, combine_score_weights | from spacy.util import registry, SimpleFrozenDict, combine_score_weights | ||||||
| from thinc.api import Model, Linear, ConfigValidationError | from thinc.api import Model, Linear, ConfigValidationError | ||||||
|  | @ -156,15 +157,10 @@ def test_pipe_class_component_model(): | ||||||
|     name = "test_class_component_model" |     name = "test_class_component_model" | ||||||
|     default_config = { |     default_config = { | ||||||
|         "model": { |         "model": { | ||||||
|             "@architectures": "spacy.TextCatEnsemble.v1", |             "@architectures": "spacy.TextCatEnsemble.v2", | ||||||
|             "exclusive_classes": False, |             "tok2vec": DEFAULT_TOK2VEC_MODEL, | ||||||
|             "pretrained_vectors": None, |             "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, | ||||||
|             "width": 64, |                       "no_output_layer": False}, | ||||||
|             "embed_size": 2000, |  | ||||||
|             "window_size": 1, |  | ||||||
|             "conv_depth": 2, |  | ||||||
|             "ngram_size": 1, |  | ||||||
|             "dropout": None, |  | ||||||
|         }, |         }, | ||||||
|         "value1": 10, |         "value1": 10, | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  | @ -140,7 +140,7 @@ def test_overfitting_IO(): | ||||||
|     nlp = English() |     nlp = English() | ||||||
|     nlp.config["initialize"]["components"]["textcat"] = {"positive_label": "POSITIVE"} |     nlp.config["initialize"]["components"]["textcat"] = {"positive_label": "POSITIVE"} | ||||||
|     # Set exclusive labels |     # Set exclusive labels | ||||||
|     config = {"model": {"exclusive_classes": True}} |     config = {"model": {"linear_model": {"exclusive_classes": True}}} | ||||||
|     textcat = nlp.add_pipe("textcat", config=config) |     textcat = nlp.add_pipe("textcat", config=config) | ||||||
|     train_examples = [] |     train_examples = [] | ||||||
|     for text, annotations in TRAIN_DATA: |     for text, annotations in TRAIN_DATA: | ||||||
|  | @ -192,9 +192,8 @@ def test_overfitting_IO(): | ||||||
|         {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}, |         {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}, | ||||||
|         {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}, |         {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}, | ||||||
|         {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}, |         {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}, | ||||||
|         {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": False, "ngram_size": 1, "pretrained_vectors": False, "width": 64, "conv_depth": 2, "embed_size": 2000, "window_size": 2, "dropout": None}, |         {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}, | ||||||
|         {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": True, "ngram_size": 5, "pretrained_vectors": False, "width": 128, "conv_depth": 2, "embed_size": 2000, "window_size": 1, "dropout": None}, |         {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}, | ||||||
|         {"@architectures": "spacy.TextCatEnsemble.v1", "exclusive_classes": True, "ngram_size": 2, "pretrained_vectors": False, "width": 32, "conv_depth": 3, "embed_size": 500, "window_size": 3, "dropout": None}, |  | ||||||
|         {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}, |         {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}, | ||||||
|         {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}, |         {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}, | ||||||
|     ], |     ], | ||||||
|  |  | ||||||
|  | @ -4,32 +4,23 @@ from thinc.api import fix_random_seed, Adam, set_dropout_rate | ||||||
| from numpy.testing import assert_array_equal | from numpy.testing import assert_array_equal | ||||||
| import numpy | import numpy | ||||||
| from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder | from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder | ||||||
| from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier | from spacy.ml.models import build_bow_text_classifier, build_simple_cnn_text_classifier | ||||||
| from spacy.ml.staticvectors import StaticVectors | from spacy.ml.staticvectors import StaticVectors | ||||||
| from spacy.lang.en import English | from spacy.lang.en import English | ||||||
| from spacy.lang.en.examples import sentences as EN_SENTENCES | from spacy.lang.en.examples import sentences as EN_SENTENCES | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def get_textcat_kwargs(): | def get_textcat_bow_kwargs(): | ||||||
|     return { |     return { | ||||||
|         "width": 64, |         "exclusive_classes": True, | ||||||
|         "embed_size": 2000, |  | ||||||
|         "pretrained_vectors": None, |  | ||||||
|         "exclusive_classes": False, |  | ||||||
|         "ngram_size": 1, |         "ngram_size": 1, | ||||||
|         "window_size": 1, |         "no_output_layer": False, | ||||||
|         "conv_depth": 2, |         "nO": 34, | ||||||
|         "dropout": None, |  | ||||||
|         "nO": 7, |  | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def get_textcat_cnn_kwargs(): | def get_textcat_cnn_kwargs(): | ||||||
|     return { |     return {"tok2vec": test_tok2vec(), "exclusive_classes": False, "nO": 13} | ||||||
|         "tok2vec": test_tok2vec(), |  | ||||||
|         "exclusive_classes": False, |  | ||||||
|         "nO": 13, |  | ||||||
|     } |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def get_all_params(model): | def get_all_params(model): | ||||||
|  | @ -105,7 +96,7 @@ def test_multi_hash_embed(): | ||||||
|     "seed,model_func,kwargs", |     "seed,model_func,kwargs", | ||||||
|     [ |     [ | ||||||
|         (0, build_Tok2Vec_model, get_tok2vec_kwargs()), |         (0, build_Tok2Vec_model, get_tok2vec_kwargs()), | ||||||
|         (0, build_text_classifier, get_textcat_kwargs()), |         (0, build_bow_text_classifier, get_textcat_bow_kwargs()), | ||||||
|         (0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs()), |         (0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs()), | ||||||
|     ], |     ], | ||||||
| ) | ) | ||||||
|  | @ -125,7 +116,7 @@ def test_models_initialize_consistently(seed, model_func, kwargs): | ||||||
|     "seed,model_func,kwargs,get_X", |     "seed,model_func,kwargs,get_X", | ||||||
|     [ |     [ | ||||||
|         (0, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs), |         (0, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs), | ||||||
|         (0, build_text_classifier, get_textcat_kwargs(), get_docs), |         (0, build_bow_text_classifier, get_textcat_bow_kwargs(), get_docs), | ||||||
|         (0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs), |         (0, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs), | ||||||
|     ], |     ], | ||||||
| ) | ) | ||||||
|  | @ -160,7 +151,7 @@ def test_models_predict_consistently(seed, model_func, kwargs, get_X): | ||||||
|     "seed,dropout,model_func,kwargs,get_X", |     "seed,dropout,model_func,kwargs,get_X", | ||||||
|     [ |     [ | ||||||
|         (0, 0.2, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs), |         (0, 0.2, build_Tok2Vec_model, get_tok2vec_kwargs(), get_docs), | ||||||
|         (0, 0.2, build_text_classifier, get_textcat_kwargs(), get_docs), |         (0, 0.2, build_bow_text_classifier, get_textcat_bow_kwargs(), get_docs), | ||||||
|         (0, 0.2, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs), |         (0, 0.2, build_simple_cnn_text_classifier, get_textcat_cnn_kwargs(), get_docs), | ||||||
|     ], |     ], | ||||||
| ) | ) | ||||||
|  |  | ||||||
|  | @ -277,6 +277,62 @@ def test_tag_score(tagged_doc): | ||||||
|     assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272) |     assert results["morph_per_feat"]["Number"]["f"] == approx(0.72727272) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | def test_partial_annotation(en_tokenizer): | ||||||
|  |     pred_doc = en_tokenizer("a b c d e") | ||||||
|  |     pred_doc[0].tag_ = "A" | ||||||
|  |     pred_doc[0].pos_ = "X" | ||||||
|  |     pred_doc[0].set_morph("Feat=Val") | ||||||
|  |     pred_doc[0].dep_ = "dep" | ||||||
|  | 
 | ||||||
|  |     # unannotated reference | ||||||
|  |     ref_doc = en_tokenizer("a b c d e") | ||||||
|  |     ref_doc.has_unknown_spaces = True | ||||||
|  |     example = Example(pred_doc, ref_doc) | ||||||
|  |     scorer = Scorer() | ||||||
|  |     scores = scorer.score([example]) | ||||||
|  |     for key in scores: | ||||||
|  |         # cats doesn't have an unset state | ||||||
|  |         if key.startswith("cats"): | ||||||
|  |             continue | ||||||
|  |         assert scores[key] == None | ||||||
|  | 
 | ||||||
|  |     # partially annotated reference, not overlapping with predicted annotation | ||||||
|  |     ref_doc = en_tokenizer("a b c d e") | ||||||
|  |     ref_doc.has_unknown_spaces = True | ||||||
|  |     ref_doc[1].tag_ = "A" | ||||||
|  |     ref_doc[1].pos_ = "X" | ||||||
|  |     ref_doc[1].set_morph("Feat=Val") | ||||||
|  |     ref_doc[1].dep_ = "dep" | ||||||
|  |     example = Example(pred_doc, ref_doc) | ||||||
|  |     scorer = Scorer() | ||||||
|  |     scores = scorer.score([example]) | ||||||
|  |     assert scores["token_acc"] == None | ||||||
|  |     assert scores["tag_acc"] == 0.0 | ||||||
|  |     assert scores["pos_acc"] == 0.0 | ||||||
|  |     assert scores["morph_acc"] == 0.0 | ||||||
|  |     assert scores["dep_uas"] == 1.0 | ||||||
|  |     assert scores["dep_las"] == 0.0 | ||||||
|  |     assert scores["sents_f"] == None | ||||||
|  | 
 | ||||||
|  |     # partially annotated reference, overlapping with predicted annotation | ||||||
|  |     ref_doc = en_tokenizer("a b c d e") | ||||||
|  |     ref_doc.has_unknown_spaces = True | ||||||
|  |     ref_doc[0].tag_ = "A" | ||||||
|  |     ref_doc[0].pos_ = "X" | ||||||
|  |     ref_doc[1].set_morph("Feat=Val") | ||||||
|  |     ref_doc[1].dep_ = "dep" | ||||||
|  |     example = Example(pred_doc, ref_doc) | ||||||
|  |     scorer = Scorer() | ||||||
|  |     scores = scorer.score([example]) | ||||||
|  |     assert scores["token_acc"] == None | ||||||
|  |     assert scores["tag_acc"] == 1.0 | ||||||
|  |     assert scores["pos_acc"] == 1.0 | ||||||
|  |     assert scores["morph_acc"] == 0.0 | ||||||
|  |     assert scores["dep_uas"] == 1.0 | ||||||
|  |     assert scores["dep_las"] == 0.0 | ||||||
|  |     assert scores["sents_f"] == None | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| def test_roc_auc_score(): | def test_roc_auc_score(): | ||||||
|     # Binary classification, toy tests from scikit-learn test suite |     # Binary classification, toy tests from scikit-learn test suite | ||||||
|     y_true = [0, 1] |     y_true = [0, 1] | ||||||
|  | @ -334,7 +390,8 @@ def test_roc_auc_score(): | ||||||
|     score = ROCAUCScore() |     score = ROCAUCScore() | ||||||
|     score.score_set(0.25, 0) |     score.score_set(0.25, 0) | ||||||
|     score.score_set(0.75, 0) |     score.score_set(0.75, 0) | ||||||
|     assert score.score == -float("inf") |     with pytest.raises(ValueError): | ||||||
|  |         s = score.score | ||||||
| 
 | 
 | ||||||
|     y_true = [1, 1] |     y_true = [1, 1] | ||||||
|     y_score = [0.25, 0.75] |     y_score = [0.25, 0.75] | ||||||
|  | @ -344,4 +401,5 @@ def test_roc_auc_score(): | ||||||
|     score = ROCAUCScore() |     score = ROCAUCScore() | ||||||
|     score.score_set(0.25, 1) |     score.score_set(0.25, 1) | ||||||
|     score.score_set(0.75, 1) |     score.score_set(0.75, 1) | ||||||
|     assert score.score == -float("inf") |     with pytest.raises(ValueError): | ||||||
|  |         s = score.score | ||||||
|  |  | ||||||
|  | @ -51,7 +51,7 @@ def test_readers(): | ||||||
|     for example in train_corpus(nlp): |     for example in train_corpus(nlp): | ||||||
|         nlp.update([example], sgd=optimizer) |         nlp.update([example], sgd=optimizer) | ||||||
|     scores = nlp.evaluate(list(dev_corpus(nlp))) |     scores = nlp.evaluate(list(dev_corpus(nlp))) | ||||||
|     assert scores["cats_score"] |     assert scores["cats_score"] == 0.0 | ||||||
|     # ensure the pipeline runs |     # ensure the pipeline runs | ||||||
|     doc = nlp("Quick test") |     doc = nlp("Quick test") | ||||||
|     assert doc.cats |     assert doc.cats | ||||||
|  |  | ||||||
|  | @ -2,6 +2,7 @@ import numpy | ||||||
| from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, Alignment | from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, Alignment | ||||||
| from spacy.training import biluo_tags_to_spans, iob_to_biluo | from spacy.training import biluo_tags_to_spans, iob_to_biluo | ||||||
| from spacy.training import Corpus, docs_to_json, Example | from spacy.training import Corpus, docs_to_json, Example | ||||||
|  | from spacy.training.align import get_alignments | ||||||
| from spacy.training.converters import json_to_docs | from spacy.training.converters import json_to_docs | ||||||
| from spacy.lang.en import English | from spacy.lang.en import English | ||||||
| from spacy.tokens import Doc, DocBin | from spacy.tokens import Doc, DocBin | ||||||
|  | @ -492,36 +493,35 @@ def test_roundtrip_docs_to_docbin(doc): | ||||||
|     assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"] |     assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.mark.skip("Outdated") |  | ||||||
| @pytest.mark.parametrize( | @pytest.mark.parametrize( | ||||||
|     "tokens_a,tokens_b,expected", |     "tokens_a,tokens_b,expected", | ||||||
|     [ |     [ | ||||||
|         (["a", "b", "c"], ["ab", "c"], (3, [-1, -1, 1], [-1, 2], {0: 0, 1: 0}, {})), |         (["a", "b", "c"], ["ab", "c"], ([[0], [0], [1]], [[0, 1], [2]])), | ||||||
|         ( |         ( | ||||||
|             ["a", "b", '"', "c"], |             ["a", "b", '"', "c"], | ||||||
|             ['ab"', "c"], |             ['ab"', "c"], | ||||||
|             (4, [-1, -1, -1, 1], [-1, 3], {0: 0, 1: 0, 2: 0}, {}), |             ([[0], [0], [0], [1]], [[0, 1, 2], [3]]), | ||||||
|         ), |         ), | ||||||
|         (["a", "bc"], ["ab", "c"], (4, [-1, -1], [-1, -1], {0: 0}, {1: 1})), |         (["a", "bc"], ["ab", "c"], ([[0], [0, 1]], [[0, 1], [1]])), | ||||||
|         ( |         ( | ||||||
|             ["ab", "c", "d"], |             ["ab", "c", "d"], | ||||||
|             ["a", "b", "cd"], |             ["a", "b", "cd"], | ||||||
|             (6, [-1, -1, -1], [-1, -1, -1], {1: 2, 2: 2}, {0: 0, 1: 0}), |             ([[0, 1], [2], [2]], [[0], [0], [1, 2]]), | ||||||
|         ), |         ), | ||||||
|         ( |         ( | ||||||
|             ["a", "b", "cd"], |             ["a", "b", "cd"], | ||||||
|             ["a", "b", "c", "d"], |             ["a", "b", "c", "d"], | ||||||
|             (3, [0, 1, -1], [0, 1, -1, -1], {}, {2: 2, 3: 2}), |             ([[0], [1], [2, 3]], [[0], [1], [2], [2]]), | ||||||
|         ), |         ), | ||||||
|         ([" ", "a"], ["a"], (1, [-1, 0], [1], {}, {})), |         ([" ", "a"], ["a"], ([[], [0]], [[1]])), | ||||||
|     ], |     ], | ||||||
| ) | ) | ||||||
| def test_align(tokens_a, tokens_b, expected):  # noqa | def test_align(tokens_a, tokens_b, expected):  # noqa | ||||||
|     cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_a, tokens_b)  # noqa |     a2b, b2a = get_alignments(tokens_a, tokens_b) | ||||||
|     assert (cost, list(a2b), list(b2a), a2b_multi, b2a_multi) == expected  # noqa |     assert (a2b, b2a) == expected  # noqa | ||||||
|     # check symmetry |     # check symmetry | ||||||
|     cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_b, tokens_a)  # noqa |     a2b, b2a = get_alignments(tokens_b, tokens_a)  # noqa | ||||||
|     assert (cost, list(b2a), list(a2b), b2a_multi, a2b_multi) == expected  # noqa |     assert (b2a, a2b) == expected  # noqa | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_goldparse_startswith_space(en_tokenizer): | def test_goldparse_startswith_space(en_tokenizer): | ||||||
|  | @ -539,6 +539,21 @@ def test_goldparse_startswith_space(en_tokenizer): | ||||||
|     assert example.get_aligned("DEP", as_string=True) == [None, "ROOT"] |     assert example.get_aligned("DEP", as_string=True) == [None, "ROOT"] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | def test_goldparse_endswith_space(en_tokenizer): | ||||||
|  |     text = "a\n" | ||||||
|  |     doc = en_tokenizer(text) | ||||||
|  |     gold_words = ["a"] | ||||||
|  |     entities = ["U-DATE"] | ||||||
|  |     deps = ["ROOT"] | ||||||
|  |     heads = [0] | ||||||
|  |     example = Example.from_dict( | ||||||
|  |         doc, {"words": gold_words, "entities": entities, "deps": deps, "heads": heads} | ||||||
|  |     ) | ||||||
|  |     ner_tags = example.get_aligned_ner() | ||||||
|  |     assert ner_tags == ["U-DATE", "O"] | ||||||
|  |     assert example.get_aligned("DEP", as_string=True) == ["ROOT", None] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| def test_gold_constructor(): | def test_gold_constructor(): | ||||||
|     """Test that the Example constructor works fine""" |     """Test that the Example constructor works fine""" | ||||||
|     nlp = English() |     nlp = English() | ||||||
|  | @ -676,6 +691,87 @@ def test_alignment_different_texts(): | ||||||
|         Alignment.from_strings(other_tokens, spacy_tokens) |         Alignment.from_strings(other_tokens, spacy_tokens) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
def test_alignment_spaces(en_vocab):
    """Check Alignment.from_strings on token lists that differ in whitespace.

    Every case is a tuple of
    (other_tokens, spacy_tokens, x2y lengths, x2y data, y2x lengths, y2x data).
    An expected value of None means that this property is not checked; a case
    with only None values just has to align without raising.
    """
    other = ["i listened to", "obama", "'", "s", "podcasts", "."]
    spacy = ["i", "listened", "to", "obama", "'s", "podcasts."]
    cases = [
        # single leading whitespace
        (
            [" "] + other,
            spacy,
            [0, 3, 1, 1, 1, 1, 1],
            [0, 1, 2, 3, 4, 4, 5, 5],
            [1, 1, 1, 1, 2, 2],
            [1, 1, 1, 2, 3, 4, 5, 6],
        ),
        # multiple leading whitespace tokens
        (
            [" ", " "] + other,
            spacy,
            [0, 0, 3, 1, 1, 1, 1, 1],
            [0, 1, 2, 3, 4, 4, 5, 5],
            [1, 1, 1, 1, 2, 2],
            [2, 2, 2, 3, 4, 5, 6, 7],
        ),
        # both with leading whitespace, not identical
        (
            [" ", " "] + other,
            [" "] + spacy,
            [1, 0, 3, 1, 1, 1, 1, 1],
            [0, 1, 2, 3, 4, 5, 5, 6, 6],
            [1, 1, 1, 1, 1, 2, 2],
            [0, 2, 2, 2, 3, 4, 5, 6, 7],
        ),
        # same leading whitespace, different tokenization
        (
            [" ", " "] + other,
            ["  "] + spacy,
            [1, 1, 3, 1, 1, 1, 1, 1],
            [0, 0, 1, 2, 3, 4, 5, 5, 6, 6],
            [2, 1, 1, 1, 1, 2, 2],
            [0, 1, 2, 2, 2, 3, 4, 5, 6, 7],
        ),
        # only one with trailing whitespace
        (
            other + [" "],
            spacy,
            [3, 1, 1, 1, 1, 1, 0],
            [0, 1, 2, 3, 4, 4, 5, 5],
            [1, 1, 1, 1, 2, 2],
            [0, 0, 0, 1, 2, 3, 4, 5],
        ),
        # different trailing whitespace
        (
            other + [" ", " "],
            spacy + [" "],
            [3, 1, 1, 1, 1, 1, 1, 0],
            [0, 1, 2, 3, 4, 4, 5, 5, 6],
            [1, 1, 1, 1, 2, 2, 1],
            [0, 0, 0, 1, 2, 3, 4, 5, 6],
        ),
        # same trailing whitespace, different tokenization
        (
            other + [" ", " "],
            spacy + ["  "],
            [3, 1, 1, 1, 1, 1, 1, 1],
            [0, 1, 2, 3, 4, 4, 5, 5, 6, 6],
            [1, 1, 1, 1, 2, 2, 2],
            [0, 0, 0, 1, 2, 3, 4, 5, 6, 7],
        ),
        # differing whitespace is allowed
        (
            ["a", " \n ", "b", "c"],
            ["a", "b", " ", "c"],
            None,
            [0, 1, 3],
            None,
            [0, 2, 3],
        ),
        # other differences in whitespace are allowed
        ([" ", "a"], ["  ", "a", " "], None, None, None, None),
        (["a", " "], ["a", "  "], None, None, None, None),
    ]
    for other_tokens, spacy_tokens, x2y_lens, x2y_data, y2x_lens, y2x_data in cases:
        align = Alignment.from_strings(other_tokens, spacy_tokens)
        if x2y_lens is not None:
            assert list(align.x2y.lengths) == x2y_lens
        if x2y_data is not None:
            assert list(align.x2y.dataXd) == x2y_data
        if y2x_lens is not None:
            assert list(align.y2x.lengths) == y2x_lens
        if y2x_data is not None:
            assert list(align.y2x.dataXd) == y2x_data
|  | 
 | ||||||
|  | 
 | ||||||
| def test_retokenized_docs(doc): | def test_retokenized_docs(doc): | ||||||
|     a = doc.to_array(["TAG"]) |     a = doc.to_array(["TAG"]) | ||||||
|     doc1 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a) |     doc1 = Doc(doc.vocab, words=[t.text for t in doc]).from_array(["TAG"], a) | ||||||
|  |  | ||||||
|  | @ -399,14 +399,13 @@ cdef class Doc: | ||||||
|             return True |             return True | ||||||
|         cdef int i |         cdef int i | ||||||
|         cdef int range_start = 0 |         cdef int range_start = 0 | ||||||
|  |         if attr == "IS_SENT_START" or attr == self.vocab.strings["IS_SENT_START"]: | ||||||
|  |             attr = SENT_START | ||||||
|         attr = intify_attr(attr) |         attr = intify_attr(attr) | ||||||
|         # adjust attributes |         # adjust attributes | ||||||
|         if attr == HEAD: |         if attr == HEAD: | ||||||
|             # HEAD does not have an unset state, so rely on DEP |             # HEAD does not have an unset state, so rely on DEP | ||||||
|             attr = DEP |             attr = DEP | ||||||
|         elif attr == self.vocab.strings["IS_SENT_START"]: |  | ||||||
|             # as in Matcher, allow IS_SENT_START as an alias of SENT_START |  | ||||||
|             attr = SENT_START |  | ||||||
|         # special cases for sentence boundaries |         # special cases for sentence boundaries | ||||||
|         if attr == SENT_START: |         if attr == SENT_START: | ||||||
|             if "sents" in self.user_hooks: |             if "sents" in self.user_hooks: | ||||||
|  |  | ||||||
|  | @ -1,6 +1,6 @@ | ||||||
| from .corpus import Corpus  # noqa: F401 | from .corpus import Corpus  # noqa: F401 | ||||||
| from .example import Example, validate_examples, validate_get_examples  # noqa: F401 | from .example import Example, validate_examples, validate_get_examples  # noqa: F401 | ||||||
| from .align import Alignment  # noqa: F401 | from .alignment import Alignment  # noqa: F401 | ||||||
| from .augment import dont_augment, orth_variants_augmenter  # noqa: F401 | from .augment import dont_augment, orth_variants_augmenter  # noqa: F401 | ||||||
| from .iob_utils import iob_to_biluo, biluo_to_iob  # noqa: F401 | from .iob_utils import iob_to_biluo, biluo_to_iob  # noqa: F401 | ||||||
| from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets  # noqa: F401 | from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets  # noqa: F401 | ||||||
|  |  | ||||||
							
								
								
									
										66
									
								
								spacy/training/align.pyx
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										66
									
								
								spacy/training/align.pyx
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,66 @@ | ||||||
|  | from typing import List, Tuple | ||||||
|  | from itertools import chain | ||||||
|  | import re | ||||||
|  | 
 | ||||||
|  | from ..errors import Errors | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
def get_alignments(A: List[str], B: List[str]) -> Tuple[List[List[int]], List[List[int]]]:
    """Align two tokenizations of the same text on the character level.

    The two token lists may differ in capitalization and in whitespace, but
    must otherwise spell out the same text.

    A (List[str]): The first list of token texts.
    B (List[str]): The second list of token texts.
    RETURNS (Tuple[List[List[int]], List[List[int]]]): Two lists a2b and b2a.
        a2b has exactly one entry per token in A, holding the sorted indices
        of the tokens in B that share at least one character with it, and
        vice versa for b2a. Tokens that only consist of unaligned whitespace
        (or that are empty) get an empty list.
    RAISES (ValueError): E949 if the texts differ in more than whitespace
        and capitalization.
    """
    # Lowercasing can change the length of a string (e.g. "İ" becomes two
    # code points), so the character-to-token maps have to be built from the
    # lowercased tokens to stay in sync with the lowercased strings below.
    lower_a = [x.lower() for x in A]
    lower_b = [x.lower() for x in B]
    char_to_token_a = tuple(chain(*((i,) * len(x) for i, x in enumerate(lower_a))))
    char_to_token_b = tuple(chain(*((i,) * len(x) for i, x in enumerate(lower_b))))
    str_a = "".join(A).lower()
    str_b = "".join(B).lower()
    cdef int len_str_a = len(str_a)
    cdef int len_str_b = len(str_b)
    # Check that the two texts only differ in whitespace and capitalization
    if re.sub(r"\s+", "", str_a) != re.sub(r"\s+", "", str_b) or \
            len_str_a != len(char_to_token_a) or \
            len_str_b != len(char_to_token_b):
        raise ValueError(Errors.E949.format(x=str(A[:10]), y=str(B[:10])))
    cdef int char_idx_a = 0
    cdef int char_idx_b = 0
    cdef int token_idx_a = 0
    cdef int token_idx_b = 0
    # One (distinct) set per token up front: the result then always has the
    # same length as the input, also for empty tokens, unaligned whitespace
    # tokens and trailing tokens that are never reached by the loop.
    a2b = [set() for _ in A]
    b2a = [set() for _ in B]
    while char_idx_a < len_str_a and char_idx_b < len_str_b:
        # Find the current token position from the character position
        token_idx_a = char_to_token_a[char_idx_a]
        token_idx_b = char_to_token_b[char_idx_b]
        # Whole tokens may only be consumed in one step if both positions are
        # at the start of their token. Otherwise jumping ahead by the token
        # length would overshoot and skip characters of the following tokens.
        at_token_start = (
            (char_idx_a == 0 or char_to_token_a[char_idx_a - 1] != token_idx_a)
            and (char_idx_b == 0 or char_to_token_b[char_idx_b - 1] != token_idx_b)
        )
        if at_token_start and A[token_idx_a] == B[token_idx_b]:
            # Current tokens are identical and start at the current position
            a2b[token_idx_a].add(token_idx_b)
            b2a[token_idx_b].add(token_idx_a)
            # Advance in lowercased characters, the unit of the char indices
            char_idx_a += len(lower_a[token_idx_a])
            char_idx_b += len(lower_b[token_idx_b])
        elif str_a[char_idx_a] == str_b[char_idx_b]:
            # Current chars are identical
            a2b[token_idx_a].add(token_idx_b)
            b2a[token_idx_b].add(token_idx_a)
            char_idx_a += 1
            char_idx_b += 1
        elif str_a[char_idx_a].isspace():
            # Skip unaligned whitespace char in A
            char_idx_a += 1
        elif str_b[char_idx_b].isspace():
            # Skip unaligned whitespace char in B
            char_idx_b += 1
        else:
            # This should never happen
            raise ValueError(Errors.E949.format(x=str(A[:10]), y=str(B[:10])))
    # Return values as sorted lists per token position
    return [sorted(x) for x in a2b], [sorted(x) for x in b2a]
|  | @ -2,9 +2,8 @@ from typing import List | ||||||
| import numpy | import numpy | ||||||
| from thinc.types import Ragged | from thinc.types import Ragged | ||||||
| from dataclasses import dataclass | from dataclasses import dataclass | ||||||
| import tokenizations |  | ||||||
| 
 | 
 | ||||||
| from ..errors import Errors | from .align import get_alignments | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @dataclass | @dataclass | ||||||
|  | @ -20,9 +19,7 @@ class Alignment: | ||||||
| 
 | 
 | ||||||
@classmethod
def from_strings(cls, A: List[str], B: List[str]) -> "Alignment":
    """Create an Alignment from two lists of token texts.

    A (List[str]): The first list of token texts.
    B (List[str]): The second list of token texts.
    RETURNS (Alignment): The alignment built from the index mappings that
        get_alignments computes for A and B.
    """
    # get_alignments returns the mappings for both directions (A->B, B->A)
    x2y, y2x = get_alignments(A, B)
    return Alignment.from_indices(x2y=x2y, y2x=y2x)
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -7,7 +7,7 @@ from ..tokens.doc cimport Doc | ||||||
| from ..tokens.span cimport Span | from ..tokens.span cimport Span | ||||||
| from ..tokens.span import Span | from ..tokens.span import Span | ||||||
| from ..attrs import IDS | from ..attrs import IDS | ||||||
| from .align import Alignment | from .alignment import Alignment | ||||||
| from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags | from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags | ||||||
| from .iob_utils import biluo_tags_to_spans | from .iob_utils import biluo_tags_to_spans | ||||||
| from ..errors import Errors, Warnings | from ..errors import Errors, Warnings | ||||||
|  |  | ||||||
|  | @ -36,6 +36,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": | ||||||
|     # Resolve all training-relevant sections using the filled nlp config |     # Resolve all training-relevant sections using the filled nlp config | ||||||
|     T = registry.resolve(config["training"], schema=ConfigSchemaTraining) |     T = registry.resolve(config["training"], schema=ConfigSchemaTraining) | ||||||
|     dot_names = [T["train_corpus"], T["dev_corpus"]] |     dot_names = [T["train_corpus"], T["dev_corpus"]] | ||||||
|  |     if not isinstance(T["train_corpus"], str): | ||||||
|  |         raise ConfigValidationError(desc=Errors.E897.format(field="training.train_corpus", type=type(T["train_corpus"]))) | ||||||
|  |     if not isinstance(T["dev_corpus"], str): | ||||||
|  |         raise ConfigValidationError(desc=Errors.E897.format(field="training.dev_corpus", type=type(T["dev_corpus"]))) | ||||||
|     train_corpus, dev_corpus = resolve_dot_names(config, dot_names) |     train_corpus, dev_corpus = resolve_dot_names(config, dot_names) | ||||||
|     optimizer = T["optimizer"] |     optimizer = T["optimizer"] | ||||||
|     # Components that shouldn't be updated during training |     # Components that shouldn't be updated during training | ||||||
|  |  | ||||||
|  | @ -17,7 +17,7 @@ from ..ml.models.multi_task import build_cloze_multi_task_model | ||||||
| from ..ml.models.multi_task import build_cloze_characters_multi_task_model | from ..ml.models.multi_task import build_cloze_characters_multi_task_model | ||||||
| from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain | from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain | ||||||
| from ..errors import Errors | from ..errors import Errors | ||||||
| from ..util import registry, load_model_from_config, dot_to_object | from ..util import registry, load_model_from_config, resolve_dot_names | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def pretrain( | def pretrain( | ||||||
|  | @ -38,7 +38,7 @@ def pretrain( | ||||||
|     _config = nlp.config.interpolate() |     _config = nlp.config.interpolate() | ||||||
|     T = registry.resolve(_config["training"], schema=ConfigSchemaTraining) |     T = registry.resolve(_config["training"], schema=ConfigSchemaTraining) | ||||||
|     P = registry.resolve(_config["pretraining"], schema=ConfigSchemaPretrain) |     P = registry.resolve(_config["pretraining"], schema=ConfigSchemaPretrain) | ||||||
|     corpus = dot_to_object(T, P["corpus"]) |     corpus = resolve_dot_names(_config, [P["corpus"]])[0] | ||||||
|     batcher = P["batcher"] |     batcher = P["batcher"] | ||||||
|     model = create_pretraining_model(nlp, P) |     model = create_pretraining_model(nlp, P) | ||||||
|     optimizer = P["optimizer"] |     optimizer = P["optimizer"] | ||||||
|  |  | ||||||
|  | @ -143,7 +143,7 @@ argument that connects to the shared `tok2vec` component in the pipeline. | ||||||
| 
 | 
 | ||||||
| Construct an embedding layer that separately embeds a number of lexical | Construct an embedding layer that separately embeds a number of lexical | ||||||
| attributes using hash embedding, concatenates the results, and passes it through | attributes using hash embedding, concatenates the results, and passes it through | ||||||
| a feed-forward subnetwork to build a mixed representations. The features used | a feed-forward subnetwork to build a mixed representation. The features used | ||||||
| can be configured with the `attrs` argument. The suggested attributes are | can be configured with the `attrs` argument. The suggested attributes are | ||||||
| `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account | `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account | ||||||
| some subword information, without constructing a fully character-based | some subword information, without constructing a fully character-based | ||||||
|  | @ -516,26 +516,54 @@ several different built-in architectures. It is recommended to experiment with | ||||||
| different architectures and settings to determine what works best on your | different architectures and settings to determine what works best on your | ||||||
| specific data and challenge. | specific data and challenge. | ||||||
| 
 | 
 | ||||||
| ### spacy.TextCatEnsemble.v1 {#TextCatEnsemble} | ### spacy.TextCatEnsemble.v2 {#TextCatEnsemble} | ||||||
| 
 | 
 | ||||||
| > #### Example Config | > #### Example Config | ||||||
| > | > | ||||||
| > ```ini | > ```ini | ||||||
| > [model] | > [model] | ||||||
| > @architectures = "spacy.TextCatEnsemble.v1" | > @architectures = "spacy.TextCatEnsemble.v2" | ||||||
| > exclusive_classes = false |  | ||||||
| > pretrained_vectors = null |  | ||||||
| > width = 64 |  | ||||||
| > embed_size = 2000 |  | ||||||
| > conv_depth = 2 |  | ||||||
| > window_size = 1 |  | ||||||
| > ngram_size = 1 |  | ||||||
| > dropout = null |  | ||||||
| > nO = null | > nO = null | ||||||
|  | > | ||||||
|  | > [model.linear_model] | ||||||
|  | > @architectures = "spacy.TextCatBOW.v1" | ||||||
|  | > exclusive_classes = true | ||||||
|  | > ngram_size = 1 | ||||||
|  | > no_output_layer = false | ||||||
|  | > | ||||||
|  | > [model.tok2vec] | ||||||
|  | > @architectures = "spacy.Tok2Vec.v1" | ||||||
|  | > | ||||||
|  | > [model.tok2vec.embed] | ||||||
|  | > @architectures = "spacy.MultiHashEmbed.v1" | ||||||
|  | > width = 64 | ||||||
|  | > rows = [2000, 2000, 1000, 1000, 1000, 1000] | ||||||
|  | > attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"] | ||||||
|  | > include_static_vectors = false | ||||||
|  | > | ||||||
|  | > [model.tok2vec.encode] | ||||||
|  | > @architectures = "spacy.MaxoutWindowEncoder.v1" | ||||||
|  | > width = ${model.tok2vec.embed.width} | ||||||
|  | > window_size = 1 | ||||||
|  | > maxout_pieces = 3 | ||||||
|  | > depth = 2 | ||||||
| > ``` | > ``` | ||||||
| 
 | 
 | ||||||
| Stacked ensemble of a bag-of-words model and a neural network model. The neural | Stacked ensemble of a linear bag-of-words model and a neural network model. The | ||||||
| network has an internal CNN Tok2Vec layer and uses attention. | neural network is built upon a Tok2Vec layer and uses attention. The setting for | ||||||
|  | whether or not this model should cater for multi-label classification is taken | ||||||
|  | from the linear model, where it is stored in `model.attrs["multi_label"]`. | ||||||
|  | 
 | ||||||
|  | | Name           | Description                                                                                                                                                                                    | | ||||||
|  | | -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||||
|  | | `linear_model` | The linear bag-of-words model. ~~Model[List[Doc], Floats2d]~~                                                                                                                                  | | ||||||
|  | | `tok2vec`      | The `tok2vec` layer to build the neural network upon. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                     | | ||||||
|  | | `nO`           | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | ||||||
|  | | **CREATES**    | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               | | ||||||
|  | 
 | ||||||
|  | <Accordion title="spacy.TextCatEnsemble.v1 definition" spaced> | ||||||
|  | 
 | ||||||
|  | The v1 architecture was functionally similar, but used an internal `tok2vec` instead of taking it as an argument. | ||||||
| 
 | 
 | ||||||
| | Name                 | Description                                                                                                                                                                                    | | | Name                 | Description                                                                                                                                                                                    | | ||||||
| | -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | | -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||||
|  | @ -550,6 +578,8 @@ network has an internal CNN Tok2Vec layer and uses attention. | ||||||
| | `nO`                 | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | | `nO`                 | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ | | ||||||
| | **CREATES**          | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               | | | **CREATES**          | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               | | ||||||
| 
 | 
 | ||||||
|  | </Accordion> | ||||||
|  | 
 | ||||||
| ### spacy.TextCatCNN.v1 {#TextCatCNN} | ### spacy.TextCatCNN.v1 {#TextCatCNN} | ||||||
| 
 | 
 | ||||||
| > #### Example Config | > #### Example Config | ||||||
|  |  | ||||||
|  | @ -683,6 +683,7 @@ The L2 norm of the document's vector representation. | ||||||
| | `user_hooks`                         | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~                                                   | | | `user_hooks`                         | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~                                                   | | ||||||
| | `user_token_hooks`                   | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~                                           | | | `user_token_hooks`                   | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~                                           | | ||||||
| | `user_span_hooks`                    | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~                                            | | | `user_span_hooks`                    | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~                                            | | ||||||
|  | | `has_unknown_spaces`                 | Whether the document was constructed without known spacing between tokens (typically when created from gold tokenization). ~~bool~~         | | ||||||
| | `_`                                  | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~               | | | `_`                                  | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~               | | ||||||
| 
 | 
 | ||||||
| ## Serialization fields {#serialization-fields} | ## Serialization fields {#serialization-fields} | ||||||
|  |  | ||||||
|  | @ -68,6 +68,8 @@ Scores the tokenization: | ||||||
| - `token_p`, `token_r`, `token_f`: precision, recall and F-score for token | - `token_p`, `token_r`, `token_f`: precision, recall and F-score for token | ||||||
|   character spans |   character spans | ||||||
| 
 | 
 | ||||||
|  | Docs with `has_unknown_spaces` are skipped during scoring. | ||||||
|  | 
 | ||||||
| > #### Example | > #### Example | ||||||
| > | > | ||||||
| > ```python | > ```python | ||||||
|  | @ -81,7 +83,8 @@ Scores the tokenization: | ||||||
| 
 | 
 | ||||||
| ## Scorer.score_token_attr {#score_token_attr tag="staticmethod" new="3"} | ## Scorer.score_token_attr {#score_token_attr tag="staticmethod" new="3"} | ||||||
| 
 | 
 | ||||||
| Scores a single token attribute. | Scores a single token attribute. Tokens with missing values in the reference doc | ||||||
|  | are skipped during scoring. | ||||||
| 
 | 
 | ||||||
| > #### Example | > #### Example | ||||||
| > | > | ||||||
|  | @ -90,20 +93,22 @@ Scores a single token attribute. | ||||||
| > print(scores["pos_acc"]) | > print(scores["pos_acc"]) | ||||||
| > ``` | > ``` | ||||||
| 
 | 
 | ||||||
| | Name           | Description                                                                                                                                                   | | | Name             | Description                                                                                                                                                   | | ||||||
| | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | | | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||||
| | `examples`     | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~                                           | | | `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~                                           | | ||||||
| | `attr`         | The attribute to score. ~~str~~                                                                                                                               | | | `attr`           | The attribute to score. ~~str~~                                                                                                                               | | ||||||
| | _keyword-only_ |                                                                                                                                                               | | | _keyword-only_   |                                                                                                                                                               | | ||||||
| | `getter`       | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | | | `getter`         | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | | ||||||
| | **RETURNS**    | A dictionary containing the score `{attr}_acc`. ~~Dict[str, float]~~                                                                                          | | | `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~                                        | | ||||||
|  | | **RETURNS**      | A dictionary containing the score `{attr}_acc`. ~~Dict[str, float]~~                                                                                          | | ||||||
| 
 | 
 | ||||||
| ## Scorer.score_token_attr_per_feat {#score_token_attr_per_feat tag="staticmethod" new="3"} | ## Scorer.score_token_attr_per_feat {#score_token_attr_per_feat tag="staticmethod" new="3"} | ||||||
| 
 | 
 | ||||||
| Scores a single token attribute per feature for a token attribute in the | Scores a single token attribute per feature for a token attribute in the | ||||||
| Universal Dependencies | Universal Dependencies | ||||||
| [FEATS](https://universaldependencies.org/format.html#morphological-annotation) | [FEATS](https://universaldependencies.org/format.html#morphological-annotation) | ||||||
| format. | format. Tokens with missing values in the reference doc are skipped during | ||||||
|  | scoring. | ||||||
| 
 | 
 | ||||||
| > #### Example | > #### Example | ||||||
| > | > | ||||||
|  | @ -112,13 +117,14 @@ format. | ||||||
| > print(scores["morph_per_feat"]) | > print(scores["morph_per_feat"]) | ||||||
| > ``` | > ``` | ||||||
| 
 | 
 | ||||||
| | Name           | Description                                                                                                                                                   | | | Name             | Description                                                                                                                                                   | | ||||||
| | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | | | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||||
| | `examples`     | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~                                           | | | `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~                                           | | ||||||
| | `attr`         | The attribute to score. ~~str~~                                                                                                                               | | | `attr`           | The attribute to score. ~~str~~                                                                                                                               | | ||||||
| | _keyword-only_ |                                                                                                                                                               | | | _keyword-only_   |                                                                                                                                                               | | ||||||
| | `getter`       | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | | | `getter`         | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | | ||||||
| | **RETURNS**    | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. ~~Dict[str, Dict[str, float]]~~                                           | | | `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~                                        | | ||||||
|  | | **RETURNS**      | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. ~~Dict[str, Dict[str, float]]~~                                           | | ||||||
| 
 | 
 | ||||||
| ## Scorer.score_spans {#score_spans tag="staticmethod" new="3"} | ## Scorer.score_spans {#score_spans tag="staticmethod" new="3"} | ||||||
| 
 | 
 | ||||||
|  | @ -131,17 +137,19 @@ Returns PRF scores for labeled or unlabeled spans. | ||||||
| > print(scores["ents_f"]) | > print(scores["ents_f"]) | ||||||
| > ``` | > ``` | ||||||
| 
 | 
 | ||||||
| | Name           | Description                                                                                                                                                                                 | | | Name             | Description                                                                                                                                                                                 | | ||||||
| | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||||
| | `examples`     | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~                                                                         | | | `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~                                                                         | | ||||||
| | `attr`         | The attribute to score. ~~str~~                                                                                                                                                             | | | `attr`           | The attribute to score. ~~str~~                                                                                                                                                             | | ||||||
| | _keyword-only_ |                                                                                                                                                                                             | | | _keyword-only_   |                                                                                                                                                                                             | | ||||||
| | `getter`       | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. ~~Callable[[Doc, str], Iterable[Span]]~~                                  | | | `getter`         | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. ~~Callable[[Doc, str], Iterable[Span]]~~                                  | | ||||||
| | **RETURNS**    | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ | | | `has_annotation` | Defaults to `None`. If provided, `has_annotation(doc)` should return whether a `Doc` has annotation for this `attr`. Docs without annotation are skipped for scoring purposes. ~~Optional[Callable[[Doc], bool]]~~ | | ||||||
|  | | **RETURNS**      | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ | | ||||||
| 
 | 
 | ||||||
| ## Scorer.score_deps {#score_deps tag="staticmethod" new="3"} | ## Scorer.score_deps {#score_deps tag="staticmethod" new="3"} | ||||||
| 
 | 
 | ||||||
| Calculate the UAS, LAS, and LAS per type scores for dependency parses. | Calculate the UAS, LAS, and LAS per type scores for dependency parses. Tokens | ||||||
|  | with missing values for the `attr` (typically `dep`) are skipped during scoring. | ||||||
| 
 | 
 | ||||||
| > #### Example | > #### Example | ||||||
| > | > | ||||||
|  | @ -160,29 +168,40 @@ Calculate the UAS, LAS, and LAS per type scores for dependency parses. | ||||||
| > print(scores["dep_uas"], scores["dep_las"]) | > print(scores["dep_uas"], scores["dep_las"]) | ||||||
| > ``` | > ``` | ||||||
| 
 | 
 | ||||||
| | Name            | Description                                                                                                                                                   | | | Name             | Description                                                                                                                                                   | | ||||||
| | --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | | | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||||
| | `examples`      | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~                                           | | | `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~                                           | | ||||||
| | `attr`          | The attribute to score. ~~str~~                                                                                                                               | | | `attr`           | The attribute to score. ~~str~~                                                                                                                               | | ||||||
| | _keyword-only_  |                                                                                                                                                               | | | _keyword-only_   |                                                                                                                                                               | | ||||||
| | `getter`        | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | | | `getter`         | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ | | ||||||
| | `head_attr`     | The attribute containing the head token. ~~str~~                                                                                                              | | | `head_attr`      | The attribute containing the head token. ~~str~~                                                                                                              | | ||||||
| | `head_getter`   | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. ~~Callable[[Token, str], Token]~~            | | | `head_getter`    | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. ~~Callable[[Token, str], Token]~~            | | ||||||
| | `ignore_labels` | Labels to ignore while scoring (e.g. `"punct"`). ~~Iterable[str]~~                                                                                            | | | `ignore_labels`  | Labels to ignore while scoring (e.g. `"punct"`). ~~Iterable[str]~~                                                                                            | | ||||||
| | **RETURNS**     | A dictionary containing the scores: `{attr}_uas`, `{attr}_las`, and `{attr}_las_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~                      | | | `missing_values` | Attribute values to treat as missing annotation in the reference annotation. Defaults to `{0, None, ""}`. ~~Set[Any]~~                                        | | ||||||
|  | | **RETURNS**      | A dictionary containing the scores: `{attr}_uas`, `{attr}_las`, and `{attr}_las_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~                      | | ||||||
| 
 | 
 | ||||||
| ## Scorer.score_cats {#score_cats tag="staticmethod" new="3"} | ## Scorer.score_cats {#score_cats tag="staticmethod" new="3"} | ||||||
| 
 | 
 | ||||||
| Calculate PRF and ROC AUC scores for a doc-level attribute that is a dict | Calculate PRF and ROC AUC scores for a doc-level attribute that is a dict | ||||||
| containing scores for each label like `Doc.cats`. The reported overall score | containing scores for each label like `Doc.cats`. The returned dictionary | ||||||
| depends on the scorer settings: | contains the following scores: | ||||||
| 
 | 
 | ||||||
| 1. **all:** `{attr}_score` (one of `{attr}_f` / `{attr}_macro_f` / | - `{attr}_micro_p`, `{attr}_micro_r` and `{attr}_micro_f`: each instance across | ||||||
|    `{attr}_macro_auc`), `{attr}_score_desc` (text description of the overall |   each label is weighted equally | ||||||
|    score), `{attr}_f_per_type`, `{attr}_auc_per_type` | - `{attr}_macro_p`, `{attr}_macro_r` and `{attr}_macro_f`: the average values | ||||||
| 2. **binary exclusive with positive label:** `{attr}_p`, `{attr}_r`, `{attr}_f` |   across evaluations per label | ||||||
| 3. **3+ exclusive classes**, macro-averaged F-score: `{attr}_macro_f`; | - `{attr}_f_per_type` and `{attr}_auc_per_type`: each contains a dictionary of | ||||||
| 4. **multilabel**, macro-averaged AUC: `{attr}_macro_auc` |   scores, keyed by label | ||||||
|  | - A final `{attr}_score` and corresponding `{attr}_score_desc` (text | ||||||
|  |   description) | ||||||
|  | 
 | ||||||
|  | The reported `{attr}_score` depends on the classification properties: | ||||||
|  | 
 | ||||||
|  | - **binary exclusive with positive label:** `{attr}_score` is set to the F-score | ||||||
|  |   of the positive label | ||||||
|  | - **3+ exclusive classes**, macro-averaged F-score: | ||||||
|  |   `{attr}_score = {attr}_macro_f` | ||||||
|  | - **multilabel**, macro-averaged AUC: `{attr}_score = {attr}_macro_auc` | ||||||
| 
 | 
 | ||||||
| > #### Example | > #### Example | ||||||
| > | > | ||||||
|  |  | ||||||
|  | @ -115,7 +115,7 @@ print(french_fries, "<->", burgers, french_fries.similarity(burgers)) | ||||||
| 
 | 
 | ||||||
| Computing similarity scores can be helpful in many situations, but it's also | Computing similarity scores can be helpful in many situations, but it's also | ||||||
| important to maintain **realistic expectations** about what information it can | important to maintain **realistic expectations** about what information it can | ||||||
| provide. Words can be related to each over in many ways, so a single | provide. Words can be related to each other in many ways, so a single | ||||||
| "similarity" score will always be a **mix of different signals**, and vectors | "similarity" score will always be a **mix of different signals**, and vectors | ||||||
| trained on different data can produce very different results that may not be | trained on different data can produce very different results that may not be | ||||||
| useful for your purpose. Here are some important considerations to keep in mind: | useful for your purpose. Here are some important considerations to keep in mind: | ||||||
|  |  | ||||||
|  | @ -130,16 +130,31 @@ factory = "textcat" | ||||||
| labels = [] | labels = [] | ||||||
| 
 | 
 | ||||||
| [components.textcat.model] | [components.textcat.model] | ||||||
| @architectures = "spacy.TextCatEnsemble.v1" | @architectures = "spacy.TextCatEnsemble.v2" | ||||||
| exclusive_classes = false |  | ||||||
| pretrained_vectors = null |  | ||||||
| width = 64 |  | ||||||
| conv_depth = 2 |  | ||||||
| embed_size = 2000 |  | ||||||
| window_size = 1 |  | ||||||
| ngram_size = 1 |  | ||||||
| dropout = 0 |  | ||||||
| nO = null | nO = null | ||||||
|  | 
 | ||||||
|  | [components.textcat.model.tok2vec] | ||||||
|  | @architectures = "spacy.Tok2Vec.v1" | ||||||
|  | 
 | ||||||
|  | [components.textcat.model.tok2vec.embed] | ||||||
|  | @architectures = "spacy.MultiHashEmbed.v1" | ||||||
|  | width = 64 | ||||||
|  | rows = [2000, 2000, 1000, 1000, 1000, 1000] | ||||||
|  | attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"] | ||||||
|  | include_static_vectors = false | ||||||
|  | 
 | ||||||
|  | [components.textcat.model.tok2vec.encode] | ||||||
|  | @architectures = "spacy.MaxoutWindowEncoder.v1" | ||||||
|  | width = ${components.textcat.model.tok2vec.embed.width} | ||||||
|  | window_size = 1 | ||||||
|  | maxout_pieces = 3 | ||||||
|  | depth = 2 | ||||||
|  | 
 | ||||||
|  | [components.textcat.model.linear_model] | ||||||
|  | @architectures = "spacy.TextCatBOW.v1" | ||||||
|  | exclusive_classes = false | ||||||
|  | ngram_size = 1 | ||||||
|  | no_output_layer = false | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
| spaCy has two additional built-in `textcat` architectures, and you can easily | spaCy has two additional built-in `textcat` architectures, and you can easily | ||||||
|  |  | ||||||
|  | @ -1244,15 +1244,10 @@ labels = [] | ||||||
| # This function is created and then passed to the "textcat" component as | # This function is created and then passed to the "textcat" component as | ||||||
| # the argument "model" | # the argument "model" | ||||||
| [components.textcat.model] | [components.textcat.model] | ||||||
| @architectures = "spacy.TextCatEnsemble.v1" | @architectures = "spacy.TextCatBOW.v1" | ||||||
| exclusive_classes = false | exclusive_classes = false | ||||||
| pretrained_vectors = null |  | ||||||
| width = 64 |  | ||||||
| conv_depth = 2 |  | ||||||
| embed_size = 2000 |  | ||||||
| window_size = 1 |  | ||||||
| ngram_size = 1 | ngram_size = 1 | ||||||
| dropout = null | no_output_layer = false | ||||||
| 
 | 
 | ||||||
| [components.other_textcat] | [components.other_textcat] | ||||||
| factory = "textcat" | factory = "textcat" | ||||||
|  |  | ||||||
|  | @ -1142,7 +1142,7 @@ pattern = [ | ||||||
|     { |     { | ||||||
|         "LEFT_ID": "anchor_founded", |         "LEFT_ID": "anchor_founded", | ||||||
|         "REL_OP": ">", |         "REL_OP": ">", | ||||||
|         "RIGHT_ID": "subject", |         "RIGHT_ID": "founded_subject", | ||||||
|         "RIGHT_ATTRS": {"DEP": "nsubj"}, |         "RIGHT_ATTRS": {"DEP": "nsubj"}, | ||||||
|     } |     } | ||||||
|     # ... |     # ... | ||||||
|  | @ -1212,7 +1212,7 @@ pattern = [ | ||||||
|     { |     { | ||||||
|         "LEFT_ID": "anchor_founded", |         "LEFT_ID": "anchor_founded", | ||||||
|         "REL_OP": ">", |         "REL_OP": ">", | ||||||
|         "RIGHT_ID": "subject", |         "RIGHT_ID": "founded_subject", | ||||||
|         "RIGHT_ATTRS": {"DEP": "nsubj"}, |         "RIGHT_ATTRS": {"DEP": "nsubj"}, | ||||||
|     }, |     }, | ||||||
|     { |     { | ||||||
|  |  | ||||||
|  | @ -717,7 +717,7 @@ tabular results to a file: | ||||||
| ```python | ```python | ||||||
| ### functions.py | ### functions.py | ||||||
| import sys | import sys | ||||||
| from typing import IO, Tuple, Callable, Dict, Any | from typing import IO, Tuple, Callable, Dict, Any, Optional | ||||||
| import spacy | import spacy | ||||||
| from spacy import Language | from spacy import Language | ||||||
| from pathlib import Path | from pathlib import Path | ||||||
|  | @ -729,7 +729,7 @@ def custom_logger(log_path): | ||||||
|         stdout: IO=sys.stdout, |         stdout: IO=sys.stdout, | ||||||
|         stderr: IO=sys.stderr |         stderr: IO=sys.stderr | ||||||
|     ) -> Tuple[Callable, Callable]: |     ) -> Tuple[Callable, Callable]: | ||||||
|         stdout.write(f"Logging to {log_path}\n") |         stdout.write(f"Logging to {log_path}\\n") | ||||||
|         log_file = Path(log_path).open("w", encoding="utf8") |         log_file = Path(log_path).open("w", encoding="utf8") | ||||||
|         log_file.write("step\\t") |         log_file.write("step\\t") | ||||||
|         log_file.write("score\\t") |         log_file.write("score\\t") | ||||||
|  |  | ||||||
|  | @ -433,14 +433,14 @@ The following methods, attributes and commands are new in spaCy v3.0. | ||||||
| | Name                                                                                                                            | Description                                                                                                                                                                                      | | | Name                                                                                                                            | Description                                                                                                                                                                                      | | ||||||
| | ------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | | ------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ||||||
| | [`Token.lex`](/api/token#attributes)                                                                                            | Access a token's [`Lexeme`](/api/lexeme).                                                                                                                                                        | | | [`Token.lex`](/api/token#attributes)                                                                                            | Access a token's [`Lexeme`](/api/lexeme).                                                                                                                                                        | | ||||||
| | [`Token.morph`](/api/token#attributes), [`Token.morph_`](/api/token#attributes)                                                 | Access a token's morphological analysis.                                                                                                                                                         | | | [`Token.morph`](/api/token#attributes)                                                                                          | Access a token's morphological analysis.                                                                                                                                                         | | ||||||
| | [`Doc.has_annotation`](/api/doc#has_annotation)                                                                                 | Check whether a doc has annotation on a token attribute.                                                                                                                                         | | | [`Doc.has_annotation`](/api/doc#has_annotation)                                                                                 | Check whether a doc has annotation on a token attribute.                                                                                                                                         | | ||||||
| | [`Language.select_pipes`](/api/language#select_pipes)                                                                           | Context manager for enabling or disabling specific pipeline components for a block.                                                                                                              | | | [`Language.select_pipes`](/api/language#select_pipes)                                                                           | Context manager for enabling or disabling specific pipeline components for a block.                                                                                                              | | ||||||
| | [`Language.disable_pipe`](/api/language#disable_pipe), [`Language.enable_pipe`](/api/language#enable_pipe)                      | Disable or enable a loaded pipeline component (but don't remove it).                                                                                                                             | | | [`Language.disable_pipe`](/api/language#disable_pipe), [`Language.enable_pipe`](/api/language#enable_pipe)                      | Disable or enable a loaded pipeline component (but don't remove it).                                                                                                                             | | ||||||
| | [`Language.analyze_pipes`](/api/language#analyze_pipes)                                                                         | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies.                                                                                                          | | | [`Language.analyze_pipes`](/api/language#analyze_pipes)                                                                         | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies.                                                                                                          | | ||||||
| | [`Language.resume_training`](/api/language#resume_training)                                                                     | Experimental: continue training a trained pipeline and initialize "rehearsal" for components that implement a `rehearse` method to prevent catastrophic forgetting.                              | | | [`Language.resume_training`](/api/language#resume_training)                                                                     | Experimental: continue training a trained pipeline and initialize "rehearsal" for components that implement a `rehearse` method to prevent catastrophic forgetting.                              | | ||||||
| | [`@Language.factory`](/api/language#factory), [`@Language.component`](/api/language#component)                                  | Decorators for [registering](/usage/processing-pipelines#custom-components) pipeline component factories and simple stateless component functions.                                               | | | [`@Language.factory`](/api/language#factory), [`@Language.component`](/api/language#component)                                  | Decorators for [registering](/usage/processing-pipelines#custom-components) pipeline component factories and simple stateless component functions.                                               | | ||||||
| | [`Language.has_factory`](/api/language#has_factory)                                                                             | Check whether a component factory is registered on a language class.                                                                                                                            | | | [`Language.has_factory`](/api/language#has_factory)                                                                             | Check whether a component factory is registered on a language class.                                                                                                                             | | ||||||
| | [`Language.get_factory_meta`](/api/language#get_factory_meta), [`Language.get_pipe_meta`](/api/language#get_factory_meta)       | Get the [`FactoryMeta`](/api/language#factorymeta) with component metadata for a factory or instance name.                                                                                       | | | [`Language.get_factory_meta`](/api/language#get_factory_meta), [`Language.get_pipe_meta`](/api/language#get_factory_meta)       | Get the [`FactoryMeta`](/api/language#factorymeta) with component metadata for a factory or instance name.                                                                                       | | ||||||
| | [`Language.config`](/api/language#config)                                                                                       | The [config](/usage/training#config) used to create the current `nlp` object. An instance of [`Config`](https://thinc.ai/docs/api-config#config) and can be saved to disk and used for training. | | | [`Language.config`](/api/language#config)                                                                                       | The [config](/usage/training#config) used to create the current `nlp` object. An instance of [`Config`](https://thinc.ai/docs/api-config#config) and can be saved to disk and used for training. | | ||||||
| | [`Language.components`](/api/language#attributes), [`Language.component_names`](/api/language#attributes)                       | All available components and component names, including disabled components that are not run as part of the pipeline.                                                                            | | | [`Language.components`](/api/language#attributes), [`Language.component_names`](/api/language#attributes)                       | All available components and component names, including disabled components that are not run as part of the pipeline.                                                                            | | ||||||
|  | @ -1032,9 +1032,9 @@ change your names and imports: | ||||||
| Thanks to everyone who's been contributing to the spaCy ecosystem by developing | Thanks to everyone who's been contributing to the spaCy ecosystem by developing | ||||||
| and maintaining one of the many awesome [plugins and extensions](/universe). | and maintaining one of the many awesome [plugins and extensions](/universe). | ||||||
| We've tried to make it as easy as possible for you to upgrade your packages for | We've tried to make it as easy as possible for you to upgrade your packages for | ||||||
| spaCy v3.0. The most common use case for plugins is providing pipeline components | spaCy v3.0. The most common use case for plugins is providing pipeline | ||||||
| and extension attributes. When migrating your plugin, double-check the | components and extension attributes. When migrating your plugin, double-check | ||||||
| following: | the following: | ||||||
| 
 | 
 | ||||||
| - Use the [`@Language.factory`](/api/language#factory) decorator to register | - Use the [`@Language.factory`](/api/language#factory) decorator to register | ||||||
|   your component and assign it a name. This allows users to refer to your |   your component and assign it a name. This allows users to refer to your | ||||||
|  |  | ||||||
|  | @ -257,7 +257,7 @@ output_path.open("w", encoding="utf-8").write(svg) | ||||||
| Since each visualization is generated as a separate SVG, exporting `.svg` files | Since each visualization is generated as a separate SVG, exporting `.svg` files | ||||||
| only works if you're rendering **one single doc** at a time. (This makes sense – | only works if you're rendering **one single doc** at a time. (This makes sense – | ||||||
| after all, each visualization should be a standalone graphic.) So instead of | after all, each visualization should be a standalone graphic.) So instead of | ||||||
| rendering all `Doc`s at one, loop over them and export them separately. | rendering all `Doc`s at once, loop over them and export them separately. | ||||||
| 
 | 
 | ||||||
| </Infobox> | </Infobox> | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -120,7 +120,7 @@ function formatAccuracy(data) { | ||||||
|                 ? null |                 ? null | ||||||
|                 : { |                 : { | ||||||
|                       label, |                       label, | ||||||
|                       value: value.toFixed(2), |                       value: (value * 100).toFixed(2), | ||||||
|                       help: MODEL_META[label], |                       help: MODEL_META[label], | ||||||
|                   } |                   } | ||||||
|         }) |         }) | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user