From 0bc214c1028bbc33c101c7cc48c3f1a2dff6c663 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 24 Sep 2020 16:11:33 +0200 Subject: [PATCH 01/16] Fix pull --- spacy/cli/project/pull.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/project/pull.py b/spacy/cli/project/pull.py index 3119d3a12..26676d5b3 100644 --- a/spacy/cli/project/pull.py +++ b/spacy/cli/project/pull.py @@ -51,7 +51,7 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False): update_lockfile(project_dir, cmd) # We remove the command from the list here, and break, so that # we iterate over the loop again. - commands.remove(i) + commands.pop(i) break else: # If we didn't break the for loop, break the while loop. From d0ef4a4cf5f3d2db1e6624634731ac09b2eeda42 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 16:42:13 +0200 Subject: [PATCH 02/16] Prevent division by zero in score weights --- spacy/tests/pipeline/test_pipe_factories.py | 5 +++-- spacy/util.py | 5 ++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index 4c197005e..07648024c 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -345,12 +345,13 @@ def test_language_factories_invalid(): [{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}], {"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25}, ), - ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75},), + ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75}), + ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"a": 0.0, "b": 0.0, "c": 0.0}), ], ) def test_language_factories_combine_score_weights(weights, expected): result = combine_score_weights(weights) - assert sum(result.values()) in (0.99, 1.0) + assert sum(result.values()) in (0.99, 1.0, 0.0) assert result == expected diff --git a/spacy/util.py b/spacy/util.py index 709da8d29..ad3298651 100644 --- a/spacy/util.py +++ 
b/spacy/util.py @@ -1233,7 +1233,10 @@ def combine_score_weights( # components. total = sum(w_dict.values()) for key, value in w_dict.items(): - weight = round(value / total / len(all_weights), 2) + if total == 0: + weight = 0.0 + else: + weight = round(value / total / len(all_weights), 2) result[key] = result.get(key, 0.0) + weight return result From c7eedd3534f551d5d23b0dfddc5e2be603780ddd Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 24 Sep 2020 16:53:59 +0200 Subject: [PATCH 03/16] updates to NEL functionality (#6132) * NEL: read sentences and ents from reference * fiddling with sent_start annotations * add KB serialization test * KB write additional file with strings.json * score_links function to calculate NEL P/R/F * formatting * documentation --- spacy/errors.py | 4 +- spacy/kb.pyx | 39 ++++-- spacy/pipeline/entity_linker.py | 26 +++- spacy/scorer.py | 68 ++++++++++ spacy/tests/pipeline/test_entity_linker.py | 121 +++++++++++++++--- .../tests/{ => training}/test_new_example.py | 19 +++ website/docs/api/entitylinker.md | 15 +++ website/docs/api/scorer.md | 23 ++++ 8 files changed, 273 insertions(+), 42 deletions(-) rename spacy/tests/{ => training}/test_new_example.py (91%) diff --git a/spacy/errors.py b/spacy/errors.py index 6fdf8cb57..50d2fea5f 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -517,8 +517,8 @@ class Errors: "instead.") E927 = ("Can't write to frozen list Maybe you're trying to modify a computed " "property or default function argument?") - E928 = ("A 'KnowledgeBase' should be written to / read from a file, but the " - "provided argument {loc} is an existing directory.") + E928 = ("A 'KnowledgeBase' can only be serialized to/from from a directory, " + "but the provided argument {loc} points to a file.") E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does " "not seem to exist.") E930 = ("Received invalid get_examples callback in {name}.begin_training. 
" diff --git a/spacy/kb.pyx b/spacy/kb.pyx index ff5382c24..bdf652766 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -10,6 +10,8 @@ from libcpp.vector cimport vector from pathlib import Path import warnings +from spacy.strings import StringStore + from spacy import util from .typedefs cimport hash_t @@ -83,6 +85,9 @@ cdef class KnowledgeBase: DOCS: https://nightly.spacy.io/api/kb """ + contents_loc = "contents" + strings_loc = "strings.json" + def __init__(self, Vocab vocab, entity_vector_length): """Create a KnowledgeBase.""" self.mem = Pool() @@ -319,15 +324,29 @@ cdef class KnowledgeBase: return 0.0 - def to_disk(self, path): path = util.ensure_path(path) - if path.is_dir(): + if not path.exists(): + path.mkdir(parents=True) + if not path.is_dir(): raise ValueError(Errors.E928.format(loc=path)) - if not path.parent.exists(): - path.parent.mkdir(parents=True) + self.write_contents(path / self.contents_loc) + self.vocab.strings.to_disk(path / self.strings_loc) - cdef Writer writer = Writer(path) + def from_disk(self, path): + path = util.ensure_path(path) + if not path.exists(): + raise ValueError(Errors.E929.format(loc=path)) + if not path.is_dir(): + raise ValueError(Errors.E928.format(loc=path)) + self.read_contents(path / self.contents_loc) + kb_strings = StringStore() + kb_strings.from_disk(path / self.strings_loc) + for string in kb_strings: + self.vocab.strings.add(string) + + def write_contents(self, file_path): + cdef Writer writer = Writer(file_path) writer.write_header(self.get_size_entities(), self.entity_vector_length) # dumping the entity vectors in their original order @@ -366,13 +385,7 @@ cdef class KnowledgeBase: writer.close() - def from_disk(self, path): - path = util.ensure_path(path) - if path.is_dir(): - raise ValueError(Errors.E928.format(loc=path)) - if not path.exists(): - raise ValueError(Errors.E929.format(loc=path)) - + def read_contents(self, file_path): cdef hash_t entity_hash cdef hash_t alias_hash cdef int64_t entry_index @@ -382,7 
+395,7 @@ cdef class KnowledgeBase: cdef AliasC alias cdef float vector_element - cdef Reader reader = Reader(path) + cdef Reader reader = Reader(file_path) # STEP 0: load header and initialize KB cdef int64_t nr_entities diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 1debadd82..fec53c77a 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -16,6 +16,7 @@ from ..training import Example, validate_examples from ..errors import Errors, Warnings from ..util import SimpleFrozenList from .. import util +from ..scorer import Scorer default_model_config = """ @@ -47,6 +48,8 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"] "incl_context": True, "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, }, + scores=["nel_micro_p", "nel_micro_r", "nel_micro_f"], + default_score_weights={"nel_micro_f": 1.0}, ) def make_entity_linker( nlp: Language, @@ -209,12 +212,11 @@ class EntityLinker(Pipe): # it does run the model twice :( predictions = self.model.predict(docs) for eg in examples: - sentences = [s for s in eg.predicted.sents] + sentences = [s for s in eg.reference.sents] kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True) - for ent in eg.predicted.ents: - kb_id = kb_ids[ - ent.start - ] # KB ID of the first token is the same as the whole span + for ent in eg.reference.ents: + # KB ID of the first token is the same as the whole span + kb_id = kb_ids[ent.start] if kb_id: try: # find the sentence in the list of sentences. @@ -253,7 +255,7 @@ class EntityLinker(Pipe): entity_encodings = [] for eg in examples: kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True) - for ent in eg.predicted.ents: + for ent in eg.reference.ents: kb_id = kb_ids[ent.start] if kb_id: entity_encoding = self.kb.get_vector(kb_id) @@ -415,6 +417,18 @@ class EntityLinker(Pipe): for token in ent: token.ent_kb_id_ = kb_id + def score(self, examples, **kwargs): + """Score a batch of examples. 
+ + examples (Iterable[Example]): The examples to score. + RETURNS (Dict[str, Any]): The scores. + + DOCS TODO: https://nightly.spacy.io/api/entity_linker#score + """ + validate_examples(examples, "EntityLinker.score") + return Scorer.score_links(examples, negative_labels=[self.NIL]) + + def to_disk( self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() ) -> None: diff --git a/spacy/scorer.py b/spacy/scorer.py index c50de3d43..cd3b013cd 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -451,6 +451,74 @@ class Scorer: results[f"{attr}_score_desc"] = "macro AUC" return results + @staticmethod + def score_links( + examples: Iterable[Example], *, negative_labels: Iterable[str] + ) -> Dict[str, Any]: + """Returns PRF for predicted links on the entity level. + To disentangle the performance of the NEL from the NER, + this method only evaluates NEL links for entities that overlap + between the gold reference and the predictions. + + examples (Iterable[Example]): Examples to score + negative_labels (Iterable[str]): The string values that refer to no annotation (e.g. "NIL") + RETURNS (Dict[str, Any]): A dictionary containing the scores. 
+ + DOCS (TODO): https://nightly.spacy.io/api/scorer#score_links + """ + f_per_type = {} + for example in examples: + gold_ent_by_offset = {} + for gold_ent in example.reference.ents: + gold_ent_by_offset[(gold_ent.start_char, gold_ent.end_char)] = gold_ent + + for pred_ent in example.predicted.ents: + gold_span = gold_ent_by_offset.get( + (pred_ent.start_char, pred_ent.end_char), None + ) + label = gold_span.label_ + if not label in f_per_type: + f_per_type[label] = PRFScore() + gold = gold_span.kb_id_ + # only evaluating entities that overlap between gold and pred, + # to disentangle the performance of the NEL from the NER + if gold is not None: + pred = pred_ent.kb_id_ + if gold in negative_labels and pred in negative_labels: + # ignore true negatives + pass + elif gold == pred: + f_per_type[label].tp += 1 + elif gold in negative_labels: + f_per_type[label].fp += 1 + elif pred in negative_labels: + f_per_type[label].fn += 1 + else: + # a wrong prediction (e.g. Q42 != Q3) counts as both a FP as well as a FN + f_per_type[label].fp += 1 + f_per_type[label].fn += 1 + micro_prf = PRFScore() + for label_prf in f_per_type.values(): + micro_prf.tp += label_prf.tp + micro_prf.fn += label_prf.fn + micro_prf.fp += label_prf.fp + n_labels = len(f_per_type) + 1e-100 + macro_p = sum(prf.precision for prf in f_per_type.values()) / n_labels + macro_r = sum(prf.recall for prf in f_per_type.values()) / n_labels + macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_labels + results = { + f"nel_score": micro_prf.fscore, + f"nel_score_desc": "micro F", + f"nel_micro_p": micro_prf.precision, + f"nel_micro_r": micro_prf.recall, + f"nel_micro_f": micro_prf.fscore, + f"nel_macro_p": macro_p, + f"nel_macro_r": macro_r, + f"nel_macro_f": macro_f, + f"nel_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()}, + } + return results + @staticmethod def score_deps( examples: Iterable[Example], diff --git a/spacy/tests/pipeline/test_entity_linker.py 
b/spacy/tests/pipeline/test_entity_linker.py index 88e0646b3..878f41a28 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -2,8 +2,10 @@ from typing import Callable, Iterable import pytest from spacy.kb import KnowledgeBase, get_candidates, Candidate +from spacy.vocab import Vocab from spacy import util, registry +from spacy.scorer import Scorer from spacy.training import Example from spacy.lang.en import English from spacy.tests.util import make_tempdir @@ -151,22 +153,15 @@ def test_kb_serialize(nlp): # normal read-write behaviour mykb.to_disk(d / "kb") mykb.from_disk(d / "kb") - mykb.to_disk(d / "kb.file") - mykb.from_disk(d / "kb.file") mykb.to_disk(d / "new" / "kb") mykb.from_disk(d / "new" / "kb") # allow overwriting an existing file - mykb.to_disk(d / "kb.file") - with pytest.raises(ValueError): - # can not write to a directory - mykb.to_disk(d) - with pytest.raises(ValueError): - # can not read from a directory - mykb.from_disk(d) + mykb.to_disk(d / "kb") with pytest.raises(ValueError): # can not read from an unknown file mykb.from_disk(d / "unknown" / "kb") + def test_candidate_generation(nlp): """Test correct candidate generation""" mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) @@ -254,6 +249,41 @@ def test_el_pipe_configuration(nlp): assert doc[2].ent_kb_id_ == "Q2" +def test_vocab_serialization(nlp): + """Test that string information is retained across storage""" + mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) + + # adding entities + q1_hash = mykb.add_entity(entity="Q1", freq=27, entity_vector=[1]) + q2_hash = mykb.add_entity(entity="Q2", freq=12, entity_vector=[2]) + q3_hash = mykb.add_entity(entity="Q3", freq=5, entity_vector=[3]) + + # adding aliases + douglas_hash = mykb.add_alias( + alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1] + ) + adam_hash = mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9]) + + candidates = 
mykb.get_alias_candidates("adam") + assert len(candidates) == 1 + assert candidates[0].entity == q2_hash + assert candidates[0].entity_ == "Q2" + assert candidates[0].alias == adam_hash + assert candidates[0].alias_ == "adam" + + with make_tempdir() as d: + mykb.to_disk(d / "kb") + kb_new_vocab = KnowledgeBase(Vocab(), entity_vector_length=1) + kb_new_vocab.from_disk(d / "kb") + + candidates = kb_new_vocab.get_alias_candidates("adam") + assert len(candidates) == 1 + assert candidates[0].entity == q2_hash + assert candidates[0].entity_ == "Q2" + assert candidates[0].alias == adam_hash + assert candidates[0].alias_ == "adam" + + def test_append_alias(nlp): """Test that we can append additional alias-entity pairs""" mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) @@ -377,16 +407,20 @@ def test_preserving_links_ents_2(nlp): TRAIN_DATA = [ ("Russ Cochran captured his first major title with his son as caddie.", {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}}, - "entities": [(0, 12, "PERSON")]}), + "entities": [(0, 12, "PERSON")], + "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}), ("Russ Cochran his reprints include EC Comics.", {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}, - "entities": [(0, 12, "PERSON")]}), + "entities": [(0, 12, "PERSON")], + "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0]}), ("Russ Cochran has been publishing comic art.", {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}, - "entities": [(0, 12, "PERSON")]}), + "entities": [(0, 12, "PERSON")], + "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0]}), ("Russ Cochran was a member of University of Kentucky's golf team.", {"links": {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}}, - "entities": [(0, 12, "PERSON"), (43, 51, "LOC")]}), + "entities": [(0, 12, "PERSON"), (43, 51, "LOC")], + "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}) ] GOLD_entities = ["Q2146908", "Q7381115", "Q7381115", "Q2146908"] # fmt: on @@ -395,16 +429,8 @@ GOLD_entities = ["Q2146908", "Q7381115", 
"Q7381115", "Q2146908"] def test_overfitting_IO(): # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly nlp = English() - nlp.add_pipe("sentencizer") vector_length = 3 - # Add a custom component to recognize "Russ Cochran" as an entity for the example training data - patterns = [ - {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]} - ] - ruler = nlp.add_pipe("entity_ruler") - ruler.add_patterns(patterns) - # Convert the texts to docs to make sure we have doc.ents set for the training examples train_examples = [] for text, annotation in TRAIN_DATA: @@ -446,6 +472,16 @@ def test_overfitting_IO(): nlp.update(train_examples, sgd=optimizer, losses=losses) assert losses["entity_linker"] < 0.001 + # adding additional components that are required for the entity_linker + nlp.add_pipe("sentencizer", first=True) + + # Add a custom component to recognize "Russ Cochran" as an entity for the example training data + patterns = [ + {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]} + ] + ruler = nlp.add_pipe("entity_ruler", before="entity_linker") + ruler.add_patterns(patterns) + # test the trained model predictions = [] for text, annotation in TRAIN_DATA: @@ -465,3 +501,46 @@ def test_overfitting_IO(): for ent in doc2.ents: predictions.append(ent.kb_id_) assert predictions == GOLD_entities + + +def test_scorer_links(): + train_examples = [] + nlp = English() + ref1 = nlp("Julia lives in London happily.") + ref1.ents = [ + Span(ref1, 0, 1, label="PERSON", kb_id="Q2"), + Span(ref1, 3, 4, label="LOC", kb_id="Q3"), + ] + pred1 = nlp("Julia lives in London happily.") + pred1.ents = [ + Span(pred1, 0, 1, label="PERSON", kb_id="Q70"), + Span(pred1, 3, 4, label="LOC", kb_id="Q3"), + ] + train_examples.append(Example(pred1, ref1)) + + ref2 = nlp("She loves London.") + ref2.ents = [ + Span(ref2, 0, 1, label="PERSON", kb_id="Q2"), + Span(ref2, 2, 3, label="LOC", kb_id="Q13"), + ] + pred2 = nlp("She 
loves London.") + pred2.ents = [ + Span(pred2, 0, 1, label="PERSON", kb_id="Q2"), + Span(pred2, 2, 3, label="LOC", kb_id="NIL"), + ] + train_examples.append(Example(pred2, ref2)) + + ref3 = nlp("London is great.") + ref3.ents = [Span(ref3, 0, 1, label="LOC", kb_id="NIL")] + pred3 = nlp("London is great.") + pred3.ents = [Span(pred3, 0, 1, label="LOC", kb_id="NIL")] + train_examples.append(Example(pred3, ref3)) + + scores = Scorer().score_links(train_examples, negative_labels=["NIL"]) + assert scores["nel_f_per_type"]["PERSON"]["p"] == 1 / 2 + assert scores["nel_f_per_type"]["PERSON"]["r"] == 1 / 2 + assert scores["nel_f_per_type"]["LOC"]["p"] == 1 / 1 + assert scores["nel_f_per_type"]["LOC"]["r"] == 1 / 2 + + assert scores["nel_micro_p"] == 2 / 3 + assert scores["nel_micro_r"] == 2 / 4 diff --git a/spacy/tests/test_new_example.py b/spacy/tests/training/test_new_example.py similarity index 91% rename from spacy/tests/test_new_example.py rename to spacy/tests/training/test_new_example.py index 597809286..81207b640 100644 --- a/spacy/tests/test_new_example.py +++ b/spacy/tests/training/test_new_example.py @@ -244,3 +244,22 @@ def test_Example_from_dict_with_links_invalid(annots): predicted = Doc(vocab, words=annots["words"]) with pytest.raises(ValueError): Example.from_dict(predicted, annots) + + +def test_Example_from_dict_sentences(): + vocab = Vocab() + predicted = Doc(vocab, words=["One", "sentence", ".", "one", "more"]) + annots = {"sent_starts": [1, 0, 0, 1, 0]} + ex = Example.from_dict(predicted, annots) + assert len(list(ex.reference.sents)) == 2 + + # this currently throws an error - bug or feature? 
+ # predicted = Doc(vocab, words=["One", "sentence", "not", "one", "more"]) + # annots = {"sent_starts": [1, 0, 0, 0, 0]} + # ex = Example.from_dict(predicted, annots) + # assert len(list(ex.reference.sents)) == 1 + + predicted = Doc(vocab, words=["One", "sentence", "not", "one", "more"]) + annots = {"sent_starts": [1, -1, 0, 0, 0]} + ex = Example.from_dict(predicted, annots) + assert len(list(ex.reference.sents)) == 1 diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index 9cb35b487..945a1568a 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -225,6 +225,21 @@ pipe's entity linking model and context encoder. Delegates to | `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | | **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | +## EntityLinker.score {#score tag="method" new="3"} + +Score a batch of examples. + +> #### Example +> +> ```python +> scores = entity_linker.score(examples) +> ``` + +| Name | Description | +| ----------- | ---------------------------------------------------------------------------------------------- | +| `examples` | The examples to score. ~~Iterable[Example]~~ | +| **RETURNS** | The scores, produced by [`Scorer.score_links`](/api/scorer#score_links) . ~~Dict[str, float]~~ | + ## EntityLinker.create_optimizer {#create_optimizer tag="method"} Create an optimizer for the pipeline component. diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md index 1c0895bcf..0dbc0de33 100644 --- a/website/docs/api/scorer.md +++ b/website/docs/api/scorer.md @@ -206,3 +206,26 @@ depends on the scorer settings: | `multi_label` | Whether the attribute allows multiple labels. Defaults to `True`. ~~bool~~ | | `positive_label` | The positive label for a binary task with exclusive classes. Defaults to `None`. 
~~Optional[str]~~ | | **RETURNS** | A dictionary containing the scores, with inapplicable scores as `None`. ~~Dict[str, Optional[float]]~~ | + +## Scorer.score_links {#score_links tag="staticmethod" new="3"} + +Returns PRF for predicted links on the entity level. To disentangle the +performance of the NEL from the NER, this method only evaluates NEL links for +entities that overlap between the gold reference and the predictions. + +> #### Example +> +> ```python +> scores = Scorer.score_links( +> examples, +> negative_labels=["NIL", ""] +> ) +> print(scores["nel_micro_f"]) +> ``` + +| Name | Description | +| ----------------- | ------------------------------------------------------------------------------------------------------------------- | +| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `negative_labels` | The string values that refer to no annotation (e.g. "NIL"). ~~Iterable[str]~~ | +| **RETURNS** | A dictionary containing the scores. 
~~Dict[str, Optional[float]]~~ | From 59340606b7881928c924e4c11bc59192522fedb8 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 24 Sep 2020 16:54:39 +0200 Subject: [PATCH 04/16] Add option to disable Matcher errors (#6125) * Add option to disable Matcher errors * Add option to disable Matcher errors when a doc doesn't contain a particular type of annotation Minor additional change: * Update `AttributeRuler.load_from_morph_rules` to allow direct `MORPH` values * Rename suppress_errors to allow_missing Co-authored-by: Matthew Honnibal * Refactor annotation checks in Matcher and PhraseMatcher Co-authored-by: Matthew Honnibal --- spacy/errors.py | 4 ---- spacy/matcher/matcher.pyx | 25 ++++++++++++++----------- spacy/matcher/phrasematcher.pyx | 22 ++++++++++++---------- spacy/pipeline/attributeruler.py | 18 +++++++++++++----- spacy/tests/matcher/test_matcher_api.py | 3 +++ 5 files changed, 42 insertions(+), 30 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 50d2fea5f..4216e3936 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -401,10 +401,6 @@ class Errors: "Matcher or PhraseMatcher with the attribute {attr}. " "Try using nlp() instead of nlp.make_doc() or list(nlp.pipe()) " "instead of list(nlp.tokenizer.pipe()).") - E156 = ("The pipeline needs to include a parser in order to use " - "Matcher or PhraseMatcher with the attribute DEP. Try using " - "nlp() instead of nlp.make_doc() or list(nlp.pipe()) instead of " - "list(nlp.tokenizer.pipe()).") E157 = ("Can't render negative values for dependency arc start or end. 
" "Make sure that you're passing in absolute token indices, not " "relative token offsets.\nstart: {start}, end: {end}, label: " diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index d83f58181..39c7168e4 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -195,7 +195,7 @@ cdef class Matcher: else: yield doc - def __call__(self, object doclike, *, as_spans=False): + def __call__(self, object doclike, *, as_spans=False, allow_missing=False): """Find all token sequences matching the supplied pattern. doclike (Doc or Span): The document to match over. @@ -215,16 +215,19 @@ cdef class Matcher: else: raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__)) cdef Pool tmp_pool = Pool() - if TAG in self._seen_attrs and not doc.has_annotation("TAG"): - raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG")) - if POS in self._seen_attrs and not doc.has_annotation("POS"): - raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS")) - if MORPH in self._seen_attrs and not doc.has_annotation("MORPH"): - raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH")) - if LEMMA in self._seen_attrs and not doc.has_annotation("LEMMA"): - raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA")) - if DEP in self._seen_attrs and not doc.has_annotation("DEP"): - raise ValueError(Errors.E156.format()) + if not allow_missing: + for attr in (TAG, POS, MORPH, LEMMA, DEP): + if attr in self._seen_attrs and not doc.has_annotation(attr): + if attr == TAG: + pipe = "tagger" + elif attr in (POS, MORPH): + pipe = "morphologizer" + elif attr == LEMMA: + pipe = "lemmatizer" + elif attr == DEP: + pipe = "parser" + error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr)) + raise ValueError(error_msg) matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length, extensions=self._extensions, predicates=self._extra_predicates) final_matches = [] diff 
--git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index b00ba157f..7e99859b5 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -186,16 +186,18 @@ cdef class PhraseMatcher: if isinstance(doc, Doc): attrs = (TAG, POS, MORPH, LEMMA, DEP) has_annotation = {attr: doc.has_annotation(attr) for attr in attrs} - if self.attr == TAG and not has_annotation[TAG]: - raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG")) - if self.attr == POS and not has_annotation[POS]: - raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS")) - if self.attr == MORPH and not has_annotation[MORPH]: - raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH")) - if self.attr == LEMMA and not has_annotation[LEMMA]: - raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA")) - if self.attr == DEP and not has_annotation[DEP]: - raise ValueError(Errors.E156.format()) + for attr in attrs: + if self.attr == attr and not has_annotation[attr]: + if attr == TAG: + pipe = "tagger" + elif attr in (POS, MORPH): + pipe = "morphologizer" + elif attr == LEMMA: + pipe = "lemmatizer" + elif attr == DEP: + pipe = "parser" + error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr)) + raise ValueError(error_msg) if self._validate and any(has_annotation.values()) \ and self.attr not in attrs: string_attr = self.vocab.strings[self.attr] diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index f64fcbc54..0d59a1ba0 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -79,7 +79,7 @@ class AttributeRuler(Pipe): DOCS: https://nightly.spacy.io/api/attributeruler#call """ - matches = sorted(self.matcher(doc)) + matches = sorted(self.matcher(doc, allow_missing=True)) for match_id, start, end in matches: span = Span(doc, start, end, label=match_id) @@ -126,8 +126,12 @@ class AttributeRuler(Pipe): for tag, attrs in tag_map.items(): 
pattern = [{"TAG": tag}] attrs, morph_attrs = _split_morph_attrs(attrs) - morph = self.vocab.morphology.add(morph_attrs) - attrs["MORPH"] = self.vocab.strings[morph] + if "MORPH" not in attrs: + morph = self.vocab.morphology.add(morph_attrs) + attrs["MORPH"] = self.vocab.strings[morph] + else: + morph = self.vocab.morphology.add(attrs["MORPH"]) + attrs["MORPH"] = self.vocab.strings[morph] self.add([pattern], attrs) def load_from_morph_rules( @@ -146,8 +150,12 @@ class AttributeRuler(Pipe): pattern = [{"ORTH": word, "TAG": tag}] attrs = morph_rules[tag][word] attrs, morph_attrs = _split_morph_attrs(attrs) - morph = self.vocab.morphology.add(morph_attrs) - attrs["MORPH"] = self.vocab.strings[morph] + if "MORPH" in attrs: + morph = self.vocab.morphology.add(attrs["MORPH"]) + attrs["MORPH"] = self.vocab.strings[morph] + elif morph_attrs: + morph = self.vocab.morphology.add(morph_attrs) + attrs["MORPH"] = self.vocab.strings[morph] self.add([pattern], attrs) def add( diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 04f9585f1..c407595e5 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -316,6 +316,9 @@ def test_attr_pipeline_checks(en_vocab): matcher(doc2) with pytest.raises(ValueError): matcher(doc3) + # errors can be suppressed if desired + matcher(doc2, allow_missing=True) + matcher(doc3, allow_missing=True) # TAG, POS, LEMMA require those values for attr in ("TAG", "POS", "LEMMA"): matcher = Matcher(en_vocab) From 3c062b3911d70f0f9521653cac6d0a7b85bc272f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 24 Sep 2020 16:55:09 +0200 Subject: [PATCH 05/16] Add MORPH handling to Matcher (#6107) * Add MORPH handling to Matcher * Add `MORPH` to `Matcher` schema * Rename `_SetMemberPredicate` to `_SetPredicate` * Add `ISSUBSET` and `ISSUPERSET` operators to `_SetPredicate` * Add special handling for normalization and conversion of morph values into sets * For other 
attrs, `ISSUBSET` acts like `IN` and `ISSUPERSET` only matches for 0 or 1 values * Update test * Rename to IS_SUBSET and IS_SUPERSET --- spacy/matcher/matcher.pyx | 52 +++++++---- spacy/schemas.py | 5 ++ spacy/tests/matcher/test_matcher_api.py | 100 ++++++++++++++++++++++ website/docs/api/matcher.md | 30 ++++--- website/docs/usage/rule-based-matching.md | 30 ++++--- 5 files changed, 174 insertions(+), 43 deletions(-) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 39c7168e4..a4d20ec55 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -17,6 +17,7 @@ from ..vocab cimport Vocab from ..tokens.doc cimport Doc, get_token_attr_for_matcher from ..tokens.span cimport Span from ..tokens.token cimport Token +from ..tokens.morphanalysis cimport MorphAnalysis from ..attrs cimport ID, attr_id_t, NULL_ATTR, ORTH, POS, TAG, DEP, LEMMA, MORPH from ..schemas import validate_token_pattern @@ -124,7 +125,7 @@ cdef class Matcher: key = self._normalize_key(key) for pattern in patterns: try: - specs = _preprocess_pattern(pattern, self.vocab.strings, + specs = _preprocess_pattern(pattern, self.vocab, self._extensions, self._extra_predicates) self.patterns.push_back(init_pattern(self.mem, key, specs)) for spec in specs: @@ -663,7 +664,7 @@ cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil: return id_attr.value -def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predicates): +def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates): """This function interprets the pattern, converting the various bits of syntactic sugar before we compile it into a struct with init_pattern. @@ -678,6 +679,7 @@ def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predi extra_predicates. 
""" tokens = [] + string_store = vocab.strings for spec in token_specs: if not spec: # Signifier for 'any token' @@ -688,7 +690,7 @@ def _preprocess_pattern(token_specs, string_store, extensions_table, extra_predi ops = _get_operators(spec) attr_values = _get_attr_values(spec, string_store) extensions = _get_extensions(spec, string_store, extensions_table) - predicates = _get_extra_predicates(spec, extra_predicates) + predicates = _get_extra_predicates(spec, extra_predicates, vocab) for op in ops: tokens.append((op, list(attr_values), list(extensions), list(predicates))) return tokens @@ -732,7 +734,7 @@ def _get_attr_values(spec, string_store): class _RegexPredicate: operators = ("REGEX",) - def __init__(self, i, attr, value, predicate, is_extension=False): + def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None): self.i = i self.attr = attr self.value = re.compile(value) @@ -750,13 +752,18 @@ class _RegexPredicate: return bool(self.value.search(value)) -class _SetMemberPredicate: - operators = ("IN", "NOT_IN") +class _SetPredicate: + operators = ("IN", "NOT_IN", "IS_SUBSET", "IS_SUPERSET") - def __init__(self, i, attr, value, predicate, is_extension=False): + def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None): self.i = i self.attr = attr - self.value = set(get_string_id(v) for v in value) + self.vocab = vocab + if self.attr == MORPH: + # normalize morph strings + self.value = set(self.vocab.morphology.add(v) for v in value) + else: + self.value = set(get_string_id(v) for v in value) self.predicate = predicate self.is_extension = is_extension self.key = (attr, self.predicate, srsly.json_dumps(value, sort_keys=True)) @@ -768,19 +775,32 @@ class _SetMemberPredicate: value = get_string_id(token._.get(self.attr)) else: value = get_token_attr_for_matcher(token.c, self.attr) + + if self.predicate in ("IS_SUBSET", "IS_SUPERSET"): + if self.attr == MORPH: + # break up MORPH into individual Feat=Val values + value = 
set(get_string_id(v) for v in MorphAnalysis.from_id(self.vocab, value)) + else: + # IS_SUBSET for other attrs will be equivalent to "IN" + # IS_SUPERSET will only match for other attrs with 0 or 1 values + value = set([value]) if self.predicate == "IN": return value in self.value - else: + elif self.predicate == "NOT_IN": return value not in self.value + elif self.predicate == "IS_SUBSET": + return value <= self.value + elif self.predicate == "IS_SUPERSET": + return value >= self.value def __repr__(self): - return repr(("SetMemberPredicate", self.i, self.attr, self.value, self.predicate)) + return repr(("SetPredicate", self.i, self.attr, self.value, self.predicate)) class _ComparisonPredicate: operators = ("==", "!=", ">=", "<=", ">", "<") - def __init__(self, i, attr, value, predicate, is_extension=False): + def __init__(self, i, attr, value, predicate, is_extension=False, vocab=None): self.i = i self.attr = attr self.value = value @@ -809,11 +829,13 @@ class _ComparisonPredicate: return value < self.value -def _get_extra_predicates(spec, extra_predicates): +def _get_extra_predicates(spec, extra_predicates, vocab): predicate_types = { "REGEX": _RegexPredicate, - "IN": _SetMemberPredicate, - "NOT_IN": _SetMemberPredicate, + "IN": _SetPredicate, + "NOT_IN": _SetPredicate, + "IS_SUBSET": _SetPredicate, + "IS_SUPERSET": _SetPredicate, "==": _ComparisonPredicate, "!=": _ComparisonPredicate, ">=": _ComparisonPredicate, @@ -841,7 +863,7 @@ def _get_extra_predicates(spec, extra_predicates): value_with_upper_keys = {k.upper(): v for k, v in value.items()} for type_, cls in predicate_types.items(): if type_ in value_with_upper_keys: - predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_) + predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_, vocab=vocab) # Don't create a redundant predicates. # This helps with efficiency, as we're caching the results. 
if predicate.key in seen_predicates: diff --git a/spacy/schemas.py b/spacy/schemas.py index eea6639d3..0c85dfe57 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -61,6 +61,8 @@ class TokenPatternString(BaseModel): REGEX: Optional[StrictStr] = Field(None, alias="regex") IN: Optional[List[StrictStr]] = Field(None, alias="in") NOT_IN: Optional[List[StrictStr]] = Field(None, alias="not_in") + IS_SUBSET: Optional[List[StrictStr]] = Field(None, alias="is_subset") + IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset") class Config: extra = "forbid" @@ -77,6 +79,8 @@ class TokenPatternNumber(BaseModel): REGEX: Optional[StrictStr] = Field(None, alias="regex") IN: Optional[List[StrictInt]] = Field(None, alias="in") NOT_IN: Optional[List[StrictInt]] = Field(None, alias="not_in") + ISSUBSET: Optional[List[StrictInt]] = Field(None, alias="issubset") + ISSUPERSET: Optional[List[StrictInt]] = Field(None, alias="issuperset") EQ: Union[StrictInt, StrictFloat] = Field(None, alias="==") NEQ: Union[StrictInt, StrictFloat] = Field(None, alias="!=") GEQ: Union[StrictInt, StrictFloat] = Field(None, alias=">=") @@ -115,6 +119,7 @@ class TokenPattern(BaseModel): lower: Optional[StringValue] = None pos: Optional[StringValue] = None tag: Optional[StringValue] = None + morph: Optional[StringValue] = None dep: Optional[StringValue] = None lemma: Optional[StringValue] = None shape: Optional[StringValue] = None diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index c407595e5..627110cdd 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -230,6 +230,106 @@ def test_matcher_set_value_operator(en_vocab): assert len(matches) == 1 +def test_matcher_subset_value_operator(en_vocab): + matcher = Matcher(en_vocab) + pattern = [{"MORPH": {"IS_SUBSET": ["Feat=Val", "Feat2=Val2"]}}] + matcher.add("M", [pattern]) + doc = Doc(en_vocab, words=["a", "b", "c"]) + assert len(matcher(doc)) == 
3 + doc[0].morph_ = "Feat=Val" + assert len(matcher(doc)) == 3 + doc[0].morph_ = "Feat=Val|Feat2=Val2" + assert len(matcher(doc)) == 3 + doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3" + assert len(matcher(doc)) == 2 + doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4" + assert len(matcher(doc)) == 2 + + # IS_SUBSET acts like "IN" for attrs other than MORPH + matcher = Matcher(en_vocab) + pattern = [{"TAG": {"IS_SUBSET": ["A", "B"]}}] + matcher.add("M", [pattern]) + doc = Doc(en_vocab, words=["a", "b", "c"]) + doc[0].tag_ = "A" + assert len(matcher(doc)) == 1 + + # IS_SUBSET with an empty list matches nothing + matcher = Matcher(en_vocab) + pattern = [{"TAG": {"IS_SUBSET": []}}] + matcher.add("M", [pattern]) + doc = Doc(en_vocab, words=["a", "b", "c"]) + doc[0].tag_ = "A" + assert len(matcher(doc)) == 0 + + +def test_matcher_superset_value_operator(en_vocab): + matcher = Matcher(en_vocab) + pattern = [{"MORPH": {"IS_SUPERSET": ["Feat=Val", "Feat2=Val2", "Feat3=Val3"]}}] + matcher.add("M", [pattern]) + doc = Doc(en_vocab, words=["a", "b", "c"]) + assert len(matcher(doc)) == 0 + doc[0].morph_ = "Feat=Val|Feat2=Val2" + assert len(matcher(doc)) == 0 + doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3" + assert len(matcher(doc)) == 1 + doc[0].morph_ = "Feat=Val|Feat2=Val2|Feat3=Val3|Feat4=Val4" + assert len(matcher(doc)) == 1 + + # IS_SUPERSET with more than one value only matches for MORPH + matcher = Matcher(en_vocab) + pattern = [{"TAG": {"IS_SUPERSET": ["A", "B"]}}] + matcher.add("M", [pattern]) + doc = Doc(en_vocab, words=["a", "b", "c"]) + doc[0].tag_ = "A" + assert len(matcher(doc)) == 0 + + # IS_SUPERSET with one value is the same as == + matcher = Matcher(en_vocab) + pattern = [{"TAG": {"IS_SUPERSET": ["A"]}}] + matcher.add("M", [pattern]) + doc = Doc(en_vocab, words=["a", "b", "c"]) + doc[0].tag_ = "A" + assert len(matcher(doc)) == 1 + + # IS_SUPERSET with an empty value matches everything + matcher = Matcher(en_vocab) + pattern = [{"TAG": 
{"IS_SUPERSET": []}}] + matcher.add("M", [pattern]) + doc = Doc(en_vocab, words=["a", "b", "c"]) + doc[0].tag_ = "A" + assert len(matcher(doc)) == 3 + + +def test_matcher_morph_handling(en_vocab): + # order of features in pattern doesn't matter + matcher = Matcher(en_vocab) + pattern1 = [{"MORPH": {"IN": ["Feat1=Val1|Feat2=Val2"]}}] + pattern2 = [{"MORPH": {"IN": ["Feat2=Val2|Feat1=Val1"]}}] + matcher.add("M", [pattern1]) + matcher.add("N", [pattern2]) + doc = Doc(en_vocab, words=["a", "b", "c"]) + assert len(matcher(doc)) == 0 + + doc[0].morph_ = "Feat2=Val2|Feat1=Val1" + assert len(matcher(doc)) == 2 + doc[0].morph_ = "Feat1=Val1|Feat2=Val2" + assert len(matcher(doc)) == 2 + + # multiple values are split + matcher = Matcher(en_vocab) + pattern1 = [{"MORPH": {"IS_SUPERSET": ["Feat1=Val1", "Feat2=Val2"]}}] + pattern2 = [{"MORPH": {"IS_SUPERSET": ["Feat1=Val1", "Feat1=Val3", "Feat2=Val2"]}}] + matcher.add("M", [pattern1]) + matcher.add("N", [pattern2]) + doc = Doc(en_vocab, words=["a", "b", "c"]) + assert len(matcher(doc)) == 0 + + doc[0].morph_ = "Feat2=Val2,Val3|Feat1=Val1" + assert len(matcher(doc)) == 1 + doc[0].morph_ = "Feat1=Val1,Val3|Feat2=Val2" + assert len(matcher(doc)) == 2 + + def test_matcher_regex(en_vocab): matcher = Matcher(en_vocab) pattern = [{"ORTH": {"REGEX": r"(?:a|an)"}}] diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md index 1f1946be5..3f7076a1c 100644 --- a/website/docs/api/matcher.md +++ b/website/docs/api/matcher.md @@ -30,20 +30,20 @@ pattern keys correspond to a number of [`Token` attributes](/api/token#attributes). The supported attributes for rule-based matching are: -| Attribute |  Description | -| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- | -| `ORTH` | The exact verbatim text of a token. ~~str~~ | -| `TEXT` 2.1 | The exact verbatim text of a token. 
~~str~~ | -| `LOWER` | The lowercase form of the token text. ~~str~~ | -|  `LENGTH` | The length of the token text. ~~int~~ | -|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~ | -|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. ~~bool~~ | -|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ | -|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ | -|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. ~~str~~ | -| `ENT_TYPE` | The token's entity label. ~~str~~ | -| `_` 2.1 | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ | -| `OP` | Operator or quantifier to determine how often to match a token pattern. ~~str~~ | +| Attribute |  Description | +| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- | +| `ORTH` | The exact verbatim text of a token. ~~str~~ | +| `TEXT` 2.1 | The exact verbatim text of a token. ~~str~~ | +| `LOWER` | The lowercase form of the token text. ~~str~~ | +|  `LENGTH` | The length of the token text. ~~int~~ | +|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~ | +|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. ~~bool~~ | +|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ | +|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ | +|  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. 
~~str~~ | +| `ENT_TYPE` | The token's entity label. ~~str~~ | +| `_` 2.1 | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ | +| `OP` | Operator or quantifier to determine how often to match a token pattern. ~~str~~ | Operators and quantifiers define **how often** a token pattern should be matched: @@ -79,6 +79,8 @@ it compares to another value. | -------------------------- | ------------------------------------------------------------------------------------------------------- | | `IN` | Attribute value is member of a list. ~~Any~~ | | `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ | +| `IS_SUBSET` | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~ | +| `IS_SUPERSET` | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~ | | `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ | ## Matcher.\_\_init\_\_ {#init tag="method"} diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index 7e979b32e..256f4ccb4 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -158,20 +158,20 @@ The available token pattern keys correspond to a number of [`Token` attributes](/api/token#attributes). The supported attributes for rule-based matching are: -| Attribute |  Description | -| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- | -| `ORTH` | The exact verbatim text of a token. ~~str~~ | -| `TEXT` 2.1 | The exact verbatim text of a token. ~~str~~ | -| `LOWER` | The lowercase form of the token text. ~~str~~ | -|  `LENGTH` | The length of the token text. ~~int~~ | -|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. 
~~bool~~ | -|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. ~~bool~~ | -|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ | -|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ | -|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. ~~str~~ | -| `ENT_TYPE` | The token's entity label. ~~str~~ | -| `_` 2.1 | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ | -| `OP` | [Operator or quantifier](#quantifiers) to determine how often to match a token pattern. ~~str~~ | +| Attribute |  Description | +| ----------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- | +| `ORTH` | The exact verbatim text of a token. ~~str~~ | +| `TEXT` 2.1 | The exact verbatim text of a token. ~~str~~ | +| `LOWER` | The lowercase form of the token text. ~~str~~ | +|  `LENGTH` | The length of the token text. ~~int~~ | +|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~ | +|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. ~~bool~~ | +|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ | +|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ | +|  `POS`, `TAG`, `MORPH`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, morphological analysis, dependency label, lemma, shape. ~~str~~ | +| `ENT_TYPE` | The token's entity label. ~~str~~ | +| `_` 2.1 | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). 
~~Dict[str, Any]~~ | +| `OP` | [Operator or quantifier](#quantifiers) to determine how often to match a token pattern. ~~str~~ | @@ -236,6 +236,8 @@ following rich comparison attributes are available: | -------------------------- | ------------------------------------------------------------------------------------------------------- | | `IN` | Attribute value is member of a list. ~~Any~~ | | `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ | +| `IS_SUBSET` | Attribute values (for `MORPH`) are a subset of a list. ~~Any~~ | +| `IS_SUPERSET` | Attribute values (for `MORPH`) are a superset of a list. ~~Any~~ | | `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ | #### Regular expressions {#regex new="2.1"} From 20b89a97176a5fc2d2c2c01e4f725f3a1d1e928b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 16:57:02 +0200 Subject: [PATCH 06/16] Increment version [ci skip] --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 56b05257a..ea9f9f33e 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a24" +__version__ = "3.0.0a25" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 0b52b6904c78cc9e12db962d89db1ab2db38d545 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 17:10:35 +0200 Subject: [PATCH 07/16] Update entity_linker.py --- spacy/pipeline/entity_linker.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index fec53c77a..039e2a891 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -48,8 +48,11 @@ DEFAULT_NEL_MODEL = 
Config().from_str(default_model_config)["model"] "incl_context": True, "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, }, - scores=["nel_micro_p", "nel_micro_r", "nel_micro_f"], - default_score_weights={"nel_micro_f": 1.0}, + default_score_weights={ + "nel_micro_f": 1.0, + "nel_micro_r": None, + "nel_micro_p": None, + }, ) def make_entity_linker( nlp: Language, @@ -428,7 +431,6 @@ class EntityLinker(Pipe): validate_examples(examples, "EntityLinker.score") return Scorer.score_links(examples, negative_labels=[self.NIL]) - def to_disk( self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() ) -> None: From 26e28ed4134734dbc86fedb97339eec47282025a Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 17:11:13 +0200 Subject: [PATCH 08/16] Fix combined scores if multiple components report it --- spacy/util.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/util.py b/spacy/util.py index ad3298651..378ec2823 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1237,7 +1237,9 @@ def combine_score_weights( weight = 0.0 else: weight = round(value / total / len(all_weights), 2) - result[key] = result.get(key, 0.0) + weight + prev_weight = result.get(key, 0.0) + prev_weight = 0.0 if prev_weight is None else prev_weight + result[key] = prev_weight + weight return result From 2abb4ba9db0d0ec074a7336be8a7395da78eaaa4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 24 Sep 2020 18:13:39 +0200 Subject: [PATCH 09/16] Make a pre-check to speed up alignment cache (#6139) * Dirty trick to fast-track alignment cache * Improve alignment cache check * Fix header * Fix align cache * Fix align logic --- spacy/training/example.pxd | 3 +++ spacy/training/example.pyx | 36 +++++++++++++++++++++++++++++------- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/spacy/training/example.pxd b/spacy/training/example.pxd index e06e36287..49e239757 100644 --- a/spacy/training/example.pxd +++ 
b/spacy/training/example.pxd @@ -1,4 +1,5 @@ from ..tokens.doc cimport Doc +from libc.stdint cimport uint64_t cdef class Example: @@ -7,3 +8,5 @@ cdef class Example: cdef readonly object _cached_alignment cdef readonly object _cached_words_x cdef readonly object _cached_words_y + cdef readonly uint64_t _x_sig + cdef readonly uint64_t _y_sig diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index 1e7bea5df..6a9815c44 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -1,6 +1,7 @@ from collections import Iterable as IterableInstance import warnings import numpy +from murmurhash.mrmr cimport hash64 from ..tokens.doc cimport Doc from ..tokens.span cimport Span @@ -97,15 +98,36 @@ cdef class Example: @property def alignment(self): - words_x = [token.text for token in self.x] - words_y = [token.text for token in self.y] - if self._cached_alignment is None or \ - words_x != self._cached_words_x or \ - words_y != self._cached_words_y: - self._cached_alignment = Alignment.from_strings(words_x, words_y) + x_sig = hash64(self.x.c, sizeof(self.x.c[0]) * self.x.length, 0) + y_sig = hash64(self.y.c, sizeof(self.y.c[0]) * self.y.length, 0) + if self._cached_alignment is None: + words_x = [token.text for token in self.x] + words_y = [token.text for token in self.y] + self._x_sig = x_sig + self._y_sig = y_sig self._cached_words_x = words_x self._cached_words_y = words_y - return self._cached_alignment + self._cached_alignment = Alignment.from_strings(words_x, words_y) + return self._cached_alignment + elif self._x_sig == x_sig and self._y_sig == y_sig: + # If we have a cached alignment, check whether the cache is invalid + # due to retokenization. To make this check fast in loops, we first + # check a hash of the TokenC arrays. 
+ return self._cached_alignment + else: + words_x = [token.text for token in self.x] + words_y = [token.text for token in self.y] + if words_x == self._cached_words_x and words_y == self._cached_words_y: + self._x_sig = x_sig + self._y_sig = y_sig + return self._cached_alignment + else: + self._cached_alignment = Alignment.from_strings(words_x, words_y) + self._cached_words_x = words_x + self._cached_words_y = words_y + self._x_sig = x_sig + self._y_sig = y_sig + return self._cached_alignment def get_aligned(self, field, as_string=False): """Return an aligned array for a token attribute.""" From 16475528f735114370d2db48b576106b1a6451e5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 24 Sep 2020 20:38:57 +0200 Subject: [PATCH 10/16] Fix skipped documents in entity scorer (#6137) * Fix skipped documents in entity scorer * Add back the skipping of unannotated entities * Update spacy/scorer.py * Use more specific NER scorer * Fix import * Fix get_ner_prf * Add scorer * Fix scorer Co-authored-by: Ines Montani --- spacy/pipeline/ner.pyx | 15 ++++++++-- spacy/scorer.py | 64 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 67 insertions(+), 12 deletions(-) diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx index c9b0a5031..fc0dda40d 100644 --- a/spacy/pipeline/ner.pyx +++ b/spacy/pipeline/ner.pyx @@ -6,7 +6,7 @@ from .transition_parser cimport Parser from ._parser_internals.ner cimport BiluoPushDown from ..language import Language -from ..scorer import Scorer +from ..scorer import get_ner_prf, PRFScore from ..training import validate_examples @@ -117,9 +117,18 @@ cdef class EntityRecognizer(Parser): """Score a batch of examples. examples (Iterable[Example]): The examples to score. - RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_spans. + RETURNS (Dict[str, Any]): The NER precision, recall and f-scores. 
DOCS: https://nightly.spacy.io/api/entityrecognizer#score """ validate_examples(examples, "EntityRecognizer.score") - return Scorer.score_spans(examples, "ents", **kwargs) + score_per_type = get_ner_prf(examples) + totals = PRFScore() + for prf in score_per_type.values(): + totals += prf + return { + "ents_p": totals.precision, + "ents_r": totals.recall, + "ents_f": totals.fscore, + "ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()}, + } diff --git a/spacy/scorer.py b/spacy/scorer.py index cd3b013cd..c1795847d 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -1,5 +1,6 @@ from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING import numpy as np +from collections import defaultdict from .training import Example from .tokens import Token, Doc, Span @@ -23,6 +24,19 @@ class PRFScore: self.fp = 0 self.fn = 0 + def __iadd__(self, other): + self.tp += other.tp + self.fp += other.fp + self.fn += other.fn + return self + + def __add__(self, other): + return PRFScore( + tp=self.tp+other.tp, + fp=self.fp+other.fp, + fn=self.fn+other.fn + ) + def score_set(self, cand: set, gold: set) -> None: self.tp += len(cand.intersection(gold)) self.fp += len(cand - gold) @@ -295,20 +309,19 @@ class Scorer: # Find all predidate labels, for all and per type gold_spans = set() pred_spans = set() - # Special case for ents: - # If we have missing values in the gold, we can't easily tell - # whether our NER predictions are true. - # It seems bad but it's what we've always done. 
- if attr == "ents" and not all(token.ent_iob != 0 for token in gold_doc): - continue for span in getter(gold_doc, attr): gold_span = (span.label_, span.start, span.end - 1) gold_spans.add(gold_span) gold_per_type[span.label_].add((span.label_, span.start, span.end - 1)) pred_per_type = {label: set() for label in labels} - for span in example.get_aligned_spans_x2y(getter(pred_doc, attr)): - pred_spans.add((span.label_, span.start, span.end - 1)) - pred_per_type[span.label_].add((span.label_, span.start, span.end - 1)) + align_x2y = example.alignment.x2y + for pred_span in getter(pred_doc, attr): + indices = align_x2y[pred_span.start : pred_span.end].dataXd.ravel() + if len(indices): + g_span = gold_doc[indices[0] : indices[-1]] + span = (pred_span.label_, indices[0], indices[-1]) + pred_spans.add(span) + pred_per_type[pred_span.label_].add(span) # Scores per label for k, v in score_per_type.items(): if k in pred_per_type: @@ -613,6 +626,39 @@ class Scorer: } +def get_ner_prf(examples: Iterable[Example]) -> Dict[str, PRFScore]: + """Compute per-entity PRFScore objects for a sequence of examples. The + results are returned as a dictionary keyed by the entity type. You can + add the PRFScore objects to get micro-averaged total. + """ + scores = defaultdict(PRFScore) + for eg in examples: + if not eg.y.has_annotation("ENT_IOB"): + continue + golds = {(e.label_, e.start, e.end) for e in eg.y.ents} + align_x2y = eg.alignment.x2y + preds = set() + for pred_ent in eg.x.ents: + if pred_ent.label_ not in scores: + scores[pred_ent.label_] = PRFScore() + indices = align_x2y[pred_ent.start : pred_ent.end].dataXd.ravel() + if len(indices): + g_span = eg.y[indices[0] : indices[-1] + 1] + # Check we aren't missing annotation on this span. If so, + # our prediction is neither right nor wrong, we just + # ignore it. 
+ if all(token.ent_iob != 0 for token in g_span): + key = (pred_ent.label_, indices[0], indices[-1] + 1) + if key in golds: + scores[pred_ent.label_].tp += 1 + golds.remove(key) + else: + scores[pred_ent.label_].fp += 1 + for label, start, end in golds: + scores[label].fn += 1 + return scores + + ############################################################################# # # The following implementation of roc_auc_score() is adapted from From 2aa4d65734dec26d09d3326bf0498a2dafd54817 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 20:41:09 +0200 Subject: [PATCH 11/16] Update docs [ci skip] --- website/docs/api/entityrecognizer.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md index 8af73f44b..6d710f425 100644 --- a/website/docs/api/entityrecognizer.md +++ b/website/docs/api/entityrecognizer.md @@ -242,10 +242,10 @@ Score a batch of examples. > scores = ner.score(examples) > ``` -| Name | Description | -| ----------- | ---------------------------------------------------------------------------------------------------------------------- | -| `examples` | The examples to score. ~~Iterable[Example]~~ | -| **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). ~~Dict[str, Union[float, Dict[str, float]]]~~ | +| Name | Description | +| ----------- | --------------------------------------------------------- | +| `examples` | The examples to score. ~~Iterable[Example]~~ | +| **RETURNS** | The scores. 
~~Dict[str, Union[float, Dict[str, float]]]~~ | ## EntityRecognizer.create_optimizer {#create_optimizer tag="method"} From 93d7ff309fba4faa805ca105b56a04daefa77f5c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 24 Sep 2020 21:05:27 +0200 Subject: [PATCH 12/16] Remove print --- spacy/training/example.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index 6a9815c44..f2c78203a 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -310,7 +310,6 @@ def _annot2array(vocab, tok_annot, doc_annot): def _add_entities_to_doc(doc, ner_data): - print(ner_data) if ner_data is None: return elif ner_data == []: From 50f20cf7224edefbfa789755a1415841e6cd647b Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 25 Sep 2020 08:21:30 +0200 Subject: [PATCH 13/16] Revert changes to Scorer.score_spans --- spacy/scorer.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/spacy/scorer.py b/spacy/scorer.py index c1795847d..b2f97e163 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -314,14 +314,9 @@ class Scorer: gold_spans.add(gold_span) gold_per_type[span.label_].add((span.label_, span.start, span.end - 1)) pred_per_type = {label: set() for label in labels} - align_x2y = example.alignment.x2y - for pred_span in getter(pred_doc, attr): - indices = align_x2y[pred_span.start : pred_span.end].dataXd.ravel() - if len(indices): - g_span = gold_doc[indices[0] : indices[-1]] - span = (pred_span.label_, indices[0], indices[-1]) - pred_spans.add(span) - pred_per_type[pred_span.label_].add(span) + for span in example.get_aligned_spans_x2y(getter(pred_doc, attr)): + pred_spans.add((span.label_, span.start, span.end - 1)) + pred_per_type[span.label_].add((span.label_, span.start, span.end - 1)) # Scores per label for k, v in score_per_type.items(): if k in pred_per_type: From c7956a40474892b8459e5241de965e46ca388980 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 25 Sep 
2020 09:25:46 +0200 Subject: [PATCH 14/16] Update models.js [ci skip] --- website/src/templates/models.js | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/website/src/templates/models.js b/website/src/templates/models.js index cdfe2e46d..f67188c0b 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -78,10 +78,15 @@ function isStableVersion(v) { return !v.includes('a') && !v.includes('b') && !v.includes('dev') && !v.includes('rc') } -function getLatestVersion(modelId, compatibility) { +function getLatestVersion(modelId, compatibility, prereleases) { for (let [version, models] of Object.entries(compatibility)) { if (isStableVersion(version) && models[modelId]) { - return models[modelId][0] + const modelVersions = models[modelId] + for (let modelVersion of modelVersions) { + if (isStableVersion(modelVersion) || prereleases) { + return modelVersion + } + } } } } @@ -147,12 +152,26 @@ const Help = ({ children }) => ( ) -const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExamples, licenses }) => { +const Model = ({ + name, + langId, + langName, + baseUrl, + repo, + compatibility, + hasExamples, + licenses, + prereleases, +}) => { const [initialized, setInitialized] = useState(false) const [isError, setIsError] = useState(true) const [meta, setMeta] = useState({}) const { type, genre, size } = getModelComponents(name) - const version = useMemo(() => getLatestVersion(name, compatibility), [name, compatibility]) + const version = useMemo(() => getLatestVersion(name, compatibility, prereleases), [ + name, + compatibility, + prereleases, + ]) useEffect(() => { window.dispatchEvent(new Event('resize')) // scroll position for progress @@ -332,7 +351,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl const Models = ({ pageContext, repo, children }) => { const [initialized, setInitialized] = useState(false) const [compatibility, 
setCompatibility] = useState({}) - const { id, title, meta, hasExamples } = pageContext + const { id, title, meta } = pageContext const { models, isStarters } = meta const baseUrl = `https://raw.githubusercontent.com/${repo}/master` @@ -381,6 +400,7 @@ const Models = ({ pageContext, repo, children }) => { repo={repo} licenses={arrayToObj(site.siteMetadata.licenses, 'id')} hasExamples={meta.hasExamples} + prereleases={site.siteMetadata.nightly} /> )) } @@ -397,6 +417,7 @@ const query = graphql` query ModelsQuery { site { siteMetadata { + nightly licenses { id url From 2cfe9340a1727acf9fcfd23a6ac0c0f2c0215010 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 25 Sep 2020 13:21:20 +0200 Subject: [PATCH 15/16] Link model components [ci skip] --- website/src/templates/models.js | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/website/src/templates/models.js b/website/src/templates/models.js index f67188c0b..8a73a6282 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -11,12 +11,23 @@ import { Table, Tr, Td, Th } from '../components/table' import Tag from '../components/tag' import { H2, Label } from '../components/typography' import Icon from '../components/icon' -import Link from '../components/link' +import Link, { OptionalLink } from '../components/link' import Infobox from '../components/infobox' import Accordion from '../components/accordion' import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util' import { isString, isEmptyObj } from '../components/util' +const COMPONENT_LINKS = { + tok2vec: '/api/tok2vec', + transformer: '/api/transformer', + tagger: '/api/tagger', + parser: '/api/dependencyparser', + ner: '/api/entityrecognizer', + lemmatizer: '/api/lemmatizer', + attribute_ruler: '/api/attributeruler', + senter: '/api/sentencerecognizer', +} + const MODEL_META = { core: 'Vocabulary, syntax, entities, vectors', core_sm: 'Vocabulary, syntax, entities', 
@@ -146,6 +157,18 @@ function formatSources(data = []) { )) } +function linkComponents(components = []) { + return join( + components.map(c => ( + + + {c} + + + )) + ) +} + const Help = ({ children }) => ( @@ -192,10 +215,8 @@ const Model = ({ const releaseTag = meta.fullName ? `/tag/${meta.fullName}` : '' const releaseUrl = `https://github.com/${repo}/releases/${releaseTag}` - const pipeline = - meta.pipeline && join(meta.pipeline.map(p => {p})) - const components = - meta.components && join(meta.components.map(p => {p})) + const pipeline = linkComponents(meta.pipeline) + const components = linkComponents(meta.components) const sources = formatSources(meta.sources) const author = !meta.url ? meta.author : {meta.author} const licenseUrl = licenses[meta.license] ? licenses[meta.license].url : null From 02a1b6ab839f4a07c3cb1fb727c847f58a1c44f9 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 25 Sep 2020 13:21:43 +0200 Subject: [PATCH 16/16] Update links [ci skip] --- website/src/templates/models.js | 1 + 1 file changed, 1 insertion(+) diff --git a/website/src/templates/models.js b/website/src/templates/models.js index 8a73a6282..f9895334d 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -26,6 +26,7 @@ const COMPONENT_LINKS = { lemmatizer: '/api/lemmatizer', attribute_ruler: '/api/attributeruler', senter: '/api/sentencerecognizer', + morphologizer: '/api/morphologizer', } const MODEL_META = {