From 8b650f3a786094833cccd8686ab4d6d73330565c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 17 Sep 2020 21:10:41 +0200 Subject: [PATCH 01/14] Modify setting missing and blocked entity tokens In order to make it easier to construct `Doc` objects as training data, modify how missing and blocked entity tokens are set to prioritize setting `O` and missing entity tokens for training purposes over setting blocked entity tokens. * `Doc.ents` setter sets tokens outside entity spans to `O` regardless of the current state of each token * For `Doc.ents`, setting a span with a missing label sets the `ent_iob` to missing instead of blocked * `Doc.block_ents(spans)` marks spans as hard `O` for use with the `EntityRecognizer` --- spacy/tests/doc/test_doc_api.py | 18 ++++++++++++++++-- spacy/tests/parser/test_ner.py | 4 ++-- spacy/tokens/doc.pyx | 25 +++++++++++++++++++------ spacy/training/example.pyx | 4 +--- spacy/training/iob_utils.py | 12 ++++-------- 5 files changed, 42 insertions(+), 21 deletions(-) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index ce979d3d1..53c309ba5 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -137,7 +137,7 @@ def test_doc_api_set_ents(en_tokenizer): assert len(tokens.ents) == 0 tokens.ents = [(tokens.vocab.strings["PRODUCT"], 2, 4)] assert len(list(tokens.ents)) == 1 - assert [t.ent_iob for t in tokens] == [0, 0, 3, 1, 0, 0, 0, 0] + assert [t.ent_iob for t in tokens] == [2, 2, 3, 1, 2, 2, 2, 2] assert tokens.ents[0].label_ == "PRODUCT" assert tokens.ents[0].start == 2 assert tokens.ents[0].end == 4 @@ -426,7 +426,7 @@ def test_has_annotation(en_vocab): doc[0].lemma_ = "a" doc[0].dep_ = "dep" doc[0].head = doc[1] - doc.ents = [Span(doc, 0, 1, label="HELLO")] + doc.ents = [Span(doc, 0, 1, label="HELLO"), Span(doc, 1, 2, label="")] for attr in attrs: assert doc.has_annotation(attr) @@ -454,3 +454,17 @@ def test_is_flags_deprecated(en_tokenizer): doc.is_nered with pytest.deprecated_call(): doc.is_sentenced + + +def test_block_ents(en_tokenizer): + doc = en_tokenizer("a b c d e") + doc.block_ents([doc[1:2], doc[3:5]]) + assert [t.ent_iob for t in doc] == [0, 3, 0, 3, 3] + assert [t.ent_type for t in doc] == [0, 0, 0, 0, 0] + assert doc.ents == tuple() + + # invalid IOB repaired + doc.ents = [Span(doc, 3, 5, "ENT")] + assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 1] + doc.block_ents([doc[3:4]]) + assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 3] diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 548cd2697..b8fdf15f9 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -168,7 +168,7 @@ def test_accept_blocked_token(): ner2 = nlp2.create_pipe("ner", config=config) # set "New York" to a blocked entity - doc2.ents = [(0, 3, 5)] + doc2.block_ents([doc2[3:5]]) assert [token.ent_iob_ for token in doc2] == ["", "", "", "B", "B"] assert [token.ent_type_ for token in doc2] == ["", "", "", "", ""] @@ -358,5 +358,5 @@ class BlockerComponent1: self.name = name def __call__(self, doc): - doc.ents = [(0, self.start, self.end)] + doc.block_ents([doc[self.start:self.end]]) return doc diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 5c5443258..1bae84508 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -590,17 +590,16 @@ cdef class Doc: entity_type = 0 kb_id = 0 - # Set ent_iob to Missing (0) by default unless this token was nered before - ent_iob = 0 - if self.c[i].ent_iob != 0: - ent_iob = 2 + # Set ent_iob to Outside (2) 
by default + ent_iob = 2 # overwrite if the token was part of a specified entity if i in tokens_in_ents.keys(): ent_start, ent_end, entity_type, kb_id = tokens_in_ents[i] if entity_type is None or entity_type <= 0: - # Blocking this token from being overwritten by downstream NER - ent_iob = 3 + # Empty label: Missing, unset this token + ent_iob = 0 + entity_type = 0 elif ent_start == i: # Marking the start of an entity ent_iob = 3 @@ -612,6 +611,20 @@ cdef class Doc: self.c[i].ent_kb_id = kb_id self.c[i].ent_iob = ent_iob + def block_ents(self, spans): + """Mark spans as never an entity for the EntityRecognizer. + + spans (List[Span]): The spans to block as never entities. + """ + for span in spans: + for i in range(span.start, span.end): + self.c[i].ent_iob = 3 + self.c[i].ent_type = 0 + # if the following token is I, set to B + if span.end < self.length: + if self.c[span.end].ent_iob == 1: + self.c[span.end].ent_iob = 3 + @property def noun_chunks(self): """Iterate over the base noun phrases in the document. Yields base diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index 3344704bf..d396a2040 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -172,7 +172,7 @@ cdef class Example: return output def get_aligned_ner(self): - if not self.y.is_nered: + if not self.y.has_annotation("ENT_IOB"): return [None] * len(self.x) # should this be 'missing' instead of 'None' ? x_ents = self.get_aligned_spans_y2x(self.y.ents) # Default to 'None' for missing values @@ -303,9 +303,7 @@ def _add_entities_to_doc(doc, ner_data): spans_from_biluo_tags(doc, ner_data) ) elif isinstance(ner_data[0], Span): - # Ugh, this is super messy. Really hard to set O entities doc.ents = ner_data - doc.ents = [span for span in ner_data if span.label_] else: raise ValueError(Errors.E973) diff --git a/spacy/training/iob_utils.py b/spacy/training/iob_utils.py index ceb5e16b8..33a4733ca 100644 --- a/spacy/training/iob_utils.py +++ b/spacy/training/iob_utils.py @@ -182,22 +182,18 @@ def tags_to_entities(tags): entities = [] start = None for i, tag in enumerate(tags): - if tag is None: - continue - if tag.startswith("O"): + if tag is None or tag.startswith("-"): # TODO: We shouldn't be getting these malformed inputs. Fix this. if start is not None: start = None else: entities.append(("", i, i)) - continue - elif tag == "-": - continue + elif tag.startswith("O"): + pass elif tag.startswith("I"): if start is None: raise ValueError(Errors.E067.format(start="I", tags=tags[: i + 1])) - continue - if tag.startswith("U"): + elif tag.startswith("U"): entities.append((tag[2:], i, i)) elif tag.startswith("B"): start = i From 177df15d89da7eccc1603c33b847a12c43a56e0c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 21 Sep 2020 15:54:05 +0200 Subject: [PATCH 02/14] Implement Doc.set_ents --- spacy/errors.py | 9 +++ spacy/tests/doc/test_doc_api.py | 63 +++++++++++++++-- spacy/tests/parser/test_ner.py | 4 +- spacy/tokens/doc.pyx | 122 +++++++++++++++++++++++++++++--- spacy/training/example.pyx | 10 ++- spacy/training/iob_utils.py | 5 +- 6 files changed, 192 insertions(+), 21 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 81e3616be..a21ff5476 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -682,6 +682,15 @@ class Errors: E1009 = ("String for hash '{val}' not found in StringStore. 
Set the value " "through token.morph_ instead or add the string to the " "StringStore with `nlp.vocab.strings.add(string)`.") + E1010 = ("Unable to set entity information for token {i} which is included " + "in more than one span in entities, blocked, missing or outside.") + E1011 = ("Unsupported default '{default}' in doc.set_ents. Available " + "options: {modes}") + E1012 = ("Spans provided to doc.set_ents must be provided as a list of " + "`Span` objects.") + E1013 = ("Unable to set entity for span with empty label. Entity spans are " + "required to have a label. To set entity information as missing " + "or blocked, use the keyword arguments with doc.set_ents.") @add_codes diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index a08efe9d7..7339a9aef 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -425,7 +425,7 @@ def test_has_annotation(en_vocab): doc[0].lemma_ = "a" doc[0].dep_ = "dep" doc[0].head = doc[1] - doc.ents = [Span(doc, 0, 1, label="HELLO"), Span(doc, 1, 2, label="")] + doc.set_ents([Span(doc, 0, 1, label="HELLO")], default="missing") for attr in attrs: assert doc.has_annotation(attr) @@ -455,15 +455,68 @@ def test_is_flags_deprecated(en_tokenizer): doc.is_sentenced -def test_block_ents(en_tokenizer): +def test_set_ents(en_tokenizer): + # set ents doc = en_tokenizer("a b c d e") - doc.block_ents([doc[1:2], doc[3:5]]) + doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)]) + assert [t.ent_iob for t in doc] == [3, 3, 1, 2, 2] + assert [t.ent_type for t in doc] == [10, 11, 11, 0, 0] + + # add ents, invalid IOB repaired + doc = en_tokenizer("a b c d e") + doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)]) + doc.set_ents([Span(doc, 0, 2, 12)], default="unmodified") + assert [t.ent_iob for t in doc] == [3, 1, 3, 2, 2] + assert [t.ent_type for t in doc] == [12, 12, 11, 0, 0] + + # missing ents + doc = en_tokenizer("a b c d e") + doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)], missing=[doc[4:5]]) + assert [t.ent_iob for t in doc] == [3, 3, 1, 2, 0] + assert [t.ent_type for t in doc] == [10, 11, 11, 0, 0] + + # outside ents + doc = en_tokenizer("a b c d e") + doc.set_ents( + [Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)], + outside=[doc[4:5]], + default="missing", + ) + assert [t.ent_iob for t in doc] == [3, 3, 1, 0, 2] + assert [t.ent_type for t in doc] == [10, 11, 11, 0, 0] + + # blocked ents + doc = en_tokenizer("a b c d e") + doc.set_ents([], blocked=[doc[1:2], doc[3:5]], default="unmodified") assert [t.ent_iob for t in doc] == [0, 3, 0, 3, 3] assert [t.ent_type for t in doc] == [0, 0, 0, 0, 0] assert doc.ents == tuple() - # invalid IOB repaired + # invalid IOB repaired after blocked doc.ents = [Span(doc, 3, 5, "ENT")] assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 1] - doc.block_ents([doc[3:4]]) + doc.set_ents([], blocked=[doc[3:4]], default="unmodified") assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 3] + + # all types + doc = en_tokenizer("a b c d e") + doc.set_ents( + [Span(doc, 0, 1, 10)], + blocked=[doc[1:2]], + missing=[doc[2:3]], + outside=[doc[3:4]], + default="unmodified", + ) + assert [t.ent_iob for t in doc] == [3, 3, 0, 2, 0] + assert [t.ent_type for t in doc] == [10, 0, 0, 0, 0] + + doc = en_tokenizer("a b c d e") + # single span instead of a list + with pytest.raises(ValueError): + doc.set_ents([], missing=doc[1:2]) + # invalid default mode + with pytest.raises(ValueError): + doc.set_ents([], missing=[doc[1:2]], default="none") + # conflicting/overlapping specifications + with 
pytest.raises(ValueError): + doc.set_ents([], missing=[doc[1:2]], outside=[doc[1:2]]) diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index b8fdf15f9..cd5581769 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -168,7 +168,7 @@ def test_accept_blocked_token(): ner2 = nlp2.create_pipe("ner", config=config) # set "New York" to a blocked entity - doc2.block_ents([doc2[3:5]]) + doc2.set_ents([], blocked=[doc2[3:5]], default="unmodified") assert [token.ent_iob_ for token in doc2] == ["", "", "", "B", "B"] assert [token.ent_type_ for token in doc2] == ["", "", "", "", ""] @@ -358,5 +358,5 @@ class BlockerComponent1: self.name = name def __call__(self, doc): - doc.block_ents([doc[self.start:self.end]]) + doc.set_ents([], blocked=[doc[self.start:self.end]], default="unmodified") return doc diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index cc621b443..be99bacf3 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -7,6 +7,7 @@ from libc.stdint cimport int32_t, uint64_t import copy from collections import Counter +from enum import Enum import numpy import srsly from thinc.api import get_array_module @@ -86,6 +87,17 @@ cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name) return get_token_attr(token, feat_name) +class SetEntsDefault(str, Enum): + blocked = "blocked" + missing = "missing" + outside = "outside" + unmodified = "unmodified" + + @classmethod + def values(cls): + return list(cls.__members__.keys()) + + cdef class Doc: """A sequence of Token objects. Access sentences and named entities, export annotations to numpy arrays, losslessly serialize to compressed binary @@ -597,9 +609,9 @@ cdef class Doc: if i in tokens_in_ents.keys(): ent_start, ent_end, entity_type, kb_id = tokens_in_ents[i] if entity_type is None or entity_type <= 0: - # Empty label: Missing, unset this token - ent_iob = 0 - entity_type = 0 + # Only allow labelled spans + print(i, ent_start, ent_end, entity_type) + raise ValueError(Errors.E1013) elif ent_start == i: # Marking the start of an entity ent_iob = 3 @@ -611,19 +623,107 @@ cdef class Doc: self.c[i].ent_kb_id = kb_id self.c[i].ent_iob = ent_iob - def block_ents(self, spans): - """Mark spans as never an entity for the EntityRecognizer. + def set_ents(self, entities, *, blocked=None, missing=None, outside=None, default=SetEntsDefault.outside): + """Set entity annotation. - spans (List[Span]): The spans to block as never entities. + entities (List[Span]): Spans with labels to set as entities. + blocked (Optional[List[Span]]): Spans to set as 'blocked' (never an + entity) for spacy's built-in NER component. Other components may + ignore this setting. + missing (Optional[List[Span]]): Spans with missing/unknown entity + information. + outside (Optional[List[Span]]): Spans outside of entities (O in IOB). + default (str): How to set entity annotation for tokens outside of any + provided spans. Options: "blocked", "missing", "outside" and + "unmodified" (preserve current state). Defaults to "outside". 
""" - for span in spans: + if default not in SetEntsDefault.values(): + raise ValueError(Errors.E1011.format(default=default, modes=", ".join(SetEntsDefault))) + + if blocked is None: + blocked = tuple() + if missing is None: + missing = tuple() + if outside is None: + outside = tuple() + + # Find all tokens covered by spans and check that none are overlapping + seen_tokens = set() + for span in entities: + if not isinstance(span, Span): + raise ValueError(Errors.E1012.format(span=span)) + for i in range(span.start, span.end): + if i in seen_tokens: + raise ValueError(Errors.E1010.format(i=i)) + seen_tokens.add(i) + for span in blocked: + if not isinstance(span, Span): + raise ValueError(Errors.E1012.format(span=span)) + for i in range(span.start, span.end): + if i in seen_tokens: + raise ValueError(Errors.E1010.format(i=i)) + seen_tokens.add(i) + for span in missing: + if not isinstance(span, Span): + raise ValueError(Errors.E1012.format(span=span)) + for i in range(span.start, span.end): + if i in seen_tokens: + raise ValueError(Errors.E1010.format(i=i)) + seen_tokens.add(i) + for span in outside: + if not isinstance(span, Span): + raise ValueError(Errors.E1012.format(span=span)) + for i in range(span.start, span.end): + if i in seen_tokens: + raise ValueError(Errors.E1010.format(i=i)) + seen_tokens.add(i) + + # Set all specified entity information + for span in entities: + for i in range(span.start, span.end): + if not span.label: + raise ValueError(Errors.E1013) + if i == span.start: + self.c[i].ent_iob = 3 + else: + self.c[i].ent_iob = 1 + self.c[i].ent_type = span.label + for span in blocked: for i in range(span.start, span.end): self.c[i].ent_iob = 3 self.c[i].ent_type = 0 - # if the following token is I, set to B - if span.end < self.length: - if self.c[span.end].ent_iob == 1: - self.c[span.end].ent_iob = 3 + for span in missing: + for i in range(span.start, span.end): + self.c[i].ent_iob = 0 + self.c[i].ent_type = 0 + for span in outside: + for i in range(span.start, span.end): + self.c[i].ent_iob = 2 + self.c[i].ent_type = 0 + + # Set tokens outside of all provided spans + if default != SetEntsDefault.unmodified: + for i in range(self.length): + if i not in seen_tokens: + self.c[i].ent_type = 0 + if default == SetEntsDefault.outside: + self.c[i].ent_iob = 2 + elif default == SetEntsDefault.missing: + self.c[i].ent_iob = 0 + elif default == SetEntsDefault.blocked: + self.c[i].ent_iob = 3 + + # Fix any resulting inconsistent annotation + for i in range(self.length - 1): + # I must follow B or I: convert I to B + if (self.c[i].ent_iob == 0 or self.c[i].ent_iob == 2) and \ + self.c[i+1].ent_iob == 1: + self.c[i+1].ent_iob = 3 + # Change of type with BI or II: convert second I to B + if self.c[i].ent_type != self.c[i+1].ent_type and \ + (self.c[i].ent_iob == 3 or self.c[i].ent_iob == 1) and \ + self.c[i+1].ent_iob == 1: + self.c[i+1].ent_iob = 3 @property def noun_chunks(self): diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index d396a2040..82d8b6fce 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -288,6 +288,7 @@ def _annot2array(vocab, tok_annot, doc_annot): def _add_entities_to_doc(doc, ner_data): + print(ner_data) if ner_data is None: return elif ner_data == []: @@ -303,7 +304,14 @@ def _add_entities_to_doc(doc, ner_data): spans_from_biluo_tags(doc, ner_data) ) elif isinstance(ner_data[0], Span): - doc.ents = ner_data + entities = [] + missing = [] + for span in ner_data: + if span.label: + entities.append(span) + else: + 
missing.append(span) + doc.set_ents(entities, missing=missing) else: raise ValueError(Errors.E973) diff --git a/spacy/training/iob_utils.py b/spacy/training/iob_utils.py index 33a4733ca..b435c8ecb 100644 --- a/spacy/training/iob_utils.py +++ b/spacy/training/iob_utils.py @@ -149,9 +149,10 @@ def spans_from_biluo_tags(doc, tags): doc (Doc): The document that the BILUO tags refer to. entities (iterable): A sequence of BILUO tags with each tag describing one - token. Each tags string will be of the form of either "", "O" or + token. Each tag string will be of the form of either "", "O" or "{action}-{label}", where action is one of "B", "I", "L", "U". - RETURNS (list): A sequence of Span objects. + RETURNS (list): A sequence of Span objects. Each token with a missing IOB + tag is returned as a Span with an empty label. """ token_offsets = tags_to_entities(tags) spans = [] From b1a7d6c528e08c4a80594ae6338cacb22bf8b5b1 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 22 Sep 2020 14:42:51 +0200 Subject: [PATCH 03/14] Refactor seen token detection --- spacy/errors.py | 4 ++-- spacy/tokens/doc.pyx | 24 ++---------------------- 2 files changed, 4 insertions(+), 24 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index de4ffde3c..27091810d 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -690,8 +690,8 @@ class Errors: "in more than one span in entities, blocked, missing or outside.") E1011 = ("Unsupported default '{default}' in doc.set_ents. Available " "options: {modes}") - E1012 = ("Spans provided to doc.set_ents must be provided as a list of " - "`Span` objects.") + E1012 = ("Entity spans and blocked/missing/outside spans should be " + "provided to doc.set_ents as lists of `Span` objects.") E1013 = ("Unable to set entity for span with empty label. Entity spans are " "required to have a label. 
To set entity information as missing " "or blocked, use the keyword arguments with doc.set_ents.") diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 34742e587..4bf6f0e5e 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -8,6 +8,7 @@ from libc.stdint cimport int32_t, uint64_t import copy from collections import Counter from enum import Enum +import itertools import numpy import srsly from thinc.api import get_array_module @@ -742,28 +743,7 @@ cdef class Doc: # Find all tokens covered by spans and check that none are overlapping seen_tokens = set() - for span in entities: - if not isinstance(span, Span): - raise ValueError(Errors.E1012.format(span=span)) - for i in range(span.start, span.end): - if i in seen_tokens: - raise ValueError(Errors.E1010.format(i=i)) - seen_tokens.add(i) - for span in blocked: - if not isinstance(span, Span): - raise ValueError(Errors.E1012.format(span=span)) - for i in range(span.start, span.end): - if i in seen_tokens: - raise ValueError(Errors.E1010.format(i=i)) - seen_tokens.add(i) - for span in missing: - if not isinstance(span, Span): - raise ValueError(Errors.E1012.format(span=span)) - for i in range(span.start, span.end): - if i in seen_tokens: - raise ValueError(Errors.E1010.format(i=i)) - seen_tokens.add(i) - for span in outside: + for span in itertools.chain.from_iterable([entities, blocked, missing, outside]): if not isinstance(span, Span): raise ValueError(Errors.E1012.format(span=span)) for i in range(span.start, span.end): From 8eaacaae97f0caf77576e843a8d6bcf866c79236 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 24 Sep 2020 12:36:51 +0200 Subject: [PATCH 04/14] Refactor Doc.ents setter to use Doc.set_ents Additional changes: * Entity spans with missing labels are ignored * Fix ent_kb_id setting in `Doc.set_ents` --- spacy/tests/doc/test_add_entities.py | 4 +-- spacy/tests/doc/test_doc_api.py | 2 +- spacy/tokens/doc.pyx | 50 ++++++---------------------- 3 files changed, 14 insertions(+), 42 deletions(-) diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py index 40aff8e31..615ab9e5b 100644 --- a/spacy/tests/doc/test_add_entities.py +++ b/spacy/tests/doc/test_add_entities.py @@ -29,10 +29,10 @@ def test_doc_add_entities_set_ents_iob(en_vocab): ner.begin_training(lambda: [_ner_example(ner)]) ner(doc) - doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)] + doc.ents = [("ANIMAL", 3, 4)] assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"] - doc.ents = [(doc.vocab.strings["WORD"], 0, 2)] + doc.ents = [("WORD", 0, 2)] assert [w.ent_iob_ for w in doc] == ["B", "I", "O", "O"] diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 892b65cf4..e5e72fe2a 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -534,4 +534,4 @@ def test_doc_ents_setter(): vocab = Vocab() ents = [("HELLO", 0, 2), (vocab.strings.add("WORLD"), 3, 5)] doc = Doc(vocab, words=words, ents=ents) - assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"] \ No newline at end of file + assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"] diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 4bf6f0e5e..670c7440f 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -673,49 +673,16 @@ cdef class Doc: # TODO: # 1. Test basic data-driven ORTH gazetteer # 2. 
Test more nuanced date and currency regex - tokens_in_ents = {} - cdef attr_t entity_type - cdef attr_t kb_id - cdef int ent_start, ent_end, token_index + cdef attr_t entity_type, kb_id + cdef int ent_start, ent_end + ent_spans = [] for ent_info in ents: entity_type_, kb_id, ent_start, ent_end = get_entity_info(ent_info) if isinstance(entity_type_, str): self.vocab.strings.add(entity_type_) - entity_type = self.vocab.strings.as_int(entity_type_) - for token_index in range(ent_start, ent_end): - if token_index in tokens_in_ents: - raise ValueError(Errors.E103.format( - span1=(tokens_in_ents[token_index][0], - tokens_in_ents[token_index][1], - self.vocab.strings[tokens_in_ents[token_index][2]]), - span2=(ent_start, ent_end, self.vocab.strings[entity_type]))) - tokens_in_ents[token_index] = (ent_start, ent_end, entity_type, kb_id) - cdef int i - for i in range(self.length): - # default values - entity_type = 0 - kb_id = 0 - - # Set ent_iob to Outside (2) by default - ent_iob = 2 - - # overwrite if the token was part of a specified entity - if i in tokens_in_ents.keys(): - ent_start, ent_end, entity_type, kb_id = tokens_in_ents[i] - if entity_type is None or entity_type <= 0: - # Only allow labelled spans - print(i, ent_start, ent_end, entity_type) - raise ValueError(Errors.E1013) - elif ent_start == i: - # Marking the start of an entity - ent_iob = 3 - else: - # Marking the inside of an entity - ent_iob = 1 - - self.c[i].ent_type = entity_type - self.c[i].ent_kb_id = kb_id - self.c[i].ent_iob = ent_iob + span = Span(self, ent_start, ent_end, label=entity_type_, kb_id=kb_id) + ent_spans.append(span) + self.set_ents(ent_spans, default=SetEntsDefault.outside) def set_ents(self, entities, *, blocked=None, missing=None, outside=None, default=SetEntsDefault.outside): """Set entity annotation. 
@@ -734,6 +701,9 @@ cdef class Doc: if default not in SetEntsDefault.values(): raise ValueError(Errors.E1011.format(default=default, modes=", ".join(SetEntsDefault))) + # Ignore spans with missing labels + entities = [ent for ent in entities if ent.label > 0] + if blocked is None: blocked = tuple() if missing is None: @@ -742,6 +712,7 @@ cdef class Doc: outside = tuple() # Find all tokens covered by spans and check that none are overlapping + cdef int i seen_tokens = set() for span in itertools.chain.from_iterable([entities, blocked, missing, outside]): if not isinstance(span, Span): @@ -761,6 +732,7 @@ cdef class Doc: else: self.c[i].ent_iob = 1 self.c[i].ent_type = span.label + self.c[i].ent_kb_id = span.kb_id for span in blocked: for i in range(span.start, span.end): self.c[i].ent_iob = 3 From be56c0994b09a8ba5042eb563d05ea5bb7f75a6d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 12:40:25 +0200 Subject: [PATCH 05/14] Add [training.before_to_disk] callback --- spacy/cli/train.py | 18 ++++++++++++++++++ spacy/default_config.cfg | 2 ++ spacy/errors.py | 3 +++ spacy/schemas.py | 1 + 4 files changed, 24 insertions(+) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index eabc82be0..6d61c2425 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -97,6 +97,7 @@ def train( dev_corpus = dot_to_object(config, T_cfg["dev_corpus"]) batcher = T_cfg["batcher"] train_logger = T_cfg["logger"] + before_to_disk = create_before_to_disk_callback(T_cfg["before_to_disk"]) # Components that shouldn't be updated during training frozen_components = T_cfg["frozen_components"] # Sourced components that require resume_training @@ -167,6 +168,7 @@ def train( with nlp.select_pipes(disable=frozen_components): update_meta(T_cfg, nlp, info) with nlp.use_params(optimizer.averages): + nlp = before_to_disk(nlp) nlp.to_disk(output_path / "model-best") progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False) progress.set_description(f"Epoch {info['epoch']}") @@ -179,6 +181,7 @@ def train( f"Aborting and saving the final best model. " f"Encountered exception: {str(e)}" ) + nlp = before_to_disk(nlp) nlp.to_disk(output_path / "model-final") raise e finally: @@ -233,6 +236,21 @@ def create_evaluation_callback( return evaluate +def create_before_to_disk_callback( + callback: Optional[Callable[[Language], Language]] +) -> Callable[[Language], Language]: + def before_to_disk(nlp: Language) -> Language: + if not callback: + return nlp + modified_nlp = callback(nlp) + if not isinstance(modified_nlp, Language): + err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp)) + raise ValueError(err) + return modified_nlp + + return before_to_disk + + def train_while_improving( nlp: Language, optimizer: Optimizer, diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 5cd97a0eb..6f8c0aa00 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -72,6 +72,8 @@ frozen_components = [] dev_corpus = "corpora.dev" # Location in the config where the train corpus is defined train_corpus = "corpora.train" +# Optional callback before nlp object is saved to disk after training +before_to_disk = null [training.logger] @loggers = "spacy.ConsoleLogger.v1" diff --git a/spacy/errors.py b/spacy/errors.py index dce5cf51c..d67f01a1d 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -480,6 +480,9 @@ class Errors: E201 = ("Span index out of range.") # TODO: fix numbering after merging develop into master + E914 = ("Executing {name} callback failed. 
Expected the function to " + "returnthe nlp object but got: {value}. Maybe you forgot to return " + "the modified object in your function?") E915 = ("Can't use score '{name}' to calculate final weighted score. Expected " "float or int but got: {score_type}. To exclude the score from the " "final score, set its weight to null in the [training.score_weights] " diff --git a/spacy/schemas.py b/spacy/schemas.py index e34841008..6a9a82d06 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -217,6 +217,7 @@ class ConfigSchemaTraining(BaseModel): optimizer: Optimizer = Field(..., title="The optimizer to use") logger: Logger = Field(..., title="The logger to track training progress") frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training") + before_to_disk: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after training, before it's saved to disk") # fmt: on class Config: From 138c8d45dbd1372fafe6b280fdedf33790d20d32 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 12:43:39 +0200 Subject: [PATCH 06/14] Update docs --- website/docs/api/data-formats.md | 45 ++++++++++++++++---------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index e3b3900be..6f156fe37 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -180,26 +180,27 @@ single corpus once and then divide it up into `train` and `dev` partitions. This section defines settings and controls for the training and evaluation process that are used when you run [`spacy train`](/api/cli#train). -| Name | Description | -| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | -| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | -| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ | -| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | -| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | -| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | -| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ | -| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | -| `lookups` | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~ | -| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | -| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ | -| `optimizer` | The optimizer. 
The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | -| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ | -| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ | -| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | -| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | -| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ | -| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ | +| Name | Description | +| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | +| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | +| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | +| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ | +| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | +| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | +| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | +| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ | +| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | +| `lookups` | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~ | +| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | +| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ | +| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | +| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ | +| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. 
Defaults to variable `${paths.raw}`. ~~Optional[str]~~ | +| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | +| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | +| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ | +| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ | ### pretraining {#config-pretraining tag="section,optional"} @@ -275,8 +276,8 @@ $ python -m spacy convert ./data.json ./output.spacy > entity label, prefixed by the BILUO marker. For example `"B-ORG"` describes > the first token of a multi-token `ORG` entity and `"U-PERSON"` a single token > representing a `PERSON` entity. The -> [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) function -> can help you convert entity offsets to the right format. +> [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) function can +> help you convert entity offsets to the right format. ```python ### Example structure From 1c63f02f99d6c3d663c4a9cfb0e3395986bd7598 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 24 Sep 2020 12:51:16 +0200 Subject: [PATCH 07/14] Add API docs --- website/docs/api/doc.md | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 7175f6e7f..e10d9d077 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -219,6 +219,30 @@ alignment mode `"strict". | `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | | **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | +## Doc.set_ents {#ents tag="method" new="3"} + +Set the named entities in the document. + +> #### Example +> +> ```python +> from spacy.tokens import Span +> doc = nlp("Mr. Best flew to New York on Saturday morning.") +> doc.set_ents([Span(doc, 0, 2, "PERSON")]) +> ents = list(doc.ents) +> assert ents[0].label_ == "PERSON" +> assert ents[0].text == "Mr. Best" +> ``` + +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| entities | Spans with labels to set as entities. ~~List[Span]~~ | +| _keyword-only_ | | +| blocked | Spans to set as "blocked" (never an entity) for spacy's built-in NER component. Other components may ignore this setting. ~~Optional[List[Span]]~~ | +| missing | Spans with missing/unknown entity information. ~~Optional[List[Span]]~~ | +| outside | Spans outside of entities (O in IOB). ~~Optional[List[Span]]~~ | +| default | How to set entity annotation for tokens outside of any provided spans. Options: "blocked", "missing", "outside" and "unmodified" (preserve current state). Defaults to "outside". ~~str~~ | + ## Doc.similarity {#similarity tag="method" model="vectors"} Make a semantic similarity estimate. The default estimate is cosine similarity @@ -542,7 +566,6 @@ objects, if the entity recognizer has been applied. 
> ```python > doc = nlp("Mr. Best flew to New York on Saturday morning.") > ents = list(doc.ents) -> assert ents[0].label == 346 > assert ents[0].label_ == "PERSON" > assert ents[0].text == "Mr. Best" > ``` From 5c13e0cf1bdf536c54660340e71742bf0493ea07 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 24 Sep 2020 13:41:55 +0200 Subject: [PATCH 08/14] Remove unused error --- spacy/errors.py | 3 --- spacy/tokens/doc.pyx | 2 -- 2 files changed, 5 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 27091810d..998e57f27 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -692,9 +692,6 @@ class Errors: "options: {modes}") E1012 = ("Entity spans and blocked/missing/outside spans should be " "provided to doc.set_ents as lists of `Span` objects.") - E1013 = ("Unable to set entity for span with empty label. Entity spans are " - "required to have a label. To set entity information as missing " - "or blocked, use the keyword arguments with doc.set_ents.") @add_codes diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 670c7440f..b4027f87e 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -725,8 +725,6 @@ cdef class Doc: # Set all specified entity information for span in entities: for i in range(span.start, span.end): - if not span.label: - raise ValueError(Errors.E1013) if i == span.start: self.c[i].ent_iob = 3 else: From 92f8b6959a359ff4495205df42f9e86c30aeb8f6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 13:48:41 +0200 Subject: [PATCH 09/14] Fix typo --- spacy/errors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/errors.py b/spacy/errors.py index d67f01a1d..708b7fda8 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -481,7 +481,7 @@ class Errors: # TODO: fix numbering after merging develop into master E914 = ("Executing {name} callback failed. Expected the function to " - "returnthe nlp object but got: {value}. Maybe you forgot to return " + "return the nlp object but got: {value}. Maybe you forgot to return " "the modified object in your function?") E915 = ("Can't use score '{name}' to calculate final weighted score. Expected " "float or int but got: {score_type}. 
To exclude the score from the " From 88e54caa1275481a43b1069c8ec6d352f554e333 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 14:32:35 +0200 Subject: [PATCH 10/14] accuracy -> performance --- spacy/cli/info.py | 4 +++- spacy/schemas.py | 3 +-- website/docs/api/data-formats.md | 2 +- website/src/templates/models.js | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 2b87163c2..2f2515278 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -91,7 +91,9 @@ def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]: meta["source"] = str(model_path.resolve()) else: meta["source"] = str(model_path) - return {k: v for k, v in meta.items() if k not in ("accuracy", "speed")} + return { + k: v for k, v in meta.items() if k not in ("accuracy", "performance", "speed") + } def get_markdown(data: Dict[str, Any], title: Optional[str] = None) -> str: diff --git a/spacy/schemas.py b/spacy/schemas.py index e34841008..1ff73bccc 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -182,8 +182,7 @@ class ModelMetaSchema(BaseModel): sources: Optional[Union[List[StrictStr], List[Dict[str, str]]]] = Field(None, title="Training data sources") vectors: Dict[str, Any] = Field({}, title="Included word vectors") labels: Dict[str, List[str]] = Field({}, title="Component labels, keyed by component name") - accuracy: Dict[str, Union[float, Dict[str, float]]] = Field({}, title="Accuracy numbers") - speed: Dict[str, Union[float, int]] = Field({}, title="Speed evaluation numbers") + performance: Dict[str, Union[float, Dict[str, float]]] = Field({}, title="Accuracy and speed numbers") spacy_git_version: StrictStr = Field("", title="Commit of spaCy version used") # fmt: on diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index e3b3900be..34565f160 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -518,7 +518,7 @@ source of truth** used for loading a pipeline. > "ner": ["PERSON", "ORG", "PRODUCT"], > "textcat": ["POSITIVE", "NEGATIVE"] > }, -> "accuracy": { +> "performance": { > "ents_f": 82.7300930714, > "ents_p": 82.135523614, > "ents_r": 83.3333333333, diff --git a/website/src/templates/models.js b/website/src/templates/models.js index 5d705048b..413f23dc5 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -117,7 +117,7 @@ function formatModelMeta(data) { license: data.license, labels: isEmptyObj(data.labels) ? null : data.labels, vectors: formatVectors(data.vectors), - accuracy: formatAccuracy(data.accuracy), + accuracy: formatAccuracy(data.performance), } } From 3b58a8be2b32b29a4a121bf0ed75ae3cd2920ee9 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 14:32:42 +0200 Subject: [PATCH 11/14] Update docs --- website/docs/api/data-formats.md | 4 ++-- website/docs/usage/_benchmarks-models.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 34565f160..0fc3481a4 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -275,8 +275,8 @@ $ python -m spacy convert ./data.json ./output.spacy > entity label, prefixed by the BILUO marker. For example `"B-ORG"` describes > the first token of a multi-token `ORG` entity and `"U-PERSON"` a single token > representing a `PERSON` entity. 
The
-> [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) function
-> can help you convert entity offsets to the right format.
+> [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) function can
+> help you convert entity offsets to the right format.
 
 ```python
 ### Example structure
diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md
index 5b193d3a4..88e79112f 100644
--- a/website/docs/usage/_benchmarks-models.md
+++ b/website/docs/usage/_benchmarks-models.md
@@ -7,7 +7,7 @@ import { Help } from 'components/typography'; import Link from 'components/link'
 
 | Pipeline                                                    | Parser | Tagger |  NER | WPS<br />CPU <Help>words per second on CPU, higher is better</Help> | WPS<br />GPU <Help>words per second on GPU, higher is better</Help> |
 | ----------------------------------------------------------- | -----: | -----: | ---: | ------------------------------------------------------------------: | -----------------------------------------------------------------: |
 | [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3) |        |        |      |                                                                      |                                                                  6k |
-| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3)   |        |        |      |                                                                      |                                                                     |
+| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3)   |   92.1 |   97.4 | 87.0 |                                                                   7k |                                                                     |
 | `en_core_web_lg` (spaCy v2)                                 |   91.9 |   97.2 | 85.9 |                                                                  10k |                                                                     |

From 24e7ac3f2bbdab6a1e124c2770c7545cd08906c8 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 14:43:56 +0200 Subject: [PATCH 12/14] Fix download CLI [ci skip] --- spacy/cli/download.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 036aeab17..0e7ec2ea5 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -88,7 +88,6 @@ def get_compatibility() -> dict: def get_version(model: str, comp: dict) -> str: - model = get_base_version(model) if model not in comp: msg.fail( f"No compatible package found for '{model}' (spaCy v{about.__version__})", From 3f751e68f596d1c186e0baa125a6cba1ff6a7995 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 14:45:41 +0200 Subject: [PATCH 13/14] Increment version [ci skip] --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 8d019897b..56b05257a 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a23" +__version__ = "3.0.0a24" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 6bc5058d137daa28184c0494f9380b7832770c59 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 14:53:34 +0200 Subject: [PATCH 14/14] Update models directory [ci skip] --- website/src/templates/models.js | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/website/src/templates/models.js b/website/src/templates/models.js index 413f23dc5..cdfe2e46d 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -32,11 +32,17 @@ const MODEL_META = { las: 'Labelled dependencies', token_acc: 'Tokenization', tok: 'Tokenization', + lemma: 'Statistical lemmatization', + morph: 'Morphological analysis', tags_acc: 'Part-of-speech tags (fine grained tags, Token.tag)', tag: 'Part-of-speech tags (fine grained tags, Token.tag)', + pos: 'Part-of-speech tags (coarse grained tags, Token.pos)', ents_f: 'Named entities (F-score)', ents_p: 'Named entities (precision)', ents_r: 'Named entities (recall)', + ner_f: 'Named entities (F-score)', + ner_p: 'Named entities (precision)', + ner_r: 'Named entities (recall)', sent_f: 'Sentence segmentation (F-score)', sent_p: 'Sentence segmentation (precision)', sent_r: 'Sentence segmentation (recall)', @@ -88,11 +94,12 @@ function formatVectors(data) { } function formatAccuracy(data) { + const exclude = ['speed'] if (!data) return [] return Object.keys(data) .map(label => { const value = data[label] - return isNaN(value) + return isNaN(value) || exclude.includes(label) ? null : { label, @@ -109,6 +116,7 @@ function formatModelMeta(data) { version: data.version, sizeFull: data.size, pipeline: data.pipeline, + components: data.components, notes: data.notes, description: data.description, sources: data.sources, @@ -117,7 +125,8 @@ function formatModelMeta(data) { license: data.license, labels: isEmptyObj(data.labels) ? null : data.labels, vectors: formatVectors(data.vectors), - accuracy: formatAccuracy(data.performance), + // TODO: remove accuracy fallback + accuracy: formatAccuracy(data.accuracy || data.performance), } }
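
For anyone trying the series out, below is a minimal usage sketch of the `Doc.set_ents` API implemented in patches 02–04, run against the spacy-nightly build this series produces (v3.0.0a24). The blank pipeline and the exact IOB assertions are assumptions derived from the semantics and tests above, not part of the patches themselves.

```python
import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Mr. Best flew to New York on Saturday morning.")

# Entity spans passed to set_ents must carry a label; tokens outside
# the provided spans default to outside ("O").
doc.set_ents([Span(doc, 0, 2, "PERSON")])
assert doc.ents[0].text == "Mr. Best"
assert [t.ent_iob_ for t in doc[:3]] == ["B", "I", "O"]

# Block "New York" for the statistical NER component without touching
# the PERSON annotation set above.
doc.set_ents([], blocked=[doc[4:6]], default="unmodified")
assert [t.ent_iob_ for t in doc[4:6]] == ["B", "B"]
assert [t.ent_type_ for t in doc[4:6]] == ["", ""]

# Mark everything that is neither an entity nor blocked as missing
# (unknown) rather than "O".
doc.set_ents([Span(doc, 0, 2, "PERSON")], blocked=[doc[4:6]], default="missing")
assert doc[2].ent_iob_ == ""  # missing, not "O"
```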
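
Likewise, a sketch of wiring up the `[training.before_to_disk]` callback added in patch 05. The registry entry name `strip_description.v1` and the meta tweak are hypothetical; the only contract the patch enforces is that the callback receives the `nlp` object and returns an `nlp` object.

```python
import spacy
from spacy.language import Language

# In config.cfg (illustrative):
#
#   [training.before_to_disk]
#   @misc = "strip_description.v1"

@spacy.registry.misc("strip_description.v1")
def make_before_to_disk():
    def before_to_disk(nlp: Language) -> Language:
        # Reset a throwaway meta field right before "model-best" or
        # "model-final" is saved; returning a non-Language value
        # triggers the new E914 error.
        nlp.meta["description"] = ""
        return nlp
    return before_to_disk
```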