From 8b650f3a786094833cccd8686ab4d6d73330565c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 17 Sep 2020 21:10:41 +0200 Subject: [PATCH 01/15] Modify setting missing and blocked entity tokens In order to make it easier to construct `Doc` objects as training data, modify how missing and blocked entity tokens are set to prioritize setting `O` and missing entity tokens for training purposes over setting blocked entity tokens. * `Doc.ents` setter sets tokens outside entity spans to `O` regardless of the current state of each token * For `Doc.ents`, setting a span with a missing label sets the `ent_iob` to missing instead of blocked * `Doc.block_ents(spans)` marks spans as hard `O` for use with the `EntityRecognizer` --- spacy/tests/doc/test_doc_api.py | 18 ++++++++++++++++-- spacy/tests/parser/test_ner.py | 4 ++-- spacy/tokens/doc.pyx | 25 +++++++++++++++++++------ spacy/training/example.pyx | 4 +--- spacy/training/iob_utils.py | 12 ++++-------- 5 files changed, 42 insertions(+), 21 deletions(-) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index ce979d3d1..53c309ba5 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -137,7 +137,7 @@ def test_doc_api_set_ents(en_tokenizer): assert len(tokens.ents) == 0 tokens.ents = [(tokens.vocab.strings["PRODUCT"], 2, 4)] assert len(list(tokens.ents)) == 1 - assert [t.ent_iob for t in tokens] == [0, 0, 3, 1, 0, 0, 0, 0] + assert [t.ent_iob for t in tokens] == [2, 2, 3, 1, 2, 2, 2, 2] assert tokens.ents[0].label_ == "PRODUCT" assert tokens.ents[0].start == 2 assert tokens.ents[0].end == 4 @@ -426,7 +426,7 @@ def test_has_annotation(en_vocab): doc[0].lemma_ = "a" doc[0].dep_ = "dep" doc[0].head = doc[1] - doc.ents = [Span(doc, 0, 1, label="HELLO")] + doc.ents = [Span(doc, 0, 1, label="HELLO"), Span(doc, 1, 2, label="")] for attr in attrs: assert doc.has_annotation(attr) @@ -454,3 +454,17 @@ def test_is_flags_deprecated(en_tokenizer): doc.is_nered with pytest.deprecated_call(): doc.is_sentenced + + +def test_block_ents(en_tokenizer): + doc = en_tokenizer("a b c d e") + doc.block_ents([doc[1:2], doc[3:5]]) + assert [t.ent_iob for t in doc] == [0, 3, 0, 3, 3] + assert [t.ent_type for t in doc] == [0, 0, 0, 0, 0] + assert doc.ents == tuple() + + # invalid IOB repaired + doc.ents = [Span(doc, 3, 5, "ENT")] + assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 1] + doc.block_ents([doc[3:4]]) + assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 3] diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 548cd2697..b8fdf15f9 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -168,7 +168,7 @@ def test_accept_blocked_token(): ner2 = nlp2.create_pipe("ner", config=config) # set "New York" to a blocked entity - doc2.ents = [(0, 3, 5)] + doc2.block_ents([doc2[3:5]]) assert [token.ent_iob_ for token in doc2] == ["", "", "", "B", "B"] assert [token.ent_type_ for token in doc2] == ["", "", "", "", ""] @@ -358,5 +358,5 @@ class BlockerComponent1: self.name = name def __call__(self, doc): - doc.ents = [(0, self.start, self.end)] + doc.block_ents([doc[self.start:self.end]]) return doc diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 5c5443258..1bae84508 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -590,17 +590,16 @@ cdef class Doc: entity_type = 0 kb_id = 0 - # Set ent_iob to Missing (0) by default unless this token was nered before - ent_iob = 0 - if self.c[i].ent_iob != 0: - ent_iob = 2 + # Set ent_iob to Outside (2) 
by default + ent_iob = 2 # overwrite if the token was part of a specified entity if i in tokens_in_ents.keys(): ent_start, ent_end, entity_type, kb_id = tokens_in_ents[i] if entity_type is None or entity_type <= 0: - # Blocking this token from being overwritten by downstream NER - ent_iob = 3 + # Empty label: Missing, unset this token + ent_iob = 0 + entity_type = 0 elif ent_start == i: # Marking the start of an entity ent_iob = 3 @@ -612,6 +611,20 @@ cdef class Doc: self.c[i].ent_kb_id = kb_id self.c[i].ent_iob = ent_iob + def block_ents(self, spans): + """Mark spans as never an entity for the EntityRecognizer. + + spans (List[Span]): The spans to block as never entities. + """ + for span in spans: + for i in range(span.start, span.end): + self.c[i].ent_iob = 3 + self.c[i].ent_type = 0 + # if the following token is I, set to B + if span.end < self.length: + if self.c[span.end].ent_iob == 1: + self.c[span.end].ent_iob = 3 + @property def noun_chunks(self): """Iterate over the base noun phrases in the document. Yields base diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index 3344704bf..d396a2040 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -172,7 +172,7 @@ cdef class Example: return output def get_aligned_ner(self): - if not self.y.is_nered: + if not self.y.has_annotation("ENT_IOB"): return [None] * len(self.x) # should this be 'missing' instead of 'None' ? x_ents = self.get_aligned_spans_y2x(self.y.ents) # Default to 'None' for missing values @@ -303,9 +303,7 @@ def _add_entities_to_doc(doc, ner_data): spans_from_biluo_tags(doc, ner_data) ) elif isinstance(ner_data[0], Span): - # Ugh, this is super messy. Really hard to set O entities doc.ents = ner_data - doc.ents = [span for span in ner_data if span.label_] else: raise ValueError(Errors.E973) diff --git a/spacy/training/iob_utils.py b/spacy/training/iob_utils.py index ceb5e16b8..33a4733ca 100644 --- a/spacy/training/iob_utils.py +++ b/spacy/training/iob_utils.py @@ -182,22 +182,18 @@ def tags_to_entities(tags): entities = [] start = None for i, tag in enumerate(tags): - if tag is None: - continue - if tag.startswith("O"): + if tag is None or tag.startswith("-"): # TODO: We shouldn't be getting these malformed inputs. Fix this. if start is not None: start = None else: entities.append(("", i, i)) - continue - elif tag == "-": - continue + elif tag.startswith("O"): + pass elif tag.startswith("I"): if start is None: raise ValueError(Errors.E067.format(start="I", tags=tags[: i + 1])) - continue - if tag.startswith("U"): + elif tag.startswith("U"): entities.append((tag[2:], i, i)) elif tag.startswith("B"): start = i From 177df15d89da7eccc1603c33b847a12c43a56e0c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 21 Sep 2020 15:54:05 +0200 Subject: [PATCH 02/15] Implement Doc.set_ents --- spacy/errors.py | 9 +++ spacy/tests/doc/test_doc_api.py | 63 +++++++++++++++-- spacy/tests/parser/test_ner.py | 4 +- spacy/tokens/doc.pyx | 122 +++++++++++++++++++++++++++++--- spacy/training/example.pyx | 10 ++- spacy/training/iob_utils.py | 5 +- 6 files changed, 192 insertions(+), 21 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 81e3616be..a21ff5476 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -682,6 +682,15 @@ class Errors: E1009 = ("String for hash '{val}' not found in StringStore. 
Set the value " "through token.morph_ instead or add the string to the " "StringStore with `nlp.vocab.strings.add(string)`.") + E1010 = ("Unable to set entity information for token {i} which is included " + "in more than one span in entities, blocked, missing or outside.") + E1011 = ("Unsupported default '{default}' in doc.set_ents. Available " + "options: {modes}") + E1012 = ("Spans provided to doc.set_ents must be provided as a list of " + "`Span` objects.") + E1013 = ("Unable to set entity for span with empty label. Entity spans are " + "required to have a label. To set entity information as missing " + "or blocked, use the keyword arguments with doc.set_ents.") @add_codes diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index a08efe9d7..7339a9aef 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -425,7 +425,7 @@ def test_has_annotation(en_vocab): doc[0].lemma_ = "a" doc[0].dep_ = "dep" doc[0].head = doc[1] - doc.ents = [Span(doc, 0, 1, label="HELLO"), Span(doc, 1, 2, label="")] + doc.set_ents([Span(doc, 0, 1, label="HELLO")], default="missing") for attr in attrs: assert doc.has_annotation(attr) @@ -455,15 +455,68 @@ def test_is_flags_deprecated(en_tokenizer): doc.is_sentenced -def test_block_ents(en_tokenizer): +def test_set_ents(en_tokenizer): + # set ents doc = en_tokenizer("a b c d e") - doc.block_ents([doc[1:2], doc[3:5]]) + doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)]) + assert [t.ent_iob for t in doc] == [3, 3, 1, 2, 2] + assert [t.ent_type for t in doc] == [10, 11, 11, 0, 0] + + # add ents, invalid IOB repaired + doc = en_tokenizer("a b c d e") + doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)]) + doc.set_ents([Span(doc, 0, 2, 12)], default="unmodified") + assert [t.ent_iob for t in doc] == [3, 1, 3, 2, 2] + assert [t.ent_type for t in doc] == [12, 12, 11, 0, 0] + + # missing ents + doc = en_tokenizer("a b c d e") + doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)], missing=[doc[4:5]]) + assert [t.ent_iob for t in doc] == [3, 3, 1, 2, 0] + assert [t.ent_type for t in doc] == [10, 11, 11, 0, 0] + + # outside ents + doc = en_tokenizer("a b c d e") + doc.set_ents( + [Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)], + outside=[doc[4:5]], + default="missing", + ) + assert [t.ent_iob for t in doc] == [3, 3, 1, 0, 2] + assert [t.ent_type for t in doc] == [10, 11, 11, 0, 0] + + # blocked ents + doc = en_tokenizer("a b c d e") + doc.set_ents([], blocked=[doc[1:2], doc[3:5]], default="unmodified") assert [t.ent_iob for t in doc] == [0, 3, 0, 3, 3] assert [t.ent_type for t in doc] == [0, 0, 0, 0, 0] assert doc.ents == tuple() - # invalid IOB repaired + # invalid IOB repaired after blocked doc.ents = [Span(doc, 3, 5, "ENT")] assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 1] - doc.block_ents([doc[3:4]]) + doc.set_ents([], blocked=[doc[3:4]], default="unmodified") assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 3] + + # all types + doc = en_tokenizer("a b c d e") + doc.set_ents( + [Span(doc, 0, 1, 10)], + blocked=[doc[1:2]], + missing=[doc[2:3]], + outside=[doc[3:4]], + default="unmodified", + ) + assert [t.ent_iob for t in doc] == [3, 3, 0, 2, 0] + assert [t.ent_type for t in doc] == [10, 0, 0, 0, 0] + + doc = en_tokenizer("a b c d e") + # single span instead of a list + with pytest.raises(ValueError): + doc.set_ents([], missing=doc[1:2]) + # invalid default mode + with pytest.raises(ValueError): + doc.set_ents([], missing=[doc[1:2]], default="none") + # conflicting/overlapping specifications + with 
pytest.raises(ValueError): + doc.set_ents([], missing=[doc[1:2]], outside=[doc[1:2]]) diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index b8fdf15f9..cd5581769 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -168,7 +168,7 @@ def test_accept_blocked_token(): ner2 = nlp2.create_pipe("ner", config=config) # set "New York" to a blocked entity - doc2.block_ents([doc2[3:5]]) + doc2.set_ents([], blocked=[doc2[3:5]], default="unmodified") assert [token.ent_iob_ for token in doc2] == ["", "", "", "B", "B"] assert [token.ent_type_ for token in doc2] == ["", "", "", "", ""] @@ -358,5 +358,5 @@ class BlockerComponent1: self.name = name def __call__(self, doc): - doc.block_ents([doc[self.start:self.end]]) + doc.set_ents([], blocked=[doc[self.start:self.end]], default="unmodified") return doc diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index cc621b443..be99bacf3 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -7,6 +7,7 @@ from libc.stdint cimport int32_t, uint64_t import copy from collections import Counter +from enum import Enum import numpy import srsly from thinc.api import get_array_module @@ -86,6 +87,17 @@ cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name) return get_token_attr(token, feat_name) +class SetEntsDefault(str, Enum): + blocked = "blocked" + missing = "missing" + outside = "outside" + unmodified = "unmodified" + + @classmethod + def values(cls): + return list(cls.__members__.keys()) + + cdef class Doc: """A sequence of Token objects. Access sentences and named entities, export annotations to numpy arrays, losslessly serialize to compressed binary @@ -597,9 +609,9 @@ cdef class Doc: if i in tokens_in_ents.keys(): ent_start, ent_end, entity_type, kb_id = tokens_in_ents[i] if entity_type is None or entity_type <= 0: - # Empty label: Missing, unset this token - ent_iob = 0 - entity_type = 0 + # Only allow labelled spans + print(i, ent_start, ent_end, entity_type) + raise ValueError(Errors.E1013) elif ent_start == i: # Marking the start of an entity ent_iob = 3 @@ -611,19 +623,107 @@ cdef class Doc: self.c[i].ent_kb_id = kb_id self.c[i].ent_iob = ent_iob - def block_ents(self, spans): - """Mark spans as never an entity for the EntityRecognizer. + def set_ents(self, entities, *, blocked=None, missing=None, outside=None, default=SetEntsDefault.outside): + """Set entity annotation. - spans (List[Span]): The spans to block as never entities. + entities (List[Span]): Spans with labels to set as entities. + blocked (Optional[List[Span]]): Spans to set as 'blocked' (never an + entity) for spacy's built-in NER component. Other components may + ignore this setting. + missing (Optional[List[Span]]): Spans with missing/unknown entity + information. + outside (Optional[List[Span]]): Spans outside of entities (O in IOB). + default (str): How to set entity annotation for tokens outside of any + provided spans. Options: "blocked", "missing", "outside" and + "unmodified" (preserve current state). Defaults to "outside". 
""" - for span in spans: + if default not in SetEntsDefault.values(): + raise ValueError(Errors.E1011.format(default=default, modes=", ".join(SetEntsDefault))) + + if blocked is None: + blocked = tuple() + if missing is None: + missing = tuple() + if outside is None: + outside = tuple() + + # Find all tokens covered by spans and check that none are overlapping + seen_tokens = set() + for span in entities: + if not isinstance(span, Span): + raise ValueError(Errors.E1012.format(span=span)) + for i in range(span.start, span.end): + if i in seen_tokens: + raise ValueError(Errors.E1010.format(i=i)) + seen_tokens.add(i) + for span in blocked: + if not isinstance(span, Span): + raise ValueError(Errors.E1012.format(span=span)) + for i in range(span.start, span.end): + if i in seen_tokens: + raise ValueError(Errors.E1010.format(i=i)) + seen_tokens.add(i) + for span in missing: + if not isinstance(span, Span): + raise ValueError(Errors.E1012.format(span=span)) + for i in range(span.start, span.end): + if i in seen_tokens: + raise ValueError(Errors.E1010.format(i=i)) + seen_tokens.add(i) + for span in outside: + if not isinstance(span, Span): + raise ValueError(Errors.E1012.format(span=span)) + for i in range(span.start, span.end): + if i in seen_tokens: + raise ValueError(Errors.E1010.format(i=i)) + seen_tokens.add(i) + + # Set all specified entity information + for span in entities: + for i in range(span.start, span.end): + if not span.label: + raise ValueError(Errors.E1013) + if i == span.start: + self.c[i].ent_iob = 3 + else: + self.c[i].ent_iob = 1 + self.c[i].ent_type = span.label + for span in blocked: for i in range(span.start, span.end): self.c[i].ent_iob = 3 self.c[i].ent_type = 0 - # if the following token is I, set to B - if span.end < self.length: - if self.c[span.end].ent_iob == 1: - self.c[span.end].ent_iob = 3 + for span in missing: + for i in range(span.start, span.end): + self.c[i].ent_iob = 0 + self.c[i].ent_type = 0 + for span in outside: + for i in range(span.start, span.end): + self.c[i].ent_iob = 2 + self.c[i].ent_type = 0 + + # Set tokens outside of all provided spans + if default != SetEntsDefault.unmodified: + for i in range(self.length): + if i not in seen_tokens: + self.c[i].ent_type = 0 + if default == SetEntsDefault.outside: + self.c[i].ent_iob = 2 + elif default == SetEntsDefault.missing: + self.c[i].ent_iob = 0 + elif default == SetEntsDefault.blocked: + self.c[i].ent_iob = 3 + + # Fix any resulting inconsistent annotation + for i in range(self.length - 1): + # I must follow B or I: convert I to B + if (self.c[i].ent_iob == 0 or self.c[i].ent_iob == 2) and \ + self.c[i+1].ent_iob == 1: + self.c[i+1].ent_iob = 3 + # Change of type with BI or II: convert second I to B + if self.c[i].ent_type != self.c[i+1].ent_type and \ + (self.c[i].ent_iob == 3 or self.c[i].ent_iob == 1) and \ + self.c[i+1].ent_iob == 1: + self.c[i+1].ent_iob = 3 @property def noun_chunks(self): diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index d396a2040..82d8b6fce 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -288,6 +288,7 @@ def _annot2array(vocab, tok_annot, doc_annot): def _add_entities_to_doc(doc, ner_data): + print(ner_data) if ner_data is None: return elif ner_data == []: @@ -303,7 +304,14 @@ def _add_entities_to_doc(doc, ner_data): spans_from_biluo_tags(doc, ner_data) ) elif isinstance(ner_data[0], Span): - doc.ents = ner_data + entities = [] + missing = [] + for span in ner_data: + if span.label: + entities.append(span) + else: + 
missing.append(span) + doc.set_ents(entities, missing=missing) else: raise ValueError(Errors.E973) diff --git a/spacy/training/iob_utils.py b/spacy/training/iob_utils.py index 33a4733ca..b435c8ecb 100644 --- a/spacy/training/iob_utils.py +++ b/spacy/training/iob_utils.py @@ -149,9 +149,10 @@ def spans_from_biluo_tags(doc, tags): doc (Doc): The document that the BILUO tags refer to. entities (iterable): A sequence of BILUO tags with each tag describing one - token. Each tags string will be of the form of either "", "O" or + token. Each tag string will be of the form of either "", "O" or "{action}-{label}", where action is one of "B", "I", "L", "U". - RETURNS (list): A sequence of Span objects. + RETURNS (list): A sequence of Span objects. Each token with a missing IOB + tag is returned as a Span with an empty label. """ token_offsets = tags_to_entities(tags) spans = [] From b1a7d6c528e08c4a80594ae6338cacb22bf8b5b1 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 22 Sep 2020 14:42:51 +0200 Subject: [PATCH 03/15] Refactor seen token detection --- spacy/errors.py | 4 ++-- spacy/tokens/doc.pyx | 24 ++---------------------- 2 files changed, 4 insertions(+), 24 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index de4ffde3c..27091810d 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -690,8 +690,8 @@ class Errors: "in more than one span in entities, blocked, missing or outside.") E1011 = ("Unsupported default '{default}' in doc.set_ents. Available " "options: {modes}") - E1012 = ("Spans provided to doc.set_ents must be provided as a list of " - "`Span` objects.") + E1012 = ("Entity spans and blocked/missing/outside spans should be " + "provided to doc.set_ents as lists of `Span` objects.") E1013 = ("Unable to set entity for span with empty label. Entity spans are " "required to have a label. 
To set entity information as missing " "or blocked, use the keyword arguments with doc.set_ents.") diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 34742e587..4bf6f0e5e 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -8,6 +8,7 @@ from libc.stdint cimport int32_t, uint64_t import copy from collections import Counter from enum import Enum +import itertools import numpy import srsly from thinc.api import get_array_module @@ -742,28 +743,7 @@ cdef class Doc: # Find all tokens covered by spans and check that none are overlapping seen_tokens = set() - for span in entities: - if not isinstance(span, Span): - raise ValueError(Errors.E1012.format(span=span)) - for i in range(span.start, span.end): - if i in seen_tokens: - raise ValueError(Errors.E1010.format(i=i)) - seen_tokens.add(i) - for span in blocked: - if not isinstance(span, Span): - raise ValueError(Errors.E1012.format(span=span)) - for i in range(span.start, span.end): - if i in seen_tokens: - raise ValueError(Errors.E1010.format(i=i)) - seen_tokens.add(i) - for span in missing: - if not isinstance(span, Span): - raise ValueError(Errors.E1012.format(span=span)) - for i in range(span.start, span.end): - if i in seen_tokens: - raise ValueError(Errors.E1010.format(i=i)) - seen_tokens.add(i) - for span in outside: + for span in itertools.chain.from_iterable([entities, blocked, missing, outside]): if not isinstance(span, Span): raise ValueError(Errors.E1012.format(span=span)) for i in range(span.start, span.end): From 8eaacaae97f0caf77576e843a8d6bcf866c79236 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 24 Sep 2020 12:36:51 +0200 Subject: [PATCH 04/15] Refactor Doc.ents setter to use Doc.set_ents Additional changes: * Entity spans with missing labels are ignored * Fix ent_kb_id setting in `Doc.set_ents` --- spacy/tests/doc/test_add_entities.py | 4 +-- spacy/tests/doc/test_doc_api.py | 2 +- spacy/tokens/doc.pyx | 50 ++++++---------------------- 3 files changed, 14 insertions(+), 42 deletions(-) diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py index 40aff8e31..615ab9e5b 100644 --- a/spacy/tests/doc/test_add_entities.py +++ b/spacy/tests/doc/test_add_entities.py @@ -29,10 +29,10 @@ def test_doc_add_entities_set_ents_iob(en_vocab): ner.begin_training(lambda: [_ner_example(ner)]) ner(doc) - doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)] + doc.ents = [("ANIMAL", 3, 4)] assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"] - doc.ents = [(doc.vocab.strings["WORD"], 0, 2)] + doc.ents = [("WORD", 0, 2)] assert [w.ent_iob_ for w in doc] == ["B", "I", "O", "O"] diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 892b65cf4..e5e72fe2a 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -534,4 +534,4 @@ def test_doc_ents_setter(): vocab = Vocab() ents = [("HELLO", 0, 2), (vocab.strings.add("WORLD"), 3, 5)] doc = Doc(vocab, words=words, ents=ents) - assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"] \ No newline at end of file + assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"] diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 4bf6f0e5e..670c7440f 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -673,49 +673,16 @@ cdef class Doc: # TODO: # 1. Test basic data-driven ORTH gazetteer # 2. 
Test more nuanced date and currency regex - tokens_in_ents = {} - cdef attr_t entity_type - cdef attr_t kb_id - cdef int ent_start, ent_end, token_index + cdef attr_t entity_type, kb_id + cdef int ent_start, ent_end + ent_spans = [] for ent_info in ents: entity_type_, kb_id, ent_start, ent_end = get_entity_info(ent_info) if isinstance(entity_type_, str): self.vocab.strings.add(entity_type_) - entity_type = self.vocab.strings.as_int(entity_type_) - for token_index in range(ent_start, ent_end): - if token_index in tokens_in_ents: - raise ValueError(Errors.E103.format( - span1=(tokens_in_ents[token_index][0], - tokens_in_ents[token_index][1], - self.vocab.strings[tokens_in_ents[token_index][2]]), - span2=(ent_start, ent_end, self.vocab.strings[entity_type]))) - tokens_in_ents[token_index] = (ent_start, ent_end, entity_type, kb_id) - cdef int i - for i in range(self.length): - # default values - entity_type = 0 - kb_id = 0 - - # Set ent_iob to Outside (2) by default - ent_iob = 2 - - # overwrite if the token was part of a specified entity - if i in tokens_in_ents.keys(): - ent_start, ent_end, entity_type, kb_id = tokens_in_ents[i] - if entity_type is None or entity_type <= 0: - # Only allow labelled spans - print(i, ent_start, ent_end, entity_type) - raise ValueError(Errors.E1013) - elif ent_start == i: - # Marking the start of an entity - ent_iob = 3 - else: - # Marking the inside of an entity - ent_iob = 1 - - self.c[i].ent_type = entity_type - self.c[i].ent_kb_id = kb_id - self.c[i].ent_iob = ent_iob + span = Span(self, ent_start, ent_end, label=entity_type_, kb_id=kb_id) + ent_spans.append(span) + self.set_ents(ent_spans, default=SetEntsDefault.outside) def set_ents(self, entities, *, blocked=None, missing=None, outside=None, default=SetEntsDefault.outside): """Set entity annotation. 
@@ -734,6 +701,9 @@ cdef class Doc: if default not in SetEntsDefault.values(): raise ValueError(Errors.E1011.format(default=default, modes=", ".join(SetEntsDefault))) + # Ignore spans with missing labels + entities = [ent for ent in entities if ent.label > 0] + if blocked is None: blocked = tuple() if missing is None: @@ -742,6 +712,7 @@ cdef class Doc: outside = tuple() # Find all tokens covered by spans and check that none are overlapping + cdef int i seen_tokens = set() for span in itertools.chain.from_iterable([entities, blocked, missing, outside]): if not isinstance(span, Span): @@ -761,6 +732,7 @@ cdef class Doc: else: self.c[i].ent_iob = 1 self.c[i].ent_type = span.label + self.c[i].ent_kb_id = span.kb_id for span in blocked: for i in range(span.start, span.end): self.c[i].ent_iob = 3 From be56c0994b09a8ba5042eb563d05ea5bb7f75a6d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 12:40:25 +0200 Subject: [PATCH 05/15] Add [training.before_to_disk] callback --- spacy/cli/train.py | 18 ++++++++++++++++++ spacy/default_config.cfg | 2 ++ spacy/errors.py | 3 +++ spacy/schemas.py | 1 + 4 files changed, 24 insertions(+) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index eabc82be0..6d61c2425 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -97,6 +97,7 @@ def train( dev_corpus = dot_to_object(config, T_cfg["dev_corpus"]) batcher = T_cfg["batcher"] train_logger = T_cfg["logger"] + before_to_disk = create_before_to_disk_callback(T_cfg["before_to_disk"]) # Components that shouldn't be updated during training frozen_components = T_cfg["frozen_components"] # Sourced components that require resume_training @@ -167,6 +168,7 @@ def train( with nlp.select_pipes(disable=frozen_components): update_meta(T_cfg, nlp, info) with nlp.use_params(optimizer.averages): + nlp = before_to_disk(nlp) nlp.to_disk(output_path / "model-best") progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False) progress.set_description(f"Epoch {info['epoch']}") @@ -179,6 +181,7 @@ def train( f"Aborting and saving the final best model. " f"Encountered exception: {str(e)}" ) + nlp = before_to_disk(nlp) nlp.to_disk(output_path / "model-final") raise e finally: @@ -233,6 +236,21 @@ def create_evaluation_callback( return evaluate +def create_before_to_disk_callback( + callback: Optional[Callable[[Language], Language]] +) -> Callable[[Language], Language]: + def before_to_disk(nlp: Language) -> Language: + if not callback: + return nlp + modified_nlp = callback(nlp) + if not isinstance(modified_nlp, Language): + err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp)) + raise ValueError(err) + return modified_nlp + + return before_to_disk + + def train_while_improving( nlp: Language, optimizer: Optimizer, diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 5cd97a0eb..6f8c0aa00 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -72,6 +72,8 @@ frozen_components = [] dev_corpus = "corpora.dev" # Location in the config where the train corpus is defined train_corpus = "corpora.train" +# Optional callback before nlp object is saved to disk after training +before_to_disk = null [training.logger] @loggers = "spacy.ConsoleLogger.v1" diff --git a/spacy/errors.py b/spacy/errors.py index dce5cf51c..d67f01a1d 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -480,6 +480,9 @@ class Errors: E201 = ("Span index out of range.") # TODO: fix numbering after merging develop into master + E914 = ("Executing {name} callback failed. 
Expected the function to " + "returnthe nlp object but got: {value}. Maybe you forgot to return " + "the modified object in your function?") E915 = ("Can't use score '{name}' to calculate final weighted score. Expected " "float or int but got: {score_type}. To exclude the score from the " "final score, set its weight to null in the [training.score_weights] " diff --git a/spacy/schemas.py b/spacy/schemas.py index e34841008..6a9a82d06 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -217,6 +217,7 @@ class ConfigSchemaTraining(BaseModel): optimizer: Optimizer = Field(..., title="The optimizer to use") logger: Logger = Field(..., title="The logger to track training progress") frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training") + before_to_disk: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after training, before it's saved to disk") # fmt: on class Config: From 138c8d45dbd1372fafe6b280fdedf33790d20d32 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 12:43:39 +0200 Subject: [PATCH 06/15] Update docs --- website/docs/api/data-formats.md | 45 ++++++++++++++++---------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index e3b3900be..6f156fe37 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -180,26 +180,27 @@ single corpus once and then divide it up into `train` and `dev` partitions. This section defines settings and controls for the training and evaluation process that are used when you run [`spacy train`](/api/cli#train). -| Name | Description | -| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | -| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | -| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ | -| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | -| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | -| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | -| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ | -| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | -| `lookups` | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~ | -| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | -| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ | -| `optimizer` | The optimizer. 
The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | -| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ | -| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ | -| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | -| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | -| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ | -| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ | +| Name | Description | +| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | +| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | +| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | +| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ | +| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | +| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | +| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | +| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ | +| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ | +| `lookups` | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~ | +| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ | +| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ | +| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | +| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ | +| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. 
Defaults to variable `${paths.raw}`. ~~Optional[str]~~ | +| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | +| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | +| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ | +| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ | ### pretraining {#config-pretraining tag="section,optional"} @@ -275,8 +276,8 @@ $ python -m spacy convert ./data.json ./output.spacy > entity label, prefixed by the BILUO marker. For example `"B-ORG"` describes > the first token of a multi-token `ORG` entity and `"U-PERSON"` a single token > representing a `PERSON` entity. The -> [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) function -> can help you convert entity offsets to the right format. +> [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) function can +> help you convert entity offsets to the right format. ```python ### Example structure From 1c63f02f99d6c3d663c4a9cfb0e3395986bd7598 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 24 Sep 2020 12:51:16 +0200 Subject: [PATCH 07/15] Add API docs --- website/docs/api/doc.md | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 7175f6e7f..e10d9d077 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -219,6 +219,30 @@ alignment mode `"strict". | `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | | **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | +## Doc.set_ents {#ents tag="method" new="3"} + +Set the named entities in the document. + +> #### Example +> +> ```python +> from spacy.tokens import Span +> doc = nlp("Mr. Best flew to New York on Saturday morning.") +> doc.set_ents([Span(doc, 0, 2, "PERSON")]) +> ents = list(doc.ents) +> assert ents[0].label_ == "PERSON" +> assert ents[0].text == "Mr. Best" +> ``` + +| Name | Description | +| -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| entities | Spans with labels to set as entities. ~~List[Span]~~ | +| _keyword-only_ | | +| blocked | Spans to set as "blocked" (never an entity) for spacy's built-in NER component. Other components may ignore this setting. ~~Optional[List[Span]]~~ | +| missing | Spans with missing/unknown entity information. ~~Optional[List[Span]]~~ | +| outside | Spans outside of entities (O in IOB). ~~Optional[List[Span]]~~ | +| default | How to set entity annotation for tokens outside of any provided spans. Options: "blocked", "missing", "outside" and "unmodified" (preserve current state). Defaults to "outside". ~~str~~ | + ## Doc.similarity {#similarity tag="method" model="vectors"} Make a semantic similarity estimate. The default estimate is cosine similarity @@ -542,7 +566,6 @@ objects, if the entity recognizer has been applied. 
> ```python > doc = nlp("Mr. Best flew to New York on Saturday morning.") > ents = list(doc.ents) -> assert ents[0].label == 346 > assert ents[0].label_ == "PERSON" > assert ents[0].text == "Mr. Best" > ``` From 6836b664330926a401d05f16fe95cf475febff08 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 13:41:25 +0200 Subject: [PATCH 08/15] Update docs and resolve todos [ci skip] --- website/docs/usage/_benchmarks-models.md | 8 ++++---- website/docs/usage/embeddings-transformers.md | 2 -- website/docs/usage/facts-figures.md | 2 +- website/docs/usage/linguistic-features.md | 9 ++++++--- website/docs/usage/processing-pipelines.md | 7 +++++-- website/docs/usage/projects.md | 5 ++++- 6 files changed, 20 insertions(+), 13 deletions(-) diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md index 4b25418b5..5b193d3a4 100644 --- a/website/docs/usage/_benchmarks-models.md +++ b/website/docs/usage/_benchmarks-models.md @@ -1,10 +1,10 @@ import { Help } from 'components/typography'; import Link from 'components/link' - +
-| System | Parser | Tagger | NER | WPS
CPU words per second on CPU, higher is better | WPS
GPU words per second on GPU, higher is better | +| Pipeline | Parser | Tagger | NER | WPS
CPU words per second on CPU, higher is better | WPS
GPU words per second on GPU, higher is better | | ---------------------------------------------------------- | -----: | -----: | ---: | ------------------------------------------------------------------: | -----------------------------------------------------------------: | | [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3) | | | | | 6k | | [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3) | | | | | | @@ -21,10 +21,10 @@ import { Help } from 'components/typography'; import Link from 'components/link'
-| Named Entity Recognition Model | OntoNotes | CoNLL '03 | +| Named Entity Recognition System | OntoNotes | CoNLL '03 | | ------------------------------------------------------------------------------ | --------: | --------: | | spaCy RoBERTa (2020) | | 92.2 | -| spaCy CNN (2020) | | 88.4 | +| spaCy CNN (2020) | 85.3 | 88.4 | | spaCy CNN (2017) | 86.4 | | | [Stanza](https://stanfordnlp.github.io/stanza/) (StanfordNLP)1 | 88.8 | 92.1 | | Flair2 | 89.7 | 93.1 | diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index d61172a5b..b00760e62 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -235,8 +235,6 @@ The `Transformer` component sets the [`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute, which lets you access the transformers outputs at runtime. - - ```cli $ python -m spacy download en_core_trf_lg ``` diff --git a/website/docs/usage/facts-figures.md b/website/docs/usage/facts-figures.md index 743dae74d..a31559b04 100644 --- a/website/docs/usage/facts-figures.md +++ b/website/docs/usage/facts-figures.md @@ -63,7 +63,7 @@ import Benchmarks from 'usage/\_benchmarks-models.md'
-| System | UAS | LAS | +| Dependency Parsing System | UAS | LAS | | ------------------------------------------------------------------------------ | ---: | ---: | | spaCy RoBERTa (2020)1 | 96.8 | 95.0 | | spaCy CNN (2020)1 | 93.7 | 91.8 | diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 914e18acb..d9a894398 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -1654,9 +1654,12 @@ The [`SentenceRecognizer`](/api/sentencerecognizer) is a simple statistical component that only provides sentence boundaries. Along with being faster and smaller than the parser, its primary advantage is that it's easier to train because it only requires annotated sentence boundaries rather than full -dependency parses. - - +dependency parses. spaCy's [trained pipelines](/models) include both a parser +and a trained sentence segmenter, which is +[disabled](/usage/processing-pipelines#disabling) by default. If you only need +sentence boundaries and no parser, you can use the `enable` and `disable` +arguments on [`spacy.load`](/api/top-level#spacy.load) to enable the senter and +disable the parser. > #### senter vs. parser > diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 97806dc2a..dbf0881ac 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -253,8 +253,6 @@ different mechanisms you can use: Disabled and excluded component names can be provided to [`spacy.load`](/api/top-level#spacy.load) as a list. - - > #### 💡 Optional pipeline components > > The `disable` mechanism makes it easy to distribute pipeline packages with @@ -262,6 +260,11 @@ Disabled and excluded component names can be provided to > your pipeline may include a statistical _and_ a rule-based component for > sentence segmentation, and you can choose which one to run depending on your > use case. +> +> For example, spaCy's [trained pipelines](/models) like +> [`en_core_web_sm`](/models/en#en_core_web_sm) contain both a `parser` and +> `senter` that perform sentence segmentation, but the `senter` is disabled by +> default. ```python # Load the pipeline without the entity recognizer diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 8e093e8d6..6d5746308 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -733,7 +733,10 @@ workflows, but only one can be tracked by DVC. The Prodigy integration will require a nightly version of Prodigy that supports -spaCy v3+. +spaCy v3+. You can already use annotations created with Prodigy in spaCy v3 by +exporting your data with +[`data-to-spacy`](https://prodi.gy/docs/recipes#data-to-spacy) and running +[`spacy convert`](/api/cli#convert) to convert it to the binary format. From 5c13e0cf1bdf536c54660340e71742bf0493ea07 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 24 Sep 2020 13:41:55 +0200 Subject: [PATCH 09/15] Remove unused error --- spacy/errors.py | 3 --- spacy/tokens/doc.pyx | 2 -- 2 files changed, 5 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 27091810d..998e57f27 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -692,9 +692,6 @@ class Errors: "options: {modes}") E1012 = ("Entity spans and blocked/missing/outside spans should be " "provided to doc.set_ents as lists of `Span` objects.") - E1013 = ("Unable to set entity for span with empty label. Entity spans are " - "required to have a label. 
To set entity information as missing " - "or blocked, use the keyword arguments with doc.set_ents.") @add_codes diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 670c7440f..b4027f87e 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -725,8 +725,6 @@ cdef class Doc: # Set all specified entity information for span in entities: for i in range(span.start, span.end): - if not span.label: - raise ValueError(Errors.E1013) if i == span.start: self.c[i].ent_iob = 3 else: From 92f8b6959a359ff4495205df42f9e86c30aeb8f6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 13:48:41 +0200 Subject: [PATCH 10/15] Fix typo --- spacy/errors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/errors.py b/spacy/errors.py index d67f01a1d..708b7fda8 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -481,7 +481,7 @@ class Errors: # TODO: fix numbering after merging develop into master E914 = ("Executing {name} callback failed. Expected the function to " - "returnthe nlp object but got: {value}. Maybe you forgot to return " + "return the nlp object but got: {value}. Maybe you forgot to return " "the modified object in your function?") E915 = ("Can't use score '{name}' to calculate final weighted score. Expected " "float or int but got: {score_type}. To exclude the score from the " From 88e54caa1275481a43b1069c8ec6d352f554e333 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 14:32:35 +0200 Subject: [PATCH 11/15] accuracy -> performance --- spacy/cli/info.py | 4 +++- spacy/schemas.py | 3 +-- website/docs/api/data-formats.md | 2 +- website/src/templates/models.js | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 2b87163c2..2f2515278 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -91,7 +91,9 @@ def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]: meta["source"] = str(model_path.resolve()) else: meta["source"] = str(model_path) - return {k: v for k, v in meta.items() if k not in ("accuracy", "speed")} + return { + k: v for k, v in meta.items() if k not in ("accuracy", "performance", "speed") + } def get_markdown(data: Dict[str, Any], title: Optional[str] = None) -> str: diff --git a/spacy/schemas.py b/spacy/schemas.py index e34841008..1ff73bccc 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -182,8 +182,7 @@ class ModelMetaSchema(BaseModel): sources: Optional[Union[List[StrictStr], List[Dict[str, str]]]] = Field(None, title="Training data sources") vectors: Dict[str, Any] = Field({}, title="Included word vectors") labels: Dict[str, List[str]] = Field({}, title="Component labels, keyed by component name") - accuracy: Dict[str, Union[float, Dict[str, float]]] = Field({}, title="Accuracy numbers") - speed: Dict[str, Union[float, int]] = Field({}, title="Speed evaluation numbers") + performance: Dict[str, Union[float, Dict[str, float]]] = Field({}, title="Accuracy and speed numbers") spacy_git_version: StrictStr = Field("", title="Commit of spaCy version used") # fmt: on diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index e3b3900be..34565f160 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -518,7 +518,7 @@ source of truth** used for loading a pipeline. 
> "ner": ["PERSON", "ORG", "PRODUCT"], > "textcat": ["POSITIVE", "NEGATIVE"] > }, -> "accuracy": { +> "performance": { > "ents_f": 82.7300930714, > "ents_p": 82.135523614, > "ents_r": 83.3333333333, diff --git a/website/src/templates/models.js b/website/src/templates/models.js index 5d705048b..413f23dc5 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -117,7 +117,7 @@ function formatModelMeta(data) { license: data.license, labels: isEmptyObj(data.labels) ? null : data.labels, vectors: formatVectors(data.vectors), - accuracy: formatAccuracy(data.accuracy), + accuracy: formatAccuracy(data.performance), } } From 3b58a8be2b32b29a4a121bf0ed75ae3cd2920ee9 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 14:32:42 +0200 Subject: [PATCH 12/15] Update docs --- website/docs/api/data-formats.md | 4 ++-- website/docs/usage/_benchmarks-models.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 34565f160..0fc3481a4 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -275,8 +275,8 @@ $ python -m spacy convert ./data.json ./output.spacy > entity label, prefixed by the BILUO marker. For example `"B-ORG"` describes > the first token of a multi-token `ORG` entity and `"U-PERSON"` a single token > representing a `PERSON` entity. The -> [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) function -> can help you convert entity offsets to the right format. +> [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) function can +> help you convert entity offsets to the right format. ```python ### Example structure diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md index 5b193d3a4..88e79112f 100644 --- a/website/docs/usage/_benchmarks-models.md +++ b/website/docs/usage/_benchmarks-models.md @@ -7,7 +7,7 @@ import { Help } from 'components/typography'; import Link from 'components/link' | Pipeline | Parser | Tagger | NER | WPS
CPU words per second on CPU, higher is better | WPS
GPU words per second on GPU, higher is better | | ---------------------------------------------------------- | -----: | -----: | ---: | ------------------------------------------------------------------: | -----------------------------------------------------------------: | | [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3) | | | | | 6k | -| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3) | | | | | | +| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3) | 92.1 | 97.4 | 87.0 | 7k | | | `en_core_web_lg` (spaCy v2) | 91.9 | 97.2 | 85.9 | 10k | |
From 24e7ac3f2bbdab6a1e124c2770c7545cd08906c8 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 14:43:56 +0200 Subject: [PATCH 13/15] Fix download CLI [ci skip] --- spacy/cli/download.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 036aeab17..0e7ec2ea5 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -88,7 +88,6 @@ def get_compatibility() -> dict: def get_version(model: str, comp: dict) -> str: - model = get_base_version(model) if model not in comp: msg.fail( f"No compatible package found for '{model}' (spaCy v{about.__version__})", From 3f751e68f596d1c186e0baa125a6cba1ff6a7995 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 14:45:41 +0200 Subject: [PATCH 14/15] Increment version [ci skip] --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 8d019897b..56b05257a 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy-nightly" -__version__ = "3.0.0a23" +__version__ = "3.0.0a24" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 6bc5058d137daa28184c0494f9380b7832770c59 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 24 Sep 2020 14:53:34 +0200 Subject: [PATCH 15/15] Update models directory [ci skip] --- website/src/templates/models.js | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/website/src/templates/models.js b/website/src/templates/models.js index 413f23dc5..cdfe2e46d 100644 --- a/website/src/templates/models.js +++ b/website/src/templates/models.js @@ -32,11 +32,17 @@ const MODEL_META = { las: 'Labelled dependencies', token_acc: 'Tokenization', tok: 'Tokenization', + lemma: 'Statistical lemmatization', + morph: 'Morphological analysis', tags_acc: 'Part-of-speech tags (fine grained tags, Token.tag)', tag: 'Part-of-speech tags (fine grained tags, Token.tag)', + pos: 'Part-of-speech tags (coarse grained tags, Token.pos)', ents_f: 'Named entities (F-score)', ents_p: 'Named entities (precision)', ents_r: 'Named entities (recall)', + ner_f: 'Named entities (F-score)', + ner_p: 'Named entities (precision)', + ner_r: 'Named entities (recall)', sent_f: 'Sentence segmentation (F-score)', sent_p: 'Sentence segmentation (precision)', sent_r: 'Sentence segmentation (recall)', @@ -88,11 +94,12 @@ function formatVectors(data) { } function formatAccuracy(data) { + const exclude = ['speed'] if (!data) return [] return Object.keys(data) .map(label => { const value = data[label] - return isNaN(value) + return isNaN(value) || exclude.includes(label) ? null : { label, @@ -109,6 +116,7 @@ function formatModelMeta(data) { version: data.version, sizeFull: data.size, pipeline: data.pipeline, + components: data.components, notes: data.notes, description: data.description, sources: data.sources, @@ -117,7 +125,8 @@ function formatModelMeta(data) { license: data.license, labels: isEmptyObj(data.labels) ? null : data.labels, vectors: formatVectors(data.vectors), - accuracy: formatAccuracy(data.performance), + // TODO: remove accuracy fallback + accuracy: formatAccuracy(data.accuracy || data.performance), } }
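---

Supplementary usage sketches (not part of the patch series). The snippets below assume a spaCy build that already contains these patches; the pipelines, texts and registered names are arbitrary examples, not code from the diffs above.

First, the `Doc.set_ents` API that PATCH 02-04 converge on: labelled spans become entities, tokens outside the provided spans are set to `O` by default, and `blocked` marks hard-`O` tokens that spaCy's built-in NER should never overwrite:

```python
# Sketch of Doc.set_ents as implemented by this series; assumes a build
# containing at least PATCH 04/15.
import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Mr. Best flew to New York on Saturday morning.")

# Entity spans must carry a label; with the default mode ("outside"),
# every token not covered by a provided span is set to O.
doc.set_ents([Span(doc, 0, 2, label="PERSON")])
assert [t.ent_iob_ for t in doc] == ["B", "I", "O", "O", "O", "O", "O", "O", "O", "O"]

# Block "New York" (ent_iob = 3 with no entity type) while leaving all
# other tokens unmodified.
doc.set_ents([], blocked=[doc[4:6]], default="unmodified")
assert [t.ent_iob_ for t in doc][:6] == ["B", "I", "O", "O", "B", "B"]
```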
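The "invalid IOB repaired" cases in `test_set_ents` come from the repair pass at the end of `Doc.set_ents`: an `I` token left dangling when part of an entity is overwritten is converted to `B`, so the annotation stays consistent. A sketch under the same assumptions:

```python
# Overwriting the front of an existing entity leaves token "c" as a
# dangling I; the repair pass in Doc.set_ents converts it to B.
import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("a b c d e")
doc.set_ents([Span(doc, 0, 1, label="ORG"), Span(doc, 1, 3, label="GPE")])
assert [t.ent_iob_ for t in doc] == ["B", "B", "I", "O", "O"]

doc.set_ents([Span(doc, 0, 2, label="PERSON")], default="unmodified")
# "c" was I-GPE and is repaired to B-GPE, a single-token entity.
assert [t.ent_iob_ for t in doc] == ["B", "I", "B", "O", "O"]
assert [e.label_ for e in doc.ents] == ["PERSON", "GPE"]
```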
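PATCH 04 also reroutes the plain `Doc.ents` setter through `set_ents` with `default="outside"`, so `(label, start, end)` tuples with plain string labels keep working, as exercised by `test_doc_add_entities_set_ents_iob`:

```python
# The Doc.ents setter after PATCH 04/15; string labels are interned
# automatically and unlisted tokens are set to O.
import spacy

nlp = spacy.blank("en")
doc = nlp("I like London and Berlin")
doc.ents = [("GPE", 2, 3), ("GPE", 4, 5)]
assert [t.ent_iob_ for t in doc] == ["O", "O", "B", "O", "B"]
assert [e.text for e in doc.ents] == ["London", "Berlin"]
```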
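Finally, the `[training.before_to_disk]` callback added in PATCH 05 can be supplied through the callbacks registry. A sketch; the registered name `"strip_meta_notes"` and the meta key being removed are made up for illustration:

```python
# Sketch of a user-defined before_to_disk callback (PATCH 05/15). The
# registry name and the meta key are hypothetical.
import spacy
from spacy.language import Language

@spacy.registry.callbacks("strip_meta_notes")
def create_before_to_disk():
    def before_to_disk(nlp: Language) -> Language:
        # Drop training-only metadata before the pipeline is saved. The
        # callback must return the (possibly modified) nlp object, or
        # train.py raises E914.
        nlp.meta.pop("notes", None)
        return nlp
    return before_to_disk

# Referenced from the training config as:
#
# [training.before_to_disk]
# @callbacks = "strip_meta_notes"
```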