diff --git a/spacy/about.py b/spacy/about.py
index 8d019897b..56b05257a 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy-nightly"
-__version__ = "3.0.0a23"
+__version__ = "3.0.0a24"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index 036aeab17..0e7ec2ea5 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -88,7 +88,6 @@ def get_compatibility() -> dict:
def get_version(model: str, comp: dict) -> str:
- model = get_base_version(model)
if model not in comp:
msg.fail(
f"No compatible package found for '{model}' (spaCy v{about.__version__})",
diff --git a/spacy/cli/info.py b/spacy/cli/info.py
index 2b87163c2..2f2515278 100644
--- a/spacy/cli/info.py
+++ b/spacy/cli/info.py
@@ -91,7 +91,9 @@ def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
meta["source"] = str(model_path.resolve())
else:
meta["source"] = str(model_path)
- return {k: v for k, v in meta.items() if k not in ("accuracy", "speed")}
+ return {
+ k: v for k, v in meta.items() if k not in ("accuracy", "performance", "speed")
+ }
def get_markdown(data: Dict[str, Any], title: Optional[str] = None) -> str:
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index eabc82be0..6d61c2425 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -97,6 +97,7 @@ def train(
dev_corpus = dot_to_object(config, T_cfg["dev_corpus"])
batcher = T_cfg["batcher"]
train_logger = T_cfg["logger"]
+ before_to_disk = create_before_to_disk_callback(T_cfg["before_to_disk"])
# Components that shouldn't be updated during training
frozen_components = T_cfg["frozen_components"]
# Sourced components that require resume_training
@@ -167,6 +168,7 @@ def train(
with nlp.select_pipes(disable=frozen_components):
update_meta(T_cfg, nlp, info)
with nlp.use_params(optimizer.averages):
+ nlp = before_to_disk(nlp)
nlp.to_disk(output_path / "model-best")
progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)
progress.set_description(f"Epoch {info['epoch']}")
@@ -179,6 +181,7 @@ def train(
f"Aborting and saving the final best model. "
f"Encountered exception: {str(e)}"
)
+ nlp = before_to_disk(nlp)
nlp.to_disk(output_path / "model-final")
raise e
finally:
@@ -233,6 +236,21 @@ def create_evaluation_callback(
return evaluate
+def create_before_to_disk_callback(
+ callback: Optional[Callable[[Language], Language]]
+) -> Callable[[Language], Language]:
+ def before_to_disk(nlp: Language) -> Language:
+ if not callback:
+ return nlp
+ modified_nlp = callback(nlp)
+ if not isinstance(modified_nlp, Language):
+ err = Errors.E914.format(name="before_to_disk", value=type(modified_nlp))
+ raise ValueError(err)
+ return modified_nlp
+
+ return before_to_disk
+
+
def train_while_improving(
nlp: Language,
optimizer: Optimizer,
diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index 5cd97a0eb..6f8c0aa00 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -72,6 +72,8 @@ frozen_components = []
dev_corpus = "corpora.dev"
# Location in the config where the train corpus is defined
train_corpus = "corpora.train"
+# Optional callback before nlp object is saved to disk after training
+before_to_disk = null
[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
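For context, a minimal sketch of how the new `before_to_disk` hook can be supplied (assuming spaCy v3's `callbacks` registry; the registry name and the component choice are hypothetical examples). The function is then referenced from the config via a `[training.before_to_disk]` block with `@callbacks = "customize_before_to_disk.v1"`:

```python
# A minimal sketch, assuming the v3 callbacks registry. The registry name
# "customize_before_to_disk.v1" and the disabled component are hypothetical.
import spacy
from spacy.language import Language

@spacy.registry.callbacks("customize_before_to_disk.v1")
def create_callback():
    def before_to_disk(nlp: Language) -> Language:
        # Modify the nlp object as needed, e.g. disable a component so the
        # saved pipeline doesn't run it by default.
        if "parser" in nlp.pipe_names:
            nlp.disable_pipe("parser")
        # The callback must return the nlp object, otherwise training
        # aborts with the new E914 error.
        return nlp
    return before_to_disk
```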
diff --git a/spacy/errors.py b/spacy/errors.py
index dce5cf51c..6fdf8cb57 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -480,6 +480,9 @@ class Errors:
E201 = ("Span index out of range.")
# TODO: fix numbering after merging develop into master
+ E914 = ("Executing {name} callback failed. Expected the function to "
+ "return the nlp object but got: {value}. Maybe you forgot to return "
+ "the modified object in your function?")
E915 = ("Can't use score '{name}' to calculate final weighted score. Expected "
"float or int but got: {score_type}. To exclude the score from the "
"final score, set its weight to null in the [training.score_weights] "
@@ -693,6 +696,12 @@ class Errors:
E1009 = ("String for hash '{val}' not found in StringStore. Set the value "
"through token.morph_ instead or add the string to the "
"StringStore with `nlp.vocab.strings.add(string)`.")
+ E1010 = ("Unable to set entity information for token {i} which is included "
+ "in more than one span in entities, blocked, missing or outside.")
+ E1011 = ("Unsupported default '{default}' in doc.set_ents. Available "
+ "options: {modes}")
+ E1012 = ("Entity spans and blocked/missing/outside spans should be "
+ "provided to doc.set_ents as lists of `Span` objects.")
@add_codes
diff --git a/spacy/schemas.py b/spacy/schemas.py
index e34841008..eea6639d3 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -182,8 +182,7 @@ class ModelMetaSchema(BaseModel):
sources: Optional[Union[List[StrictStr], List[Dict[str, str]]]] = Field(None, title="Training data sources")
vectors: Dict[str, Any] = Field({}, title="Included word vectors")
labels: Dict[str, List[str]] = Field({}, title="Component labels, keyed by component name")
- accuracy: Dict[str, Union[float, Dict[str, float]]] = Field({}, title="Accuracy numbers")
- speed: Dict[str, Union[float, int]] = Field({}, title="Speed evaluation numbers")
+ performance: Dict[str, Union[float, Dict[str, float]]] = Field({}, title="Accuracy and speed numbers")
spacy_git_version: StrictStr = Field("", title="Commit of spaCy version used")
# fmt: on
@@ -217,6 +216,7 @@ class ConfigSchemaTraining(BaseModel):
optimizer: Optimizer = Field(..., title="The optimizer to use")
logger: Logger = Field(..., title="The logger to track training progress")
frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")
+ before_to_disk: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after training, before it's saved to disk")
# fmt: on
class Config:
diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py
index 40aff8e31..615ab9e5b 100644
--- a/spacy/tests/doc/test_add_entities.py
+++ b/spacy/tests/doc/test_add_entities.py
@@ -29,10 +29,10 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
ner.begin_training(lambda: [_ner_example(ner)])
ner(doc)
- doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
+ doc.ents = [("ANIMAL", 3, 4)]
assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"]
- doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
+ doc.ents = [("WORD", 0, 2)]
assert [w.ent_iob_ for w in doc] == ["B", "I", "O", "O"]
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 163de5ab0..e5e72fe2a 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -152,7 +152,7 @@ def test_doc_api_set_ents(en_tokenizer):
assert len(tokens.ents) == 0
tokens.ents = [(tokens.vocab.strings["PRODUCT"], 2, 4)]
assert len(list(tokens.ents)) == 1
- assert [t.ent_iob for t in tokens] == [0, 0, 3, 1, 0, 0, 0, 0]
+ assert [t.ent_iob for t in tokens] == [2, 2, 3, 1, 2, 2, 2, 2]
assert tokens.ents[0].label_ == "PRODUCT"
assert tokens.ents[0].start == 2
assert tokens.ents[0].end == 4
@@ -427,7 +427,7 @@ def test_has_annotation(en_vocab):
doc[0].lemma_ = "a"
doc[0].dep_ = "dep"
doc[0].head = doc[1]
- doc.ents = [Span(doc, 0, 1, label="HELLO")]
+ doc.set_ents([Span(doc, 0, 1, label="HELLO")], default="missing")
for attr in attrs:
assert doc.has_annotation(attr)
@@ -457,7 +457,74 @@ def test_is_flags_deprecated(en_tokenizer):
doc.is_sentenced
-def test_doc_set_ents():
+def test_doc_set_ents(en_tokenizer):
+ # set ents
+ doc = en_tokenizer("a b c d e")
+ doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)])
+ assert [t.ent_iob for t in doc] == [3, 3, 1, 2, 2]
+ assert [t.ent_type for t in doc] == [10, 11, 11, 0, 0]
+
+ # add ents, invalid IOB repaired
+ doc = en_tokenizer("a b c d e")
+ doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)])
+ doc.set_ents([Span(doc, 0, 2, 12)], default="unmodified")
+ assert [t.ent_iob for t in doc] == [3, 1, 3, 2, 2]
+ assert [t.ent_type for t in doc] == [12, 12, 11, 0, 0]
+
+ # missing ents
+ doc = en_tokenizer("a b c d e")
+ doc.set_ents([Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)], missing=[doc[4:5]])
+ assert [t.ent_iob for t in doc] == [3, 3, 1, 2, 0]
+ assert [t.ent_type for t in doc] == [10, 11, 11, 0, 0]
+
+ # outside ents
+ doc = en_tokenizer("a b c d e")
+ doc.set_ents(
+ [Span(doc, 0, 1, 10), Span(doc, 1, 3, 11)],
+ outside=[doc[4:5]],
+ default="missing",
+ )
+ assert [t.ent_iob for t in doc] == [3, 3, 1, 0, 2]
+ assert [t.ent_type for t in doc] == [10, 11, 11, 0, 0]
+
+ # blocked ents
+ doc = en_tokenizer("a b c d e")
+ doc.set_ents([], blocked=[doc[1:2], doc[3:5]], default="unmodified")
+ assert [t.ent_iob for t in doc] == [0, 3, 0, 3, 3]
+ assert [t.ent_type for t in doc] == [0, 0, 0, 0, 0]
+ assert doc.ents == tuple()
+
+ # invalid IOB repaired after blocked
+ doc.ents = [Span(doc, 3, 5, "ENT")]
+ assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 1]
+ doc.set_ents([], blocked=[doc[3:4]], default="unmodified")
+ assert [t.ent_iob for t in doc] == [2, 2, 2, 3, 3]
+
+ # all types
+ doc = en_tokenizer("a b c d e")
+ doc.set_ents(
+ [Span(doc, 0, 1, 10)],
+ blocked=[doc[1:2]],
+ missing=[doc[2:3]],
+ outside=[doc[3:4]],
+ default="unmodified",
+ )
+ assert [t.ent_iob for t in doc] == [3, 3, 0, 2, 0]
+ assert [t.ent_type for t in doc] == [10, 0, 0, 0, 0]
+
+ doc = en_tokenizer("a b c d e")
+ # single span instead of a list
+ with pytest.raises(ValueError):
+ doc.set_ents([], missing=doc[1:2])
+ # invalid default mode
+ with pytest.raises(ValueError):
+ doc.set_ents([], missing=[doc[1:2]], default="none")
+ # conflicting/overlapping specifications
+ with pytest.raises(ValueError):
+ doc.set_ents([], missing=[doc[1:2]], outside=[doc[1:2]])
+
+
+def test_doc_ents_setter():
"""Test that both strings and integers can be used to set entities in
tuple format via doc.ents."""
words = ["a", "b", "c", "d", "e"]
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 548cd2697..cd5581769 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -168,7 +168,7 @@ def test_accept_blocked_token():
ner2 = nlp2.create_pipe("ner", config=config)
# set "New York" to a blocked entity
- doc2.ents = [(0, 3, 5)]
+ doc2.set_ents([], blocked=[doc2[3:5]], default="unmodified")
assert [token.ent_iob_ for token in doc2] == ["", "", "", "B", "B"]
assert [token.ent_type_ for token in doc2] == ["", "", "", "", ""]
@@ -358,5 +358,5 @@ class BlockerComponent1:
self.name = name
def __call__(self, doc):
- doc.ents = [(0, self.start, self.end)]
+ doc.set_ents([], blocked=[doc[self.start:self.end]], default="unmodified")
return doc
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index b82bab294..b4027f87e 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -7,6 +7,8 @@ from libc.stdint cimport int32_t, uint64_t
import copy
from collections import Counter
+from enum import Enum
+import itertools
import numpy
import srsly
from thinc.api import get_array_module
@@ -86,6 +88,17 @@ cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name)
return get_token_attr(token, feat_name)
+class SetEntsDefault(str, Enum):
+ blocked = "blocked"
+ missing = "missing"
+ outside = "outside"
+ unmodified = "unmodified"
+
+ @classmethod
+ def values(cls):
+ return list(cls.__members__.keys())
+
+
cdef class Doc:
"""A sequence of Token objects. Access sentences and named entities, export
annotations to numpy arrays, losslessly serialize to compressed binary
@@ -660,50 +673,100 @@ cdef class Doc:
# TODO:
# 1. Test basic data-driven ORTH gazetteer
# 2. Test more nuanced date and currency regex
- tokens_in_ents = {}
- cdef attr_t entity_type
- cdef attr_t kb_id
- cdef int ent_start, ent_end, token_index
+ cdef attr_t entity_type, kb_id
+ cdef int ent_start, ent_end
+ ent_spans = []
for ent_info in ents:
entity_type_, kb_id, ent_start, ent_end = get_entity_info(ent_info)
if isinstance(entity_type_, str):
self.vocab.strings.add(entity_type_)
- entity_type = self.vocab.strings.as_int(entity_type_)
- for token_index in range(ent_start, ent_end):
- if token_index in tokens_in_ents:
- raise ValueError(Errors.E103.format(
- span1=(tokens_in_ents[token_index][0],
- tokens_in_ents[token_index][1],
- self.vocab.strings[tokens_in_ents[token_index][2]]),
- span2=(ent_start, ent_end, self.vocab.strings[entity_type])))
- tokens_in_ents[token_index] = (ent_start, ent_end, entity_type, kb_id)
- cdef int i
+ span = Span(self, ent_start, ent_end, label=entity_type_, kb_id=kb_id)
+ ent_spans.append(span)
+ self.set_ents(ent_spans, default=SetEntsDefault.outside)
+
+ def set_ents(self, entities, *, blocked=None, missing=None, outside=None, default=SetEntsDefault.outside):
+ """Set entity annotation.
+
+ entities (List[Span]): Spans with labels to set as entities.
+        blocked (Optional[List[Span]]): Spans to set as 'blocked' (never an
+            entity) for spaCy's built-in NER component. Other components may
+            ignore this setting.
+ missing (Optional[List[Span]]): Spans with missing/unknown entity
+ information.
+ outside (Optional[List[Span]]): Spans outside of entities (O in IOB).
+ default (str): How to set entity annotation for tokens outside of any
+ provided spans. Options: "blocked", "missing", "outside" and
+ "unmodified" (preserve current state). Defaults to "outside".
+ """
+ if default not in SetEntsDefault.values():
+ raise ValueError(Errors.E1011.format(default=default, modes=", ".join(SetEntsDefault)))
+
+ # Ignore spans with missing labels
+ entities = [ent for ent in entities if ent.label > 0]
+
+ if blocked is None:
+ blocked = tuple()
+ if missing is None:
+ missing = tuple()
+ if outside is None:
+ outside = tuple()
+
+ # Find all tokens covered by spans and check that none are overlapping
+ cdef int i
+ seen_tokens = set()
+ for span in itertools.chain.from_iterable([entities, blocked, missing, outside]):
+ if not isinstance(span, Span):
+ raise ValueError(Errors.E1012.format(span=span))
+ for i in range(span.start, span.end):
+ if i in seen_tokens:
+ raise ValueError(Errors.E1010.format(i=i))
+ seen_tokens.add(i)
+
+ # Set all specified entity information
+ for span in entities:
+ for i in range(span.start, span.end):
+ if i == span.start:
+ self.c[i].ent_iob = 3
+ else:
+ self.c[i].ent_iob = 1
+ self.c[i].ent_type = span.label
+ self.c[i].ent_kb_id = span.kb_id
+ for span in blocked:
+ for i in range(span.start, span.end):
+ self.c[i].ent_iob = 3
+ self.c[i].ent_type = 0
+ for span in missing:
+ for i in range(span.start, span.end):
+ self.c[i].ent_iob = 0
+ self.c[i].ent_type = 0
+ for span in outside:
+ for i in range(span.start, span.end):
+ self.c[i].ent_iob = 2
+ self.c[i].ent_type = 0
+
+ # Set tokens outside of all provided spans
+ if default != SetEntsDefault.unmodified:
for i in range(self.length):
- # default values
- entity_type = 0
- kb_id = 0
+ if i not in seen_tokens:
+ self.c[i].ent_type = 0
+ if default == SetEntsDefault.outside:
+ self.c[i].ent_iob = 2
+ elif default == SetEntsDefault.missing:
+ self.c[i].ent_iob = 0
+ elif default == SetEntsDefault.blocked:
+ self.c[i].ent_iob = 3
- # Set ent_iob to Missing (0) by default unless this token was nered before
- ent_iob = 0
- if self.c[i].ent_iob != 0:
- ent_iob = 2
-
- # overwrite if the token was part of a specified entity
- if i in tokens_in_ents.keys():
- ent_start, ent_end, entity_type, kb_id = tokens_in_ents[i]
- if entity_type is None or entity_type <= 0:
- # Blocking this token from being overwritten by downstream NER
- ent_iob = 3
- elif ent_start == i:
- # Marking the start of an entity
- ent_iob = 3
- else:
- # Marking the inside of an entity
- ent_iob = 1
-
- self.c[i].ent_type = entity_type
- self.c[i].ent_kb_id = kb_id
- self.c[i].ent_iob = ent_iob
+ # Fix any resulting inconsistent annotation
+ for i in range(self.length - 1):
+ # I must follow B or I: convert I to B
+ if (self.c[i].ent_iob == 0 or self.c[i].ent_iob == 2) and \
+ self.c[i+1].ent_iob == 1:
+ self.c[i+1].ent_iob = 3
+ # Change of type with BI or II: convert second I to B
+ if self.c[i].ent_type != self.c[i+1].ent_type and \
+ (self.c[i].ent_iob == 3 or self.c[i].ent_iob == 1) and \
+ self.c[i+1].ent_iob == 1:
+ self.c[i+1].ent_iob = 3
@property
def noun_chunks(self):
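To make the new semantics concrete, a small usage sketch (token indices assume the default English tokenizer, which keeps "Dr." as a single token):

```python
# Usage sketch for Doc.set_ents: one entity span, one blocked span,
# everything else marked as missing.
import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Dr. Best flew to New York")
doc.set_ents(
    [Span(doc, 4, 6, "GPE")],  # "New York" is an entity
    blocked=[doc[0:1]],        # "Dr." may never be tagged as an entity
    default="missing",         # all remaining tokens: unknown annotation
)
# Blocked tokens get IOB "B" with no entity type; missing tokens get "".
assert [t.ent_iob_ for t in doc] == ["B", "", "", "", "B", "I"]
assert [t.ent_type_ for t in doc] == ["", "", "", "", "GPE", "GPE"]
```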
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index fbf05b224..1e7bea5df 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -303,9 +303,14 @@ def _add_entities_to_doc(doc, ner_data):
biluo_tags_to_spans(doc, ner_data)
)
elif isinstance(ner_data[0], Span):
- # Ugh, this is super messy. Really hard to set O entities
- doc.ents = ner_data
- doc.ents = [span for span in ner_data if span.label_]
+ entities = []
+ missing = []
+ for span in ner_data:
+ if span.label:
+ entities.append(span)
+ else:
+ missing.append(span)
+ doc.set_ents(entities, missing=missing)
else:
raise ValueError(Errors.E973)
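A quick sketch of what this buys during annotation conversion (assuming the v3 training API, where `Example.reference` holds the gold-standard doc): empty-label spans are now passed explicitly as missing spans instead of going through two successive `doc.ents` assignments:

```python
# Sketch: a "-" (missing) BILUO tag survives the round trip into token
# annotation, while "O" tokens are explicitly marked as outside.
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
doc = nlp.make_doc("Apple rose today")
example = Example.from_dict(doc, {"entities": ["U-ORG", "-", "O"]})
# Token 0 is an entity, token 1 has missing annotation, token 2 is outside.
assert [t.ent_iob_ for t in example.reference] == ["B", "", "O"]
```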
diff --git a/spacy/training/iob_utils.py b/spacy/training/iob_utils.py
index 91fc40205..0e8e7eed0 100644
--- a/spacy/training/iob_utils.py
+++ b/spacy/training/iob_utils.py
@@ -151,9 +151,10 @@ def biluo_tags_to_spans(doc: Doc, tags: Iterable[str]) -> List[Span]:
doc (Doc): The document that the BILUO tags refer to.
entities (iterable): A sequence of BILUO tags with each tag describing one
- token. Each tags string will be of the form of either "", "O" or
+ token. Each tag string will be of the form of either "", "O" or
"{action}-{label}", where action is one of "B", "I", "L", "U".
- RETURNS (list): A sequence of Span objects.
+ RETURNS (list): A sequence of Span objects. Each token with a missing IOB
+ tag is returned as a Span with an empty label.
"""
token_offsets = tags_to_entities(tags)
spans = []
@@ -186,22 +187,18 @@ def tags_to_entities(tags: Iterable[str]) -> List[Tuple[str, int, int]]:
entities = []
start = None
for i, tag in enumerate(tags):
- if tag is None:
- continue
- if tag.startswith("O"):
+ if tag is None or tag.startswith("-"):
# TODO: We shouldn't be getting these malformed inputs. Fix this.
if start is not None:
start = None
else:
entities.append(("", i, i))
- continue
- elif tag == "-":
- continue
+ elif tag.startswith("O"):
+ pass
elif tag.startswith("I"):
if start is None:
raise ValueError(Errors.E067.format(start="I", tags=tags[: i + 1]))
- continue
- if tag.startswith("U"):
+ elif tag.startswith("U"):
entities.append((tag[2:], i, i))
elif tag.startswith("B"):
start = i
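To pin down the refactored control flow, a behavior sketch (import path as in this diff):

```python
# Behavior sketch for tags_to_entities: "-" or None tags outside an entity
# become ("", start, end) placeholders, while "O" tokens are skipped.
from spacy.training.iob_utils import tags_to_entities

tags = ["B-ORG", "L-ORG", "O", "-", "U-PERSON"]
assert tags_to_entities(tags) == [("ORG", 0, 1), ("", 3, 3), ("PERSON", 4, 4)]
```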
diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index e3b3900be..420c09237 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -180,26 +180,27 @@ single corpus once and then divide it up into `train` and `dev` partitions.
This section defines settings and controls for the training and evaluation
process that are used when you run [`spacy train`](/api/cli#train).
-| Name | Description |
-| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
-| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
-| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ |
-| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
-| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |
-| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ |
-| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ |
-| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ |
-| `lookups` | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~ |
-| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ |
-| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ |
-| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |
-| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ |
-| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ |
-| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ |
-| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
-| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ |
-| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ |
+| Name | Description |
+| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
+| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
+| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
+| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ |
+| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
+| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |
+| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ |
+| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ |
+| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ |
+| `lookups` | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~ |
+| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ |
+| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ |
+| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |
+| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ |
+| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ |
+| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ |
+| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
+| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ |
+| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ |
### pretraining {#config-pretraining tag="section,optional"}
@@ -275,8 +276,8 @@ $ python -m spacy convert ./data.json ./output.spacy
> entity label, prefixed by the BILUO marker. For example `"B-ORG"` describes
> the first token of a multi-token `ORG` entity and `"U-PERSON"` a single token
> representing a `PERSON` entity. The
-> [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) function
-> can help you convert entity offsets to the right format.
+> [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) function can
+> help you convert entity offsets to the right format.
```python
### Example structure
@@ -518,7 +519,7 @@ source of truth** used for loading a pipeline.
> "ner": ["PERSON", "ORG", "PRODUCT"],
> "textcat": ["POSITIVE", "NEGATIVE"]
> },
-> "accuracy": {
+> "performance": {
> "ents_f": 82.7300930714,
> "ents_p": 82.135523614,
> "ents_r": 83.3333333333,
diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index 7175f6e7f..e10d9d077 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -219,6 +219,30 @@
alignment mode `"strict"`.
| `alignment_mode` | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
+## Doc.set_ents {#set_ents tag="method" new="3"}
+
+Set the named entities in the document.
+
+> #### Example
+>
+> ```python
+> from spacy.tokens import Span
+> doc = nlp("Mr. Best flew to New York on Saturday morning.")
+> doc.set_ents([Span(doc, 0, 2, "PERSON")])
+> ents = list(doc.ents)
+> assert ents[0].label_ == "PERSON"
+> assert ents[0].text == "Mr. Best"
+> ```
+
+| Name           | Description                                                                                                                                                                                          |
+| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `entities`     | Spans with labels to set as entities. ~~List[Span]~~                                                                                                                                                  |
+| _keyword-only_ |                                                                                                                                                                                                        |
+| `blocked`      | Spans to set as "blocked" (never an entity) for spaCy's built-in NER component. Other components may ignore this setting. ~~Optional[List[Span]]~~                                                    |
+| `missing`      | Spans with missing/unknown entity information. ~~Optional[List[Span]]~~                                                                                                                               |
+| `outside`      | Spans outside of entities (O in IOB). ~~Optional[List[Span]]~~                                                                                                                                        |
+| `default`      | How to set entity annotation for tokens outside of any provided spans. Options: `"blocked"`, `"missing"`, `"outside"` and `"unmodified"` (preserve current state). Defaults to `"outside"`. ~~str~~   |
+
## Doc.similarity {#similarity tag="method" model="vectors"}
Make a semantic similarity estimate. The default estimate is cosine similarity
@@ -542,7 +566,6 @@ objects, if the entity recognizer has been applied.
> ```python
> doc = nlp("Mr. Best flew to New York on Saturday morning.")
> ents = list(doc.ents)
-> assert ents[0].label == 346
> assert ents[0].label_ == "PERSON"
> assert ents[0].text == "Mr. Best"
> ```
diff --git a/website/docs/usage/_benchmarks-models.md b/website/docs/usage/_benchmarks-models.md
index 5b193d3a4..88e79112f 100644
--- a/website/docs/usage/_benchmarks-models.md
+++ b/website/docs/usage/_benchmarks-models.md
@@ -7,7 +7,7 @@ import { Help } from 'components/typography'; import Link from 'components/link'
| Pipeline | Parser | Tagger | NER | WPS<br />CPU <Help>words per second on CPU, higher is better</Help> | WPS<br />GPU <Help>words per second on GPU, higher is better</Help> |
| ---------------------------------------------------------- | -----: | -----: | ---: | ------------------------------------------------------------------: | -----------------------------------------------------------------: |
| [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3) | | | | | 6k |
-| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3) | | | | | |
+| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3) | 92.1 | 97.4 | 87.0 | 7k | |
| `en_core_web_lg` (spaCy v2) | 91.9 | 97.2 | 85.9 | 10k | |
diff --git a/website/src/templates/models.js b/website/src/templates/models.js
index 5d705048b..cdfe2e46d 100644
--- a/website/src/templates/models.js
+++ b/website/src/templates/models.js
@@ -32,11 +32,17 @@ const MODEL_META = {
las: 'Labelled dependencies',
token_acc: 'Tokenization',
tok: 'Tokenization',
+ lemma: 'Statistical lemmatization',
+ morph: 'Morphological analysis',
tags_acc: 'Part-of-speech tags (fine grained tags, Token.tag)',
tag: 'Part-of-speech tags (fine grained tags, Token.tag)',
+ pos: 'Part-of-speech tags (coarse grained tags, Token.pos)',
ents_f: 'Named entities (F-score)',
ents_p: 'Named entities (precision)',
ents_r: 'Named entities (recall)',
+ ner_f: 'Named entities (F-score)',
+ ner_p: 'Named entities (precision)',
+ ner_r: 'Named entities (recall)',
sent_f: 'Sentence segmentation (F-score)',
sent_p: 'Sentence segmentation (precision)',
sent_r: 'Sentence segmentation (recall)',
@@ -88,11 +94,12 @@ function formatVectors(data) {
}
function formatAccuracy(data) {
+ const exclude = ['speed']
if (!data) return []
return Object.keys(data)
.map(label => {
const value = data[label]
- return isNaN(value)
+ return isNaN(value) || exclude.includes(label)
? null
: {
label,
@@ -109,6 +116,7 @@ function formatModelMeta(data) {
version: data.version,
sizeFull: data.size,
pipeline: data.pipeline,
+ components: data.components,
notes: data.notes,
description: data.description,
sources: data.sources,
@@ -117,7 +125,8 @@ function formatModelMeta(data) {
license: data.license,
labels: isEmptyObj(data.labels) ? null : data.labels,
vectors: formatVectors(data.vectors),
- accuracy: formatAccuracy(data.accuracy),
+ // TODO: remove accuracy fallback
+ accuracy: formatAccuracy(data.accuracy || data.performance),
}
}