Mirror of https://github.com/explosion/spaCy.git
Synced 2025-01-12 10:16:27 +03:00

Commit 1090d3d675: Merge branch 'develop' into feature/spacy-legacy
@@ -463,12 +463,14 @@ class Errors:
             "issue tracker: http://github.com/explosion/spaCy/issues")

     # TODO: fix numbering after merging develop into master
+    E892 = ("Unknown function registry: '{name}'.\n\nAvailable names: {available}")
     E893 = ("Could not find function '{name}' in function registry '{reg_name}'. "
             "If you're using a custom function, make sure the code is available. "
             "If the function is provided by a third-party package, e.g. "
             "spacy-transformers, make sure the package is installed in your "
             "environment.\n\nAvailable names: {available}")
-    E894 = ("Unknown function registry: '{name}'.\n\nAvailable names: {available}")
+    E894 = ("The 'noun_chunks' syntax iterator is not implemented for language "
+            "'{lang}'.")
    E895 = ("The 'textcat' component received gold-standard annotations with "
            "multiple labels per document. In spaCy 3 you should use the "
            "'textcat_multilabel' component for this instead. "
@@ -86,7 +86,7 @@ def like_num(text):
     if text in _num_words:
         return True

-    # CHeck ordinal number
+    # Check ordinal number
     if text in _ordinal_words:
         return True
     return False
@@ -18,8 +18,6 @@ class MacedonianLemmatizer(Lemmatizer):
             string = string[:-3]
             univ_pos = "verb"

-        if callable(self.is_base_form) and self.is_base_form(univ_pos, morphology):
-            return [string.lower()]
         index_table = self.lookups.get_table("lemma_index", {})
         exc_table = self.lookups.get_table("lemma_exc", {})
         rules_table = self.lookups.get_table("lemma_rules", {})
@@ -697,6 +697,8 @@ class Language:
         source_config = source.config.interpolate()
         pipe_config = util.copy_config(source_config["components"][source_name])
         self._pipe_configs[name] = pipe_config
+        for s in source.vocab.strings:
+            self.vocab.strings.add(s)
         return pipe, pipe_config["factory"]

     def add_pipe(
@@ -1619,9 +1621,7 @@ class Language:
             if model not in source_nlps:
                 # We only need the components here and we need to init
                 # model with the same vocab as the current nlp object
-                source_nlps[model] = util.load_model(
-                    model, vocab=nlp.vocab, disable=["vocab", "tokenizer"]
-                )
+                source_nlps[model] = util.load_model(model, vocab=nlp.vocab)
             source_name = pipe_cfg.get("component", pipe_name)
             nlp.add_pipe(source_name, source=source_nlps[model], name=pipe_name)
         disabled_pipes = [*config["nlp"]["disabled"], *disable]
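The two `Language` hunks above change how components are sourced from an existing pipeline: the source pipeline is now loaded with the current `nlp` object's vocab, and the source vocab's strings are copied over. A minimal sketch of the user-facing API this supports (not part of the commit; it assumes the pretrained package `en_core_web_sm` is installed):

```python
import spacy
from spacy.lang.en import English

# Load a trained pipeline to source components from (any installed
# pipeline package works the same way).
source_nlp = spacy.load("en_core_web_sm")

nlp = English()
# Copy the trained "ner" component (and the strings it relies on) into the
# new pipeline; both pipelines now share a compatible vocab.
nlp.add_pipe("ner", source=source_nlp)
print(nlp.pipe_names)  # ['ner']
```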
@@ -197,13 +197,39 @@ cdef class ArcEagerGold:
         self.mem = Pool()
         heads, labels = example.get_aligned_parse(projectivize=True)
         labels = [example.x.vocab.strings.add(label) if label is not None else MISSING_DEP for label in labels]
-        sent_starts = example.get_aligned_sent_starts()
+        sent_starts = _get_aligned_sent_starts(example)
         assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts))
         self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts)

     def update(self, StateClass stcls):
         update_gold_state(&self.c, stcls.c)

+def _get_aligned_sent_starts(example):
+    """Get list of SENT_START attributes aligned to the predicted tokenization.
+    If the reference has not sentence starts, return a list of None values.
+
+    This function is slightly different from the one on Example, because we also
+    check whether the reference sentences align across multiple sentences,
+    and return missing values if they do. This prevents a problem where you have
+    the start of a sentence merged onto a token that belongs to two sentences.
+    """
+    if example.y.has_annotation("SENT_START"):
+        align = example.alignment.y2x
+        sent_starts = [False] * len(example.x)
+        seen_words = set()
+        for y_sent in example.y.sents:
+            x_indices = list(align[y_sent.start : y_sent.end].dataXd)
+            if any(x_idx in seen_words for x_idx in x_indices):
+                # If there are any tokens in X that align across two sentences,
+                # regard the sentence annotations as missing, as we can't
+                # reliably use them.
+                return [None] * len(example.x)
+            seen_words.update(x_indices)
+            sent_starts[x_indices[0]] = True
+        return sent_starts
+    else:
+        return [None] * len(example.x)
+

 cdef int check_state_gold(char state_bits, char flag) nogil:
     cdef char one = 1
@@ -820,7 +846,7 @@ cdef class ArcEager(TransitionSystem):
             else:
                 failed = False
                 break
-        if failed:
+        if failed and _debug not in (False, None):
            example = _debug
            print("Actions")
            for i in range(self.n_moves):
@@ -1,7 +1,11 @@
+import srsly
+from thinc.api import Config
+from typing import Dict, Any
 from ..language import Language
 from ..matcher import Matcher
 from ..tokens import Doc
 from ..util import filter_spans
+from .. import util


 @Language.component(
@@ -65,3 +69,77 @@ def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc:
     for span in spans:
         retokenizer.merge(span)
     return doc
+
+
+@Language.factory(
+    "token_splitter",
+    default_config={"min_length": 25, "split_length": 10},
+    retokenizes=True,
+)
+def make_token_splitter(
+    nlp: Language,
+    name: str,
+    *,
+    min_length=0,
+    split_length=0,
+):
+    return TokenSplitter(
+        min_length=min_length, split_length=split_length
+    )
+
+
+class TokenSplitter:
+    def __init__(self, min_length: int = 0, split_length: int = 0):
+        self.min_length = min_length
+        self.split_length = split_length
+
+    def __call__(self, doc: Doc) -> Doc:
+        if self.min_length > 0 and self.split_length > 0:
+            with doc.retokenize() as retokenizer:
+                for t in doc:
+                    if len(t.text) >= self.min_length:
+                        orths = []
+                        heads = []
+                        attrs = {}
+                        for i in range(0, len(t.text), self.split_length):
+                            orths.append(t.text[i : i + self.split_length])
+                            heads.append((t, i / self.split_length))
+                        retokenizer.split(t, orths, heads, attrs)
+        return doc
+
+    def _get_config(self) -> Dict[str, Any]:
+        return {
+            "min_length": self.min_length,
+            "split_length": self.split_length,
+        }
+
+    def _set_config(self, config: Dict[str, Any] = {}) -> None:
+        self.min_length = config.get("min_length", 0)
+        self.split_length = config.get("split_length", 0)
+
+    def to_bytes(self, **kwargs):
+        serializers = {
+            "cfg": lambda: srsly.json_dumps(self._get_config()),
+        }
+        return util.to_bytes(serializers, [])
+
+    def from_bytes(self, data, **kwargs):
+        deserializers = {
+            "cfg": lambda b: self._set_config(srsly.json_loads(b)),
+        }
+        util.from_bytes(data, deserializers, [])
+        return self
+
+    def to_disk(self, path, **kwargs):
+        path = util.ensure_path(path)
+        serializers = {
+            "cfg": lambda p: srsly.write_json(p, self._get_config()),
+        }
+        return util.to_disk(path, serializers, [])
+
+    def from_disk(self, path, **kwargs):
+        path = util.ensure_path(path)
+        serializers = {
+            "cfg": lambda p: self._set_config(srsly.read_json(p)),
+        }
+        util.from_disk(path, serializers, [])
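The new `token_splitter` factory registered above splits long tokens into fixed-size pieces via the retokenizer and round-trips its settings through `to_bytes`/`from_bytes` and `to_disk`/`from_disk`. A minimal usage sketch (not part of the commit; the text and config values are illustrative):

```python
import spacy

nlp = spacy.blank("en")
# Tokens with at least 10 characters are re-split into pieces of at most 5
nlp.add_pipe("token_splitter", config={"min_length": 10, "split_length": 5})
doc = nlp("supercalifragilistic is long")
print([t.text for t in doc])
# ['super', 'calif', 'ragil', 'istic', 'is', 'long']
```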
@@ -145,6 +145,10 @@ class Morphologizer(Tagger):
         for example in get_examples():
             for i, token in enumerate(example.reference):
                 pos = token.pos_
+                # if both are unset, annotation is missing, so do not add
+                # an empty label
+                if pos == "" and not token.has_morph():
+                    continue
                 morph = str(token.morph)
                 # create and add the combined morph+POS label
                 morph_dict = Morphology.feats_to_dict(morph)
@@ -155,7 +159,7 @@ class Morphologizer(Tagger):
             if norm_label not in self.cfg["labels_morph"]:
                 self.cfg["labels_morph"][norm_label] = morph
                 self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
-        if len(self.labels) <= 1:
+        if len(self.labels) < 1:
             raise ValueError(Errors.E143.format(name=self.name))
         doc_sample = []
         label_sample = []
@@ -217,15 +221,24 @@ class Morphologizer(Tagger):
                 pos = pos_tags[i]
                 morph = morphs[i]
                 # POS may align (same value for multiple tokens) when morph
-                # doesn't, so if either is None, treat both as None here so that
-                # truths doesn't end up with an unknown morph+POS combination
+                # doesn't, so if either is misaligned (None), treat the
+                # annotation as missing so that truths doesn't end up with an
+                # unknown morph+POS combination
                 if pos is None or morph is None:
                     label = None
+                # If both are unset, the annotation is missing (empty morph
+                # converted from int is "_" rather than "")
+                elif pos == "" and morph == "":
+                    label = None
+                # Otherwise, generate the combined label
                 else:
                     label_dict = Morphology.feats_to_dict(morph)
                     if pos:
                         label_dict[self.POS_FEAT] = pos
                     label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
+                    # As a fail-safe, skip any unrecognized labels
+                    if label not in self.labels:
+                        label = None
                 eg_truths.append(label)
             truths.append(eg_truths)
         d_scores, loss = loss_func(scores, truths)
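The `Morphologizer` hunks above make partially annotated training data usable: tokens with neither POS nor morph set are skipped when collecting labels, misaligned or fully unset annotations become missing (`None`) truths instead of empty labels, and a single label no longer triggers E143. A hedged sketch of the kind of partial annotation this covers (not from the commit):

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
nlp.add_pipe("morphologizer")

predicted = nlp.make_doc("I like ham")
reference = nlp.make_doc("I like ham")
reference[2].pos_ = "NOUN"    # partial annotation: POS only
reference[2].set_morph(None)  # morph left unset/missing
train_examples = [Example(predicted, reference)]

nlp.initialize(get_examples=lambda: train_examples)
# Only the POS=NOUN label is added; unset tokens no longer produce an empty label
print(nlp.get_pipe("morphologizer").labels)
```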
@@ -2,6 +2,8 @@ import pytest
 import numpy
 import logging
 import mock
+
+from spacy.lang.xx import MultiLanguage
 from spacy.tokens import Doc, Span
 from spacy.vocab import Vocab
 from spacy.lexeme import Lexeme
@@ -633,6 +635,14 @@ def test_doc_set_ents_invalid_spans(en_tokenizer):
         doc.ents = spans


+def test_doc_noun_chunks_not_implemented():
+    """Test that a language without noun_chunk iterator, throws a NotImplementedError"""
+    text = "Může data vytvářet a spravovat, ale především je dokáže analyzovat, najít v nich nové vztahy a vše přehledně vizualizovat."
+    nlp = MultiLanguage()
+    doc = nlp(text)
+    with pytest.raises(NotImplementedError):
+        chunks = list(doc.noun_chunks)
+
 def test_span_groups(en_tokenizer):
     doc = en_tokenizer("Some text about Colombia and the Czech Republic")
     doc.spans["hi"] = [Span(doc, 3, 4, label="bye")]
@@ -1,11 +1,16 @@
-import numpy
-from spacy.attrs import HEAD, DEP
-from spacy.symbols import nsubj, dobj, amod, nmod, conj, cc, root
-from spacy.lang.en.syntax_iterators import noun_chunks
 from spacy.tokens import Doc
 import pytest


+@pytest.fixture
+def doc(en_vocab):
+    words = ["Peter", "has", "chronic", "command", "and", "control", "issues"]
+    heads = [1, 1, 6, 6, 3, 3, 1]
+    deps = ["nsubj", "ROOT", "amod", "nmod", "cc", "conj", "dobj"]
+    pos = ["PROPN", "VERB", "ADJ", "NOUN", "CCONJ", "NOUN", "NOUN"]
+    return Doc(en_vocab, words=words, heads=heads, deps=deps, pos=pos)
+
+
 def test_noun_chunks_is_parsed(en_tokenizer):
     """Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed."""
     doc = en_tokenizer("This is a sentence")
@@ -13,31 +18,27 @@ def test_noun_chunks_is_parsed(en_tokenizer):
         list(doc.noun_chunks)


-def test_en_noun_chunks_not_nested(en_vocab):
-    words = ["Peter", "has", "chronic", "command", "and", "control", "issues"]
-    heads = [1, 1, 6, 6, 3, 3, 1]
-    deps = ["nsubj", "ROOT", "amod", "nmod", "cc", "conj", "dobj"]
-    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
-    doc.from_array(
-        [HEAD, DEP],
-        numpy.asarray(
-            [
-                [1, nsubj],
-                [0, root],
-                [4, amod],
-                [3, nmod],
-                [-1, cc],
-                [-2, conj],
-                [-5, dobj],
-            ],
-            dtype="uint64",
-        ),
-    )
-    doc.noun_chunks_iterator = noun_chunks
+def test_en_noun_chunks_not_nested(doc, en_vocab):
+    """Test that each token only appears in one noun chunk at most"""
     word_occurred = {}
-    for chunk in doc.noun_chunks:
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) > 1
+    for chunk in chunks:
         for word in chunk:
             word_occurred.setdefault(word.text, 0)
             word_occurred[word.text] += 1
+    assert len(word_occurred) > 0
     for word, freq in word_occurred.items():
         assert freq == 1, (word, [chunk.text for chunk in doc.noun_chunks])
+
+
+def test_noun_chunks_span(doc, en_tokenizer):
+    """Test that the span.noun_chunks property works correctly"""
+    doc_chunks = list(doc.noun_chunks)
+    span = doc[0:3]
+    span_chunks = list(span.noun_chunks)
+    assert 0 < len(span_chunks) < len(doc_chunks)
+    for chunk in span_chunks:
+        assert chunk in doc_chunks
+        assert chunk.start >= 0
+        assert chunk.end <= 3
@@ -53,3 +53,24 @@ def test_factories_merge_ents(doc2):
     assert len(doc2) == 6
     assert len(list(doc2.ents)) == 1
     assert doc2[2].text == "New York"
+
+
+def test_token_splitter():
+    nlp = Language()
+    config = {"min_length": 20, "split_length": 5}
+    token_splitter = nlp.add_pipe("token_splitter", config=config)
+    doc = nlp("aaaaabbbbbcccccdddd e f g")
+    assert [t.text for t in doc] == ["aaaaabbbbbcccccdddd", "e", "f", "g"]
+    doc = nlp("aaaaabbbbbcccccdddddeeeeeff g h i")
+    assert [t.text for t in doc] == [
+        "aaaaa",
+        "bbbbb",
+        "ccccc",
+        "ddddd",
+        "eeeee",
+        "ff",
+        "g",
+        "h",
+        "i",
+    ]
+    assert all(len(t.text) <= token_splitter.split_length for t in doc)
@@ -136,3 +136,28 @@ def test_overfitting_IO():
     gold_pos_tags = ["", "", "", ""]
     assert [str(t.morph) for t in doc] == gold_morphs
     assert [t.pos_ for t in doc] == gold_pos_tags
+
+    # Test with unset morph and partial POS
+    nlp.remove_pipe("morphologizer")
+    nlp.add_pipe("morphologizer")
+    for example in train_examples:
+        for token in example.reference:
+            if token.text == "ham":
+                token.pos_ = "NOUN"
+            else:
+                token.pos_ = ""
+            token.set_morph(None)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+    print(nlp.get_pipe("morphologizer").labels)
+    for i in range(50):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["morphologizer"] < 0.00001
+
+    # Test the trained model
+    test_text = "I like blue ham"
+    doc = nlp(test_text)
+    gold_morphs = ["", "", "", ""]
+    gold_pos_tags = ["NOUN", "NOUN", "NOUN", "NOUN"]
+    assert [str(t.morph) for t in doc] == gold_morphs
+    assert [t.pos_ for t in doc] == gold_pos_tags
@@ -81,7 +81,8 @@ def test_issue3199():
     """
     words = ["This", "is", "a", "sentence"]
     doc = Doc(Vocab(), words=words, heads=[0] * len(words), deps=["dep"] * len(words))
-    assert list(doc[0:3].noun_chunks) == []
+    with pytest.raises(NotImplementedError):
+        list(doc[0:3].noun_chunks)


 def test_issue3209():
@@ -816,8 +816,10 @@ cdef class Doc:
     @property
     def noun_chunks(self):
         """Iterate over the base noun phrases in the document. Yields base
-        noun-phrase #[code Span] objects, if the document has been
-        syntactically parsed. A base noun phrase, or "NP chunk", is a noun
+        noun-phrase #[code Span] objects, if the language has a noun chunk iterator.
+        Raises a NotImplementedError otherwise.
+
+        A base noun phrase, or "NP chunk", is a noun
         phrase that does not permit other NPs to be nested within it – so no
         NP-level coordination, no prepositional phrases, and no relative
         clauses.
@@ -826,14 +828,15 @@ cdef class Doc:

         DOCS: https://nightly.spacy.io/api/doc#noun_chunks
         """
+        if self.noun_chunks_iterator is None:
+            raise NotImplementedError(Errors.E894.format(lang=self.vocab.lang))

         # Accumulate the result before beginning to iterate over it. This
-        # prevents the tokenisation from being changed out from under us
+        # prevents the tokenization from being changed out from under us
         # during the iteration. The tricky thing here is that Span accepts
-        # its tokenisation changing, so it's okay once we have the Span
+        # its tokenization changing, so it's okay once we have the Span
         # objects. See Issue #375.
         spans = []
-        if self.noun_chunks_iterator is not None:
         for start, end, label in self.noun_chunks_iterator(self):
             spans.append(Span(self, start, end, label=label))
         for span in spans:
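With the `Doc.noun_chunks` change above, languages that do not provide a noun-chunk syntax iterator now raise `NotImplementedError` (error `E894`) instead of silently yielding nothing. A small sketch of the resulting behaviour (not part of the commit):

```python
import spacy

# The multi-language pipeline "xx" has no noun-chunk syntax iterator
nlp = spacy.blank("xx")
doc = nlp("Může data vytvářet a spravovat")
try:
    chunks = list(doc.noun_chunks)
except NotImplementedError as err:
    print("noun_chunks is not implemented for this language:", err)
```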
@@ -487,29 +487,24 @@ cdef class Span:
         """
         return "".join([t.text_with_ws for t in self])

+
     @property
     def noun_chunks(self):
-        """Yields base noun-phrase `Span` objects, if the document has been
-        syntactically parsed. A base noun phrase, or "NP chunk", is a noun
+        """Iterate over the base noun phrases in the span. Yields base
+        noun-phrase #[code Span] objects, if the language has a noun chunk iterator.
+        Raises a NotImplementedError otherwise.
+
+        A base noun phrase, or "NP chunk", is a noun
         phrase that does not permit other NPs to be nested within it – so no
         NP-level coordination, no prepositional phrases, and no relative
         clauses.

-        YIELDS (Span): Base noun-phrase `Span` objects.
+        YIELDS (Span): Noun chunks in the span.

         DOCS: https://nightly.spacy.io/api/span#noun_chunks
         """
-        # Accumulate the result before beginning to iterate over it. This
-        # prevents the tokenisation from being changed out from under us
-        # during the iteration. The tricky thing here is that Span accepts
-        # its tokenisation changing, so it's okay once we have the Span
-        # objects. See Issue #375
-        spans = []
-        cdef attr_t label
-        if self.doc.noun_chunks_iterator is not None:
-            for start, end, label in self.doc.noun_chunks_iterator(self):
-                spans.append(Span(self.doc, start, end, label=label))
-        for span in spans:
+        for span in self.doc.noun_chunks:
+            if span.start >= self.start and span.end <= self.end:
                 yield span

     @property
@@ -211,6 +211,14 @@ cdef class Token:
         xp = get_array_module(vector)
         return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))

+    def has_morph(self):
+        """Check whether the token has annotated morph information.
+        Return False when the morph annotation is unset/missing.
+
+        RETURNS (bool): Whether the morph annotation is set.
+        """
+        return not self.c.morph == 0
+
     property morph:
         def __get__(self):
             return MorphAnalysis.from_id(self.vocab, self.c.morph)
@@ -200,10 +200,6 @@ cdef class Example:
     def get_aligned_sent_starts(self):
         """Get list of SENT_START attributes aligned to the predicted tokenization.
         If the reference has not sentence starts, return a list of None values.
-
-        The aligned sentence starts use the get_aligned_spans method, rather
-        than aligning the list of tags, so that it handles cases where a mistaken
-        tokenization starts the sentence.
         """
         if self.y.has_annotation("SENT_START"):
             align = self.alignment.y2x
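The first hunk above adds `Token.has_morph()`, which reports whether the morph annotation is set at all (distinct from an empty morph). A short sketch of how it behaves (not from the commit):

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("ham")
print(doc[0].has_morph())       # False: morph annotation is unset/missing
doc[0].set_morph("Number=Sing")
print(doc[0].has_morph())       # True
print(doc[0].morph)             # Number=Sing
```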
@@ -616,11 +616,15 @@ phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be
 nested within it – so no NP-level coordination, no prepositional phrases, and no
 relative clauses.

+If the `noun_chunk` [syntax iterator](/usage/adding-languages#language-data) has
+not been implemented for the given language, a `NotImplementedError` is raised.
+
 > #### Example
 >
 > ```python
 > doc = nlp("A phrase with another phrase occurs.")
 > chunks = list(doc.noun_chunks)
+> assert len(chunks) == 2
 > assert chunks[0].text == "A phrase"
 > assert chunks[1].text == "another phrase"
 > ```
@@ -6,6 +6,7 @@ menu:
   - ['merge_noun_chunks', 'merge_noun_chunks']
   - ['merge_entities', 'merge_entities']
   - ['merge_subtokens', 'merge_subtokens']
+  - ['token_splitter', 'token_splitter']
 ---

 ## merge_noun_chunks {#merge_noun_chunks tag="function"}
@@ -107,3 +108,25 @@ end of the pipeline and after all other components.
 | `doc`       | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ |
 | `label`     | The subtoken dependency label. Defaults to `"subtok"`. ~~str~~       |
 | **RETURNS** | The modified `Doc` with merged subtokens. ~~Doc~~                    |
+
+## token_splitter {#token_splitter tag="function" new="3.0"}
+
+Split tokens longer than a minimum length into shorter tokens. Intended for use
+with transformer pipelines where long spaCy tokens lead to input text that
+exceeds the transformer model max length. See
+[managing transformer model max length limitations](/usage/embeddings-transformers#transformer-max-length).
+
+> #### Example
+>
+> ```python
+> config={"min_length": 20, "split_length": 5}
+> nlp.add_pipe("token_splitter", config=config, first=True)
+> doc = nlp("aaaaabbbbbcccccdddddee")
+> print([token.text for token in doc])
+> # ['aaaaa', 'bbbbb', 'ccccc', 'ddddd', 'ee']
+> ```
+
+| Setting        | Description                                                            |
+| -------------- | ---------------------------------------------------------------------- |
+| `min_length`   | The minimum length for a token to be split. Defaults to `25`. ~~int~~  |
+| `split_length` | The length of the split tokens. Defaults to `5`. ~~int~~               |
@@ -274,6 +274,31 @@ if the entity recognizer has been applied.
 | ----------- | ----------------------------------------------------------------- |
 | **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span, ...]~~ |

+## Span.noun_chunks {#noun_chunks tag="property" model="parser"}
+
+Iterate over the base noun phrases in the span. Yields base noun-phrase `Span`
+objects, if the document has been syntactically parsed. A base noun phrase, or
+"NP chunk", is a noun phrase that does not permit other NPs to be nested within
+it – so no NP-level coordination, no prepositional phrases, and no relative
+clauses.
+
+If the `noun_chunk` [syntax iterator](/usage/adding-languages#language-data) has
+not been implemented for the given language, a `NotImplementedError` is raised.
+
+> #### Example
+>
+> ```python
+> doc = nlp("A phrase with another phrase occurs.")
+> span = doc[3:5]
+> chunks = list(span.noun_chunks)
+> assert len(chunks) == 1
+> assert chunks[0].text == "another phrase"
+> ```
+
+| Name       | Description                       |
+| ---------- | --------------------------------- |
+| **YIELDS** | Noun chunks in the span. ~~Span~~ |
+
 ## Span.as_doc {#as_doc tag="method"}

 Create a new `Doc` object corresponding to the `Span`, with a copy of the data.
@@ -191,6 +191,15 @@ the morph to an unset state.
 | -------- | --------------------------------------------------------------------------------- |
 | features | The morphological features to set. ~~Union[int, dict, str, MorphAnalysis, None]~~  |

+## Token.has_morph {#has_morph tag="method"}
+
+Check whether the token has annotated morph information. Return `False` when the
+morph annotation is unset/missing.
+
+| Name        | Description                                   |
+| ----------- | --------------------------------------------- |
+| **RETURNS** | Whether the morph annotation is set. ~~bool~~ |
+
 ## Token.is_ancestor {#is_ancestor tag="method" model="parser"}

 Check whether this token is a parent, grandparent, etc. of another in the
@@ -481,6 +481,50 @@ custom learning rate for each component. Instead of a constant, you can also
 provide a schedule, allowing you to freeze the shared parameters at the start of
 training.

+### Managing transformer model max length limitations {#transformer-max-length}
+
+Many transformer models have a limit on the maximum number of tokens that the
+model can process, for example BERT models are limited to 512 tokens. This limit
+refers to the number of transformer tokens (BPE, WordPiece, etc.), not the
+number of spaCy tokens.
+
+To be able to process longer texts, the spaCy [`transformer`](/api/transformer)
+component uses [`span_getters`](/api/transformer#span_getters) to convert a
+batch of [`Doc`](/api/doc) objects into lists of [`Span`](/api/span) objects. A
+span may correspond to a doc (for `doc_spans`), a sentence (for `sent_spans`) or
+a window of spaCy tokens (`strided_spans`). If a single span corresponds to more
+transformer tokens than the transformer model supports, the spaCy pipeline can't
+process the text because some spaCy tokens would be left without an analysis.
+
+In general, it is up to the transformer pipeline user to manage the input texts
+so that the model max length is not exceeded. If you're training a **new
+pipeline**, you have a number of options to handle the max length limit:
+
+- Use `doc_spans` with short texts only
+- Use `sent_spans` with short sentences only
+- For `strided_spans`, lower the `window` size to be short enough for your input
+  texts (and don't forget to lower the `stride` correspondingly)
+- Implement a [custom span getter](#transformers-training-custom-settings)
+
+You may still run into the max length limit if a single spaCy token is very
+long, like a long URL or a noisy string, or if you're using a **pretrained
+pipeline** like `en_core_web_trf` with a fixed `window` size for
+`strided_spans`. In this case, you need to modify either your texts or your
+pipeline so that you have shorter spaCy tokens. Some options:
+
+- Preprocess your texts to clean up noise and split long tokens with whitespace
+- Add a `token_splitter` to the beginning of your pipeline to break up
+  tokens that are longer than a specified length:
+
+```python
+config={"min_length": 20, "split_length": 5}
+nlp.add_pipe("token_splitter", config=config, first=True)
+```
+
+In this example, tokens that are at least 20 characters long will be split up
+into smaller tokens of 5 characters each, resulting in strided spans that
+correspond to fewer transformer tokens.
+
 ## Static vectors {#static-vectors}

 If your pipeline includes a **word vectors table**, you'll be able to use the
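For the `strided_spans` option mentioned in the list above, lowering `window` and `stride` goes through the transformer component's span getter config. A hedged sketch, assuming `spacy-transformers` is installed (the values 64/48 are illustrative, not defaults):

```python
import spacy

nlp = spacy.blank("en")
# Override only the span getter; the remaining transformer settings are
# merged in from the factory's default config.
nlp.add_pipe(
    "transformer",
    config={
        "model": {
            "get_spans": {
                "@span_getters": "spacy-transformers.strided_spans.v1",
                "window": 64,   # shorter windows keep spans under the model max length
                "stride": 48,   # overlap between consecutive windows
            }
        }
    },
)
```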
@@ -221,7 +221,7 @@ Noun chunks are "base noun phrases" – flat phrases that have a noun as their
 head. You can think of noun chunks as a noun plus the words describing the noun
 – for example, "the lavish green grass" or "the world’s largest tech fund". To
 get the noun chunks in a document, simply iterate over
-[`Doc.noun_chunks`](/api/doc#noun_chunks)
+[`Doc.noun_chunks`](/api/doc#noun_chunks).

 ```python
 ### {executable="true"}
@@ -2139,7 +2139,7 @@
         "from negspacy.negation import Negex",
         "",
         "nlp = spacy.load(\"en_core_web_sm\")",
-        "negex = Negex(nlp, ent_types=[\"PERSON\",\"ORG\"])",
+        "negex = Negex(nlp, ent_types=[\"PERSON','ORG\"])",
         "nlp.add_pipe(negex, last=True)",
         "",
         "doc = nlp(\"She does not like Steve Jobs but likes Apple products.\")",
website/package-lock.json (generated file, 15978 lines): diff suppressed because it is too large.
@@ -3,7 +3,7 @@
   "private": true,
   "description": "spaCy website",
   "version": "3.0.0",
-  "author": "Explosion AI <contact@explosion.ai>",
+  "author": "Explosion <contact@explosion.ai>",
   "license": "MIT",
   "dependencies": {
     "@jupyterlab/outputarea": "^0.19.1",
@@ -16,7 +16,7 @@
     "autoprefixer": "^9.4.7",
     "classnames": "^2.2.6",
     "codemirror": "^5.43.0",
-    "gatsby": "^2.1.18",
+    "gatsby": "^2.11.1",
     "gatsby-image": "^2.0.29",
     "gatsby-mdx": "^0.3.6",
     "gatsby-plugin-catch-links": "^2.0.11",
@@ -24,12 +24,14 @@
     "gatsby-plugin-offline": "^2.0.24",
     "gatsby-plugin-plausible": "0.0.6",
     "gatsby-plugin-react-helmet": "^3.0.6",
-    "gatsby-plugin-react-svg": "^2.1.2",
+    "gatsby-plugin-react-svg": "^2.0.0",
+    "gatsby-plugin-robots-txt": "^1.5.1",
     "gatsby-plugin-sass": "^2.0.10",
     "gatsby-plugin-sharp": "^2.0.20",
     "gatsby-plugin-sitemap": "^2.0.5",
     "gatsby-plugin-svgr": "^2.0.1",
     "gatsby-remark-copy-linked-files": "^2.0.9",
+    "gatsby-remark-find-replace": "^0.3.0",
     "gatsby-remark-images": "^3.0.4",
     "gatsby-remark-prismjs": "^3.2.4",
     "gatsby-remark-smartypants": "^2.0.8",
@@ -39,9 +41,11 @@
     "gatsby-transformer-sharp": "^2.1.13",
     "html-to-react": "^1.3.4",
     "intersection-observer": "^0.5.1",
+    "jinja-to-js": "^3.2.3",
     "node-sass": "^4.11.0",
     "parse-numeric-range": "0.0.2",
     "prismjs": "^1.15.0",
+    "prismjs-bibtex": "^1.1.0",
     "prop-types": "^15.7.2",
     "react": "^16.8.2",
     "react-dom": "^16.8.2",
@@ -50,19 +54,22 @@
     "remark-react": "^5.0.1"
   },
   "scripts": {
-    "build": "gatsby build",
-    "dev": "gatsby develop",
+    "build": "npm run python:install && npm run python:setup && gatsby build",
+    "dev": "npm run python:setup && gatsby develop",
+    "dev:nightly": "BRANCH=nightly.spacy.io npm run dev",
     "lint": "eslint **",
     "clear": "rm -rf .cache",
-    "test": "echo \"Write tests! -> https://gatsby.app/unit-testing\""
+    "test": "echo \"Write tests! -> https://gatsby.app/unit-testing\"",
+    "python:install": "pip install -r setup/requirements.txt",
+    "python:setup": "cd setup && sh setup.sh"
   },
   "devDependencies": {
+    "@sindresorhus/slugify": "^0.8.0",
     "browser-monads": "^1.0.0",
     "md-attr-parser": "^1.2.1",
     "prettier": "^1.16.4",
     "raw-loader": "^1.0.0",
-    "unist-util-visit": "^1.4.0",
-    "@sindresorhus/slugify": "^0.8.0"
+    "unist-util-visit": "^1.4.0"
   },
   "repository": {
     "type": "git",
@@ -6,7 +6,7 @@ import classNames from 'classnames'
 import Link from './link'
 import Grid from './grid'
 import Newsletter from './newsletter'
-import ExplosionLogo from '-!svg-react-loader!../images/explosion.svg'
+import { ReactComponent as ExplosionLogo } from '../images/explosion.svg'
 import classes from '../styles/footer.module.sass'

 export default function Footer({ wide = false }) {
@@ -1,4 +1,4 @@
-import React from 'react'
+import React, { Fragment } from 'react'
 import PropTypes from 'prop-types'
 import classNames from 'classnames'

@@ -19,7 +19,13 @@ import NoIcon from '-!svg-react-loader!../images/icons/no.svg'
 import NeutralIcon from '-!svg-react-loader!../images/icons/neutral.svg'
 import OfflineIcon from '-!svg-react-loader!../images/icons/offline.svg'
 import SearchIcon from '-!svg-react-loader!../images/icons/search.svg'
+import MoonIcon from '-!svg-react-loader!../images/icons/moon.svg'
+import ClipboardIcon from '-!svg-react-loader!../images/icons/clipboard.svg'
+import NetworkIcon from '-!svg-react-loader!../images/icons/network.svg'
+import DownloadIcon from '-!svg-react-loader!../images/icons/download.svg'
+import PackageIcon from '-!svg-react-loader!../images/icons/package.svg'

+import { isString } from './util'
 import classes from '../styles/icon.module.sass'

 const icons = {
@@ -41,9 +47,22 @@ const icons = {
     neutral: NeutralIcon,
     offline: OfflineIcon,
     search: SearchIcon,
+    moon: MoonIcon,
+    clipboard: ClipboardIcon,
+    network: NetworkIcon,
+    download: DownloadIcon,
+    package: PackageIcon,
 }

-const Icon = ({ name, width, height, inline, variant, className }) => {
+export default function Icon({
+    name,
+    width = 20,
+    height,
+    inline = false,
+    variant,
+    className,
+    ...props
+}) {
     const IconComponent = icons[name]
     const iconClassNames = classNames(classes.root, className, {
         [classes.inline]: inline,
@@ -57,15 +76,11 @@ const Icon = ({ name, width, height, inline, variant, className }) => {
             aria-hidden="true"
             width={width}
             height={height || width}
+            {...props}
         />
     )
 }

-Icon.defaultProps = {
-    width: 20,
-    inline: false,
-}
-
 Icon.propTypes = {
     name: PropTypes.oneOf(Object.keys(icons)),
     width: PropTypes.number,
@@ -75,4 +90,43 @@ Icon.propTypes = {
     className: PropTypes.string,
 }

-export default Icon
+export function replaceEmoji(cellChildren) {
+    const icons = {
+        '✅': { name: 'yes', variant: 'success', 'aria-label': 'positive' },
+        '❌': { name: 'no', variant: 'error', 'aria-label': 'negative' },
+    }
+    const iconRe = new RegExp(`^(${Object.keys(icons).join('|')})`, 'g')
+    let children = isString(cellChildren) ? [cellChildren] : cellChildren
+    let hasIcon = false
+    if (Array.isArray(children)) {
+        children = children.map((child, i) => {
+            if (isString(child)) {
+                const icon = icons[child.trim()]
+                if (icon) {
+                    hasIcon = true
+                    return (
+                        <Icon
+                            {...icon}
+                            inline={i < children.length}
+                            aria-hidden={undefined}
+                            key={i}
+                        />
+                    )
+                } else if (iconRe.test(child)) {
+                    hasIcon = true
+                    const [, iconName, text] = child.split(iconRe)
+                    return (
+                        <Fragment key={i}>
+                            <Icon {...icons[iconName]} aria-hidden={undefined} inline={true} />
+                            {text.replace(/^\s+/g, '')}
+                        </Fragment>
+                    )
+                }
+                // Work around prettier auto-escape
+                if (child.startsWith('\\')) return child.slice(1)
+            }
+            return child
+        })
+    }
+    return { content: children, hasIcon }
+}
@@ -6,7 +6,7 @@ import Link from './link'
 import Icon from './icon'
 import Dropdown from './dropdown'
 import { github } from './util'
-import Logo from '-!svg-react-loader!../images/logo.svg'
+import { ReactComponent as Logo } from '../images/logo.svg'
 import classes from '../styles/navigation.module.sass'

 const NavigationDropdown = ({ items = [], section }) => {
@@ -1,31 +0,0 @@
-import AirbnbLogo from '-!svg-react-loader!./airbnb.svg'
-import UberLogo from '-!svg-react-loader!./uber.svg'
-import QuoraLogo from '-!svg-react-loader!./quora.svg'
-import RetrieverLogo from '-!svg-react-loader!./retriever.svg'
-import StitchfixLogo from '-!svg-react-loader!./stitchfix.svg'
-import ChartbeatLogo from '-!svg-react-loader!./chartbeat.svg'
-import AllenAILogo from '-!svg-react-loader!./allenai.svg'
-
-import RecodeLogo from '-!svg-react-loader!./recode.svg'
-import WapoLogo from '-!svg-react-loader!./wapo.svg'
-import BBCLogo from '-!svg-react-loader!./bbc.svg'
-import MicrosoftLogo from '-!svg-react-loader!./microsoft.svg'
-import VenturebeatLogo from '-!svg-react-loader!./venturebeat.svg'
-import ThoughtworksLogo from '-!svg-react-loader!./thoughtworks.svg'
-
-export default {
-    airbnb: AirbnbLogo,
-    uber: UberLogo,
-    quora: QuoraLogo,
-    retriever: RetrieverLogo,
-    stitchfix: StitchfixLogo,
-    chartbeat: ChartbeatLogo,
-    allenai: AllenAILogo,
-
-    recode: RecodeLogo,
-    wapo: WapoLogo,
-    bbc: BBCLogo,
-    microsoft: MicrosoftLogo,
-    venturebeat: VenturebeatLogo,
-    thoughtworks: ThoughtworksLogo,
-}
@@ -4,7 +4,7 @@ import Grid from '../components/grid'
 import { Label } from '../components/typography'
 import Link from '../components/link'

-import Logo from '-!svg-react-loader!../images/logo.svg'
+import { ReactComponent as Logo } from '../images/logo.svg'
 import patternBlue from '../images/pattern_blue.jpg'
 import patternGreen from '../images/pattern_green.jpg'
 import patternPurple from '../images/pattern_purple.jpg'