Merge branch 'develop' into feature/spacy-legacy

2025-09-19 02:22:43 +03:00 · 2021-01-18 11:43:39 +11:00 · 2021-01-18 11:43:39 +11:00 · 1090d3d675
commit 1090d3d675
parent a552db2819 09cacbb7ee
30 changed files with 29035 additions and 28725 deletions
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -463,12 +463,14 @@ class Errors:
            "issue tracker: http://github.com/explosion/spaCy/issues")

    # TODO: fix numbering after merging develop into master
+    E892 = ("Unknown function registry: '{name}'.\n\nAvailable names: {available}")
    E893 = ("Could not find function '{name}' in function registry '{reg_name}'. "
            "If you're using a custom function, make sure the code is available. "
            "If the function is provided by a third-party package, e.g. "
            "spacy-transformers, make sure the package is installed in your "
            "environment.\n\nAvailable names: {available}")
-    E894 = ("Unknown function registry: '{name}'.\n\nAvailable names: {available}")
+    E894 = ("The 'noun_chunks' syntax iterator is not implemented for language "
+            "'{lang}'.")
    E895 = ("The 'textcat' component received gold-standard annotations with "
            "multiple labels per document. In spaCy 3 you should use the "
            "'textcat_multilabel' component for this instead. "
--- a/spacy/lang/he/lex_attrs.py
+++ b/spacy/lang/he/lex_attrs.py
@ -86,7 +86,7 @@ def like_num(text):
    if text in _num_words:
        return True

-    # CHeck ordinal number
+    # Check ordinal number
    if text in _ordinal_words:
        return True
    return False
--- a/spacy/lang/mk/lemmatizer.py
+++ b/spacy/lang/mk/lemmatizer.py
@ -18,8 +18,6 @@ class MacedonianLemmatizer(Lemmatizer):
            string = string[:-3]
            univ_pos = "verb"

-        if callable(self.is_base_form) and self.is_base_form(univ_pos, morphology):
-            return [string.lower()]
        index_table = self.lookups.get_table("lemma_index", {})
        exc_table = self.lookups.get_table("lemma_exc", {})
        rules_table = self.lookups.get_table("lemma_rules", {})
--- a/spacy/language.py
+++ b/spacy/language.py
@ -697,6 +697,8 @@ class Language:
        source_config = source.config.interpolate()
        pipe_config = util.copy_config(source_config["components"][source_name])
        self._pipe_configs[name] = pipe_config
+        for s in source.vocab.strings:
+            self.vocab.strings.add(s)
        return pipe, pipe_config["factory"]

    def add_pipe(
@ -1619,9 +1621,7 @@ class Language:
                    if model not in source_nlps:
                        # We only need the components here and we need to init
                        # model with the same vocab as the current nlp object
-                        source_nlps[model] = util.load_model(
-                            model, vocab=nlp.vocab, disable=["vocab", "tokenizer"]
-                        )
+                        source_nlps[model] = util.load_model(model, vocab=nlp.vocab)
                    source_name = pipe_cfg.get("component", pipe_name)
                    nlp.add_pipe(source_name, source=source_nlps[model], name=pipe_name)
        disabled_pipes = [*config["nlp"]["disabled"], *disable]
--- a/spacy/pipeline/_parser_internals/arc_eager.pyx
+++ b/spacy/pipeline/_parser_internals/arc_eager.pyx
@ -197,13 +197,39 @@ cdef class ArcEagerGold:
        self.mem = Pool()
        heads, labels = example.get_aligned_parse(projectivize=True)
        labels = [example.x.vocab.strings.add(label) if label is not None else MISSING_DEP for label in labels]
-        sent_starts = example.get_aligned_sent_starts()
+        sent_starts = _get_aligned_sent_starts(example)
        assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts))
        self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts)

    def update(self, StateClass stcls):
        update_gold_state(&self.c, stcls.c)

+def _get_aligned_sent_starts(example):
+    """Get list of SENT_START attributes aligned to the predicted tokenization.
+    If the reference has no sentence starts, return a list of None values.
+
+    This function is slightly different from the one on Example, because we also
+    check whether any token in the predicted doc aligns to more than one
+    reference sentence, and return missing values if so. This prevents a problem
+    where the start of a sentence is merged onto a token that belongs to two sentences.
+    """
+    if example.y.has_annotation("SENT_START"):
+        align = example.alignment.y2x
+        sent_starts = [False] * len(example.x)
+        seen_words = set()
+        for y_sent in example.y.sents:
+            x_indices = list(align[y_sent.start : y_sent.end].dataXd)
+            if any(x_idx in seen_words for x_idx in x_indices):
+                # If there are any tokens in X that align across two sentences,
+                # regard the sentence annotations as missing, as we can't
+                # reliably use them.
+                return [None] * len(example.x)
+            seen_words.update(x_indices)
+            sent_starts[x_indices[0]] = True
+        return sent_starts
+    else:
+        return [None] * len(example.x)
+

 cdef int check_state_gold(char state_bits, char flag) nogil:
    cdef char one = 1
@ -820,7 +846,7 @@ cdef class ArcEager(TransitionSystem):
            else:
                failed = False
                break
-        if failed:
+        if failed and _debug not in (False, None):
            example = _debug
            print("Actions")
            for i in range(self.n_moves):
--- a/spacy/pipeline/functions.py
+++ b/spacy/pipeline/functions.py
@ -1,7 +1,11 @@
+import srsly
+from thinc.api import Config
+from typing import Dict, Any
 from ..language import Language
 from ..matcher import Matcher
 from ..tokens import Doc
 from ..util import filter_spans
+from .. import util


@Language.component(
@ -65,3 +69,77 @@ def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc:
        for span in spans:
            retokenizer.merge(span)
    return doc
+
+
+@Language.factory(
+    "token_splitter",
+    default_config={"min_length": 25, "split_length": 10},
+    retokenizes=True,
+)
+def make_token_splitter(
+    nlp: Language,
+    name: str,
+    *,
+    min_length=0,
+    split_length=0,
+):
+    return TokenSplitter(
+        min_length=min_length, split_length=split_length
+    )
+
+
+class TokenSplitter:
+    def __init__(self, min_length: int = 0, split_length: int = 0):
+        self.min_length = min_length
+        self.split_length = split_length
+
+    def __call__(self, doc: Doc) -> Doc:
+        if self.min_length > 0 and self.split_length > 0:
+            with doc.retokenize() as retokenizer:
+                for t in doc:
+                    if len(t.text) >= self.min_length:
+                        orths = []
+                        heads = []
+                        attrs = {}
+                        for i in range(0, len(t.text), self.split_length):
+                            orths.append(t.text[i : i + self.split_length])
+                            heads.append((t, i / self.split_length))
+                        retokenizer.split(t, orths, heads, attrs)
+        return doc
+
+    def _get_config(self) -> Dict[str, Any]:
+        return {
+            "min_length": self.min_length,
+            "split_length": self.split_length,
+        }
+
+    def _set_config(self, config: Dict[str, Any] = {}) -> None:
+        self.min_length = config.get("min_length", 0)
+        self.split_length = config.get("split_length", 0)
+
+    def to_bytes(self, **kwargs):
+        serializers = {
+            "cfg": lambda: srsly.json_dumps(self._get_config()),
+        }
+        return util.to_bytes(serializers, [])
+
+    def from_bytes(self, data, **kwargs):
+        deserializers = {
+            "cfg": lambda b: self._set_config(srsly.json_loads(b)),
+        }
+        util.from_bytes(data, deserializers, [])
+        return self
+
+    def to_disk(self, path, **kwargs):
+        path = util.ensure_path(path)
+        serializers = {
+            "cfg": lambda p: srsly.write_json(p, self._get_config()),
+        }
+        return util.to_disk(path, serializers, [])
+
+    def from_disk(self, path, **kwargs):
+        path = util.ensure_path(path)
+        serializers = {
+            "cfg": lambda p: self._set_config(srsly.read_json(p)),
+        }
+        util.from_disk(path, serializers, [])
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@ -145,6 +145,10 @@ class Morphologizer(Tagger):
            for example in get_examples():
                for i, token in enumerate(example.reference):
                    pos = token.pos_
+                    # if both are unset, annotation is missing, so do not add
+                    # an empty label
+                    if pos == "" and not token.has_morph():
+                        continue
                    morph = str(token.morph)
                    # create and add the combined morph+POS label
                    morph_dict = Morphology.feats_to_dict(morph)
@ -155,7 +159,7 @@ class Morphologizer(Tagger):
                    if norm_label not in self.cfg["labels_morph"]:
                        self.cfg["labels_morph"][norm_label] = morph
                        self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
-        if len(self.labels) <= 1:
+        if len(self.labels) < 1:
            raise ValueError(Errors.E143.format(name=self.name))
        doc_sample = []
        label_sample = []
@ -217,15 +221,24 @@ class Morphologizer(Tagger):
                pos = pos_tags[i]
                morph = morphs[i]
                # POS may align (same value for multiple tokens) when morph
-                # doesn't, so if either is None, treat both as None here so that
-                # truths doesn't end up with an unknown morph+POS combination
+                # doesn't, so if either is misaligned (None), treat the
+                # annotation as missing so that truths doesn't end up with an
+                # unknown morph+POS combination
                if pos is None or morph is None:
                    label = None
+                # If both are unset, the annotation is missing (empty morph
+                # converted from int is "_" rather than "")
+                elif pos == "" and morph == "":
+                    label = None
+                # Otherwise, generate the combined label
                else:
                    label_dict = Morphology.feats_to_dict(morph)
                    if pos:
                        label_dict[self.POS_FEAT] = pos
                    label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
+                    # As a fail-safe, skip any unrecognized labels
+                    if label not in self.labels:
+                        label = None
                eg_truths.append(label)
            truths.append(eg_truths)
        d_scores, loss = loss_func(scores, truths)
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@ -2,6 +2,8 @@ import pytest
 import numpy
 import logging
 import mock
+
+from spacy.lang.xx import MultiLanguage
 from spacy.tokens import Doc, Span
 from spacy.vocab import Vocab
 from spacy.lexeme import Lexeme
@ -633,6 +635,14 @@ def test_doc_set_ents_invalid_spans(en_tokenizer):
        doc.ents = spans


+def test_doc_noun_chunks_not_implemented():
+    """Test that a language without a noun_chunks iterator raises a NotImplementedError"""
+    text = "Může data vytvářet a spravovat, ale především je dokáže analyzovat, najít v nich nové vztahy a vše přehledně vizualizovat."
+    nlp = MultiLanguage()
+    doc = nlp(text)
+    with pytest.raises(NotImplementedError):
+        chunks = list(doc.noun_chunks)
+
 def test_span_groups(en_tokenizer):
    doc = en_tokenizer("Some text about Colombia and the Czech Republic")
    doc.spans["hi"] = [Span(doc, 3, 4, label="bye")]
--- a/spacy/tests/lang/en/test_noun_chunks.py
+++ b/spacy/tests/lang/en/test_noun_chunks.py
@ -1,11 +1,16 @@
-import numpy
-from spacy.attrs import HEAD, DEP
-from spacy.symbols import nsubj, dobj, amod, nmod, conj, cc, root
-from spacy.lang.en.syntax_iterators import noun_chunks
 from spacy.tokens import Doc
 import pytest


+@pytest.fixture
+def doc(en_vocab):
+    words = ["Peter", "has", "chronic", "command", "and", "control", "issues"]
+    heads = [1, 1, 6, 6, 3, 3, 1]
+    deps = ["nsubj", "ROOT", "amod", "nmod", "cc", "conj", "dobj"]
+    pos = ["PROPN", "VERB", "ADJ", "NOUN", "CCONJ", "NOUN", "NOUN"]
+    return Doc(en_vocab, words=words, heads=heads, deps=deps, pos=pos)
+
+
 def test_noun_chunks_is_parsed(en_tokenizer):
    """Test that noun_chunks raises Value Error for 'en' language if Doc is not parsed."""
    doc = en_tokenizer("This is a sentence")
@ -13,31 +18,27 @@ def test_noun_chunks_is_parsed(en_tokenizer):
        list(doc.noun_chunks)


-def test_en_noun_chunks_not_nested(en_vocab):
-    words = ["Peter", "has", "chronic", "command", "and", "control", "issues"]
-    heads = [1, 1, 6, 6, 3, 3, 1]
-    deps = ["nsubj", "ROOT", "amod", "nmod", "cc", "conj", "dobj"]
-    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
-    doc.from_array(
-        [HEAD, DEP],
-        numpy.asarray(
-            [
-                [1, nsubj],
-                [0, root],
-                [4, amod],
-                [3, nmod],
-                [-1, cc],
-                [-2, conj],
-                [-5, dobj],
-            ],
-            dtype="uint64",
-        ),
-    )
-    doc.noun_chunks_iterator = noun_chunks
+def test_en_noun_chunks_not_nested(doc, en_vocab):
+    """Test that each token only appears in one noun chunk at most"""
    word_occurred = {}
-    for chunk in doc.noun_chunks:
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) > 1
+    for chunk in chunks:
        for word in chunk:
            word_occurred.setdefault(word.text, 0)
            word_occurred[word.text] += 1
+    assert len(word_occurred) > 0
    for word, freq in word_occurred.items():
        assert freq == 1, (word, [chunk.text for chunk in doc.noun_chunks])
+
+
+def test_noun_chunks_span(doc, en_tokenizer):
+    """Test that the span.noun_chunks property works correctly"""
+    doc_chunks = list(doc.noun_chunks)
+    span = doc[0:3]
+    span_chunks = list(span.noun_chunks)
+    assert 0 < len(span_chunks) < len(doc_chunks)
+    for chunk in span_chunks:
+        assert chunk in doc_chunks
+        assert chunk.start >= 0
+        assert chunk.end <= 3
--- a/spacy/tests/pipeline/test_functions.py
+++ b/spacy/tests/pipeline/test_functions.py
@ -53,3 +53,24 @@ def test_factories_merge_ents(doc2):
    assert len(doc2) == 6
    assert len(list(doc2.ents)) == 1
    assert doc2[2].text == "New York"
+
+
+def test_token_splitter():
+    nlp = Language()
+    config = {"min_length": 20, "split_length": 5}
+    token_splitter = nlp.add_pipe("token_splitter", config=config)
+    doc = nlp("aaaaabbbbbcccccdddd e f g")
+    assert [t.text for t in doc] == ["aaaaabbbbbcccccdddd", "e", "f", "g"]
+    doc = nlp("aaaaabbbbbcccccdddddeeeeeff g h i")
+    assert [t.text for t in doc] == [
+        "aaaaa",
+        "bbbbb",
+        "ccccc",
+        "ddddd",
+        "eeeee",
+        "ff",
+        "g",
+        "h",
+        "i",
+    ]
+    assert all(len(t.text) <= token_splitter.split_length for t in doc)
--- a/spacy/tests/pipeline/test_morphologizer.py
+++ b/spacy/tests/pipeline/test_morphologizer.py
@ -136,3 +136,28 @@ def test_overfitting_IO():
    gold_pos_tags = ["", "", "", ""]
    assert [str(t.morph) for t in doc] == gold_morphs
    assert [t.pos_ for t in doc] == gold_pos_tags
+
+    # Test with unset morph and partial POS
+    nlp.remove_pipe("morphologizer")
+    nlp.add_pipe("morphologizer")
+    for example in train_examples:
+        for token in example.reference:
+            if token.text == "ham":
+                token.pos_ = "NOUN"
+            else:
+                token.pos_ = ""
+            token.set_morph(None)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+    print(nlp.get_pipe("morphologizer").labels)
+    for i in range(50):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["morphologizer"] < 0.00001
+
+    # Test the trained model
+    test_text = "I like blue ham"
+    doc = nlp(test_text)
+    gold_morphs = ["", "", "", ""]
+    gold_pos_tags = ["NOUN", "NOUN", "NOUN", "NOUN"]
+    assert [str(t.morph) for t in doc] == gold_morphs
+    assert [t.pos_ for t in doc] == gold_pos_tags
--- a/spacy/tests/regression/test_issue3001-3500.py
+++ b/spacy/tests/regression/test_issue3001-3500.py
@ -81,7 +81,8 @@ def test_issue3199():
    """
    words = ["This", "is", "a", "sentence"]
    doc = Doc(Vocab(), words=words, heads=[0] * len(words), deps=["dep"] * len(words))
-    assert list(doc[0:3].noun_chunks) == []
+    with pytest.raises(NotImplementedError):
+        list(doc[0:3].noun_chunks)


 def test_issue3209():
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -816,8 +816,10 @@ cdef class Doc:
    @property
    def noun_chunks(self):
        """Iterate over the base noun phrases in the document. Yields base
-        noun-phrase #[code Span] objects, if the document has been
-        syntactically parsed. A base noun phrase, or "NP chunk", is a noun
+        noun-phrase #[code Span] objects, if the language has a noun chunk iterator.
+        Raises a NotImplementedError otherwise.
+
+        A base noun phrase, or "NP chunk", is a noun
        phrase that does not permit other NPs to be nested within it – so no
        NP-level coordination, no prepositional phrases, and no relative
        clauses.
@ -826,16 +828,17 @@ cdef class Doc:

        DOCS: https://nightly.spacy.io/api/doc#noun_chunks
        """
+        if self.noun_chunks_iterator is None:
+            raise NotImplementedError(Errors.E894.format(lang=self.vocab.lang))

        # Accumulate the result before beginning to iterate over it. This
-        # prevents the tokenisation from being changed out from under us
+        # prevents the tokenization from being changed out from under us
        # during the iteration. The tricky thing here is that Span accepts
-        # its tokenisation changing, so it's okay once we have the Span
+        # its tokenization changing, so it's okay once we have the Span
        # objects. See Issue #375.
        spans = []
-        if self.noun_chunks_iterator is not None:
-            for start, end, label in self.noun_chunks_iterator(self):
-                spans.append(Span(self, start, end, label=label))
+        for start, end, label in self.noun_chunks_iterator(self):
+            spans.append(Span(self, start, end, label=label))
        for span in spans:
            yield span

--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@ -487,30 +487,25 @@ cdef class Span:
        """
        return "".join([t.text_with_ws for t in self])

+
    @property
    def noun_chunks(self):
-        """Yields base noun-phrase `Span` objects, if the document has been
-        syntactically parsed. A base noun phrase, or "NP chunk", is a noun
+        """Iterate over the base noun phrases in the span. Yields base
+        noun-phrase `Span` objects, if the language has a noun chunk iterator.
+        Raises a NotImplementedError otherwise.
+
+        A base noun phrase, or "NP chunk", is a noun
        phrase that does not permit other NPs to be nested within it – so no
        NP-level coordination, no prepositional phrases, and no relative
        clauses.

-        YIELDS (Span): Base noun-phrase `Span` objects.
+        YIELDS (Span): Noun chunks in the span.

        DOCS: https://nightly.spacy.io/api/span#noun_chunks
        """
-        # Accumulate the result before beginning to iterate over it. This
-        # prevents the tokenisation from being changed out from under us
-        # during the iteration. The tricky thing here is that Span accepts
-        # its tokenisation changing, so it's okay once we have the Span
-        # objects. See Issue #375
-        spans = []
-        cdef attr_t label
-        if self.doc.noun_chunks_iterator is not None:
-            for start, end, label in self.doc.noun_chunks_iterator(self):
-                spans.append(Span(self.doc, start, end, label=label))
-        for span in spans:
-            yield span
+        for span in self.doc.noun_chunks:
+            if span.start >= self.start and span.end <= self.end:
+                yield span

    @property
    def root(self):
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@ -211,6 +211,14 @@ cdef class Token:
        xp = get_array_module(vector)
        return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))

+    def has_morph(self):
+        """Check whether the token has annotated morph information.
+        Return False when the morph annotation is unset/missing.
+
+        RETURNS (bool): Whether the morph annotation is set.
+        """
+        return not self.c.morph == 0
+
    property morph:
        def __get__(self):
            return MorphAnalysis.from_id(self.vocab, self.c.morph)
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@ -200,10 +200,6 @@ cdef class Example:
    def get_aligned_sent_starts(self):
        """Get list of SENT_START attributes aligned to the predicted tokenization.
        If the reference has not sentence starts, return a list of None values.
-
-        The aligned sentence starts use the get_aligned_spans method, rather
-        than aligning the list of tags, so that it handles cases where a mistaken
-        tokenization starts the sentence.
        """
        if self.y.has_annotation("SENT_START"):
            align = self.alignment.y2x
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@ -616,11 +616,15 @@ phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be
 nested within it – so no NP-level coordination, no prepositional phrases, and no
 relative clauses.

+If the `noun_chunks` [syntax iterator](/usage/adding-languages#language-data) has
+not been implemented for the given language, a `NotImplementedError` is raised.
+
 > #### Example
 >
 > ```python
 > doc = nlp("A phrase with another phrase occurs.")
 > chunks = list(doc.noun_chunks)
+> assert len(chunks) == 2
 > assert chunks[0].text == "A phrase"
 > assert chunks[1].text == "another phrase"
 > ```
--- a/website/docs/api/pipeline-functions.md
+++ b/website/docs/api/pipeline-functions.md
@ -6,6 +6,7 @@ menu:
  - ['merge_noun_chunks', 'merge_noun_chunks']
  - ['merge_entities', 'merge_entities']
  - ['merge_subtokens', 'merge_subtokens']
+  - ['token_splitter', 'token_splitter']
 ---

 ## merge_noun_chunks {#merge_noun_chunks tag="function"}
@ -107,3 +108,25 @@ end of the pipeline and after all other components.
 | `doc`       | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ |
 | `label`     | The subtoken dependency label. Defaults to `"subtok"`. ~~str~~       |
 | **RETURNS** | The modified `Doc` with merged subtokens. ~~Doc~~                    |
+
+## token_splitter {#token_splitter tag="function" new="3.0"}
+
+Split tokens of at least a minimum length into shorter tokens. Intended for use
+with transformer pipelines where long spaCy tokens lead to input text that
+exceeds the transformer model max length. See
+[managing transformer model max length limitations](/usage/embeddings-transformers#transformer-max-length).
+
+> #### Example
+>
+> ```python
+> config={"min_length": 20, "split_length": 5}
+> nlp.add_pipe("token_splitter", config=config, first=True)
+> doc = nlp("aaaaabbbbbcccccdddddee")
+> print([token.text for token in doc])
+> # ['aaaaa', 'bbbbb', 'ccccc', 'ddddd', 'ee']
+> ```
+
+| Setting        | Description                                                           |
+| -------------- | --------------------------------------------------------------------- |
+| `min_length`   | The minimum length for a token to be split. Defaults to `25`. ~~int~~ |
+| `split_length` | The length of the split tokens. Defaults to `10`. ~~int~~             |
--- a/website/docs/api/span.md
+++ b/website/docs/api/span.md
@ -187,7 +187,7 @@ the character indices don't map to a valid span.
 | Name                                 | Description                                                                               |
 | ------------------------------------ | ----------------------------------------------------------------------------------------- |
 | `start`                              | The index of the first character of the span. ~~int~~                                     |
-| `end`                                | The index of the last character after the span. ~~int~~                                    |
+| `end`                                | The index of the last character after the span. ~~int~~                                   |
 | `label`                              | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~               |
 | `kb_id` <Tag variant="new">2.2</Tag> | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
 | `vector`                             | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~            |
@ -274,6 +274,31 @@ if the entity recognizer has been applied.
 | ----------- | ----------------------------------------------------------------- |
 | **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span, ...]~~ |

+## Span.noun_chunks {#noun_chunks tag="property" model="parser"}
+
+Iterate over the base noun phrases in the span. Yields base noun-phrase `Span`
+objects, if the document has been syntactically parsed. A base noun phrase, or
+"NP chunk", is a noun phrase that does not permit other NPs to be nested within
+it – so no NP-level coordination, no prepositional phrases, and no relative
+clauses.
+
+If the `noun_chunks` [syntax iterator](/usage/adding-languages#language-data) has
+not been implemented for the given language, a `NotImplementedError` is raised.
+
+> #### Example
+>
+> ```python
+> doc = nlp("A phrase with another phrase occurs.")
+> span = doc[3:5]
+> chunks = list(span.noun_chunks)
+> assert len(chunks) == 1
+> assert chunks[0].text == "another phrase"
+> ```
+
+| Name       | Description                       |
+| ---------- | --------------------------------- |
+| **YIELDS** | Noun chunks in the span. ~~Span~~ |
+
 ## Span.as_doc {#as_doc tag="method"}

 Create a new `Doc` object corresponding to the `Span`, with a copy of the data.
--- a/website/docs/api/token.md
+++ b/website/docs/api/token.md
@ -191,6 +191,15 @@ the morph to an unset state.
 | -------- | --------------------------------------------------------------------------------- |
 | features | The morphological features to set. ~~Union[int, dict, str, MorphAnalysis, None]~~ |

+## Token.has_morph {#has_morph tag="method"}
+
+Check whether the token has annotated morph information. Return `False` when the
+morph annotation is unset/missing.
+
+| Name        | Description                                   |
+| ----------- | --------------------------------------------- |
+| **RETURNS** | Whether the morph annotation is set. ~~bool~~ |
+
 ## Token.is_ancestor {#is_ancestor tag="method" model="parser"}

 Check whether this token is a parent, grandparent, etc. of another in the
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@ -481,6 +481,50 @@ custom learning rate for each component. Instead of a constant, you can also
 provide a schedule, allowing you to freeze the shared parameters at the start of
 training.

+### Managing transformer model max length limitations {#transformer-max-length}
+
+Many transformer models have a limit on the maximum number of tokens that the
+model can process, for example BERT models are limited to 512 tokens. This limit
+refers to the number of transformer tokens (BPE, WordPiece, etc.), not the
+number of spaCy tokens.
+
+To be able to process longer texts, the spaCy [`transformer`](/api/transformer)
+component uses [`span_getters`](/api/transformer#span_getters) to convert a
+batch of [`Doc`](/api/doc) objects into lists of [`Span`](/api/span) objects. A
+span may correspond to a doc (for `doc_spans`), a sentence (for `sent_spans`) or
+a window of spaCy tokens (`strided_spans`). If a single span corresponds to more
+transformer tokens than the transformer model supports, the spaCy pipeline can't
+process the text because some spaCy tokens would be left without an analysis.
+
+In general, it is up to the transformer pipeline user to manage the input texts
+so that the model max length is not exceeded. If you're training a **new
+pipeline**, you have a number of options to handle the max length limit:
+
+- Use `doc_spans` with short texts only
+- Use `sent_spans` with short sentences only
+- For `strided_spans`, lower the `window` size to be short enough for your input
+  texts (and don't forget to lower the `stride` correspondingly)
+- Implement a [custom span getter](#transformers-training-custom-settings)
+
+You may still run into the max length limit if a single spaCy token is very
+long, like a long URL or a noisy string, or if you're using a **pretrained
+pipeline** like `en_core_web_trf` with a fixed `window` size for
+`strided_spans`. In this case, you need to modify either your texts or your
+pipeline so that you have shorter spaCy tokens. Some options:
+
+- Preprocess your texts to clean up noise and split long tokens with whitespace
+- Add a `token_splitter` to the beginning of your pipeline to break up
+  tokens that are longer than a specified length:
+
+  ```python
+  config={"min_length": 20, "split_length": 5}
+  nlp.add_pipe("token_splitter", config=config, first=True)
+  ```
+
+  In this example, tokens that are at least 20 characters long will be split up
+  into smaller tokens of at most 5 characters each, resulting in strided spans
+  that correspond to fewer transformer tokens.
+
 ## Static vectors {#static-vectors}

 If your pipeline includes a **word vectors table**, you'll be able to use the
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@ -221,7 +221,7 @@ Noun chunks are "base noun phrases" – flat phrases that have a noun as their
 head. You can think of noun chunks as a noun plus the words describing the noun
 – for example, "the lavish green grass" or "the world’s largest tech fund". To
 get the noun chunks in a document, simply iterate over
-[`Doc.noun_chunks`](/api/doc#noun_chunks)
+[`Doc.noun_chunks`](/api/doc#noun_chunks).

 ```python
 ### {executable="true"}
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@ -1,6 +1,6 @@
 {
    "resources": [
-        {
+    	{
            "id": "spacy-textblob",
            "title": "spaCyTextBlob",
            "slogan": "Easy sentiment analysis for spaCy using TextBlob",
@ -30,7 +30,7 @@
            },
            "category": ["pipeline"],
            "tags": ["sentiment", "textblob"]
-        },
+	    },
        {
            "id": "spacy-ray",
            "title": "spacy-ray",
@ -2139,7 +2139,7 @@
                "from negspacy.negation import Negex",
                "",
                "nlp = spacy.load(\"en_core_web_sm\")",
-                "negex = Negex(nlp, ent_types=[\"PERSON\",\"ORG\"])",
+                "negex = Negex(nlp, ent_types=[\"PERSON\",\"ORG\"])",
                "nlp.add_pipe(negex, last=True)",
                "",
                "doc = nlp(\"She does not like Steve Jobs but likes Apple products.\")",
@ -2619,14 +2619,14 @@
                "github": "medspacy"
            }
        },
-          {
+	      {
            "id": "rita-dsl",
            "title": "RITA DSL",
            "slogan": "Domain Specific Language for creating language rules",
            "github": "zaibacu/rita-dsl",
            "description": "A Domain Specific Language (DSL) for building language patterns. These can be later compiled into spaCy patterns, pure regex, or any other format",
            "pip": "rita-dsl",
-              "thumb": "https://raw.githubusercontent.com/zaibacu/rita-dsl/master/docs/assets/logo-100px.png",
+	          "thumb": "https://raw.githubusercontent.com/zaibacu/rita-dsl/master/docs/assets/logo-100px.png",
            "code_language": "python",
            "code_example": [
                "import spacy",
--- a/website/package-lock.json
+++ b/website/package-lock.json
--- a/website/package.json
+++ b/website/package.json
@ -3,7 +3,7 @@
    "private": true,
    "description": "spaCy website",
    "version": "3.0.0",
-    "author": "Explosion AI <contact@explosion.ai>",
+    "author": "Explosion <contact@explosion.ai>",
    "license": "MIT",
    "dependencies": {
        "@jupyterlab/outputarea": "^0.19.1",
@ -16,7 +16,7 @@
        "autoprefixer": "^9.4.7",
        "classnames": "^2.2.6",
        "codemirror": "^5.43.0",
-        "gatsby": "^2.1.18",
+        "gatsby": "^2.11.1",
        "gatsby-image": "^2.0.29",
        "gatsby-mdx": "^0.3.6",
        "gatsby-plugin-catch-links": "^2.0.11",
@ -24,12 +24,14 @@
        "gatsby-plugin-offline": "^2.0.24",
        "gatsby-plugin-plausible": "0.0.6",
        "gatsby-plugin-react-helmet": "^3.0.6",
-        "gatsby-plugin-react-svg": "^2.1.2",
+        "gatsby-plugin-react-svg": "^2.0.0",
+        "gatsby-plugin-robots-txt": "^1.5.1",
        "gatsby-plugin-sass": "^2.0.10",
        "gatsby-plugin-sharp": "^2.0.20",
        "gatsby-plugin-sitemap": "^2.0.5",
        "gatsby-plugin-svgr": "^2.0.1",
        "gatsby-remark-copy-linked-files": "^2.0.9",
+        "gatsby-remark-find-replace": "^0.3.0",
        "gatsby-remark-images": "^3.0.4",
        "gatsby-remark-prismjs": "^3.2.4",
        "gatsby-remark-smartypants": "^2.0.8",
@ -39,9 +41,11 @@
        "gatsby-transformer-sharp": "^2.1.13",
        "html-to-react": "^1.3.4",
        "intersection-observer": "^0.5.1",
+        "jinja-to-js": "^3.2.3",
        "node-sass": "^4.11.0",
        "parse-numeric-range": "0.0.2",
        "prismjs": "^1.15.0",
+        "prismjs-bibtex": "^1.1.0",
        "prop-types": "^15.7.2",
        "react": "^16.8.2",
        "react-dom": "^16.8.2",
@ -50,19 +54,22 @@
        "remark-react": "^5.0.1"
    },
    "scripts": {
-        "build": "gatsby build",
-        "dev": "gatsby develop",
+        "build": "npm run python:install && npm run python:setup && gatsby build",
+        "dev": "npm run python:setup && gatsby develop",
+        "dev:nightly": "BRANCH=nightly.spacy.io npm run dev",
        "lint": "eslint **",
        "clear": "rm -rf .cache",
-        "test": "echo \"Write tests! -> https://gatsby.app/unit-testing\""
+        "test": "echo \"Write tests! -> https://gatsby.app/unit-testing\"",
+        "python:install": "pip install -r setup/requirements.txt",
+        "python:setup": "cd setup && sh setup.sh"
    },
    "devDependencies": {
+        "@sindresorhus/slugify": "^0.8.0",
        "browser-monads": "^1.0.0",
        "md-attr-parser": "^1.2.1",
        "prettier": "^1.16.4",
        "raw-loader": "^1.0.0",
-        "unist-util-visit": "^1.4.0",
-        "@sindresorhus/slugify": "^0.8.0"
+        "unist-util-visit": "^1.4.0"
    },
    "repository": {
        "type": "git",
--- a/website/src/components/footer.js
+++ b/website/src/components/footer.js
@ -6,7 +6,7 @@ import classNames from 'classnames'
 import Link from './link'
 import Grid from './grid'
 import Newsletter from './newsletter'
-import ExplosionLogo from '-!svg-react-loader!../images/explosion.svg'
+import { ReactComponent as ExplosionLogo } from '../images/explosion.svg'
 import classes from '../styles/footer.module.sass'

 export default function Footer({ wide = false }) {
--- a/website/src/components/icon.js
+++ b/website/src/components/icon.js
@ -1,4 +1,4 @@
-import React from 'react'
+import React, { Fragment } from 'react'
 import PropTypes from 'prop-types'
 import classNames from 'classnames'

@ -19,7 +19,13 @@ import NoIcon from '-!svg-react-loader!../images/icons/no.svg'
 import NeutralIcon from '-!svg-react-loader!../images/icons/neutral.svg'
 import OfflineIcon from '-!svg-react-loader!../images/icons/offline.svg'
 import SearchIcon from '-!svg-react-loader!../images/icons/search.svg'
+import MoonIcon from '-!svg-react-loader!../images/icons/moon.svg'
+import ClipboardIcon from '-!svg-react-loader!../images/icons/clipboard.svg'
+import NetworkIcon from '-!svg-react-loader!../images/icons/network.svg'
+import DownloadIcon from '-!svg-react-loader!../images/icons/download.svg'
+import PackageIcon from '-!svg-react-loader!../images/icons/package.svg'

+import { isString } from './util'
 import classes from '../styles/icon.module.sass'

 const icons = {
@ -41,9 +47,22 @@ const icons = {
    neutral: NeutralIcon,
    offline: OfflineIcon,
    search: SearchIcon,
+    moon: MoonIcon,
+    clipboard: ClipboardIcon,
+    network: NetworkIcon,
+    download: DownloadIcon,
+    package: PackageIcon,
 }

-const Icon = ({ name, width, height, inline, variant, className }) => {
+export default function Icon({
+    name,
+    width = 20,
+    height,
+    inline = false,
+    variant,
+    className,
+    ...props
+}) {
    const IconComponent = icons[name]
    const iconClassNames = classNames(classes.root, className, {
        [classes.inline]: inline,
@ -57,15 +76,11 @@ const Icon = ({ name, width, height, inline, variant, className }) => {
            aria-hidden="true"
            width={width}
            height={height || width}
+            {...props}
        />
    )
 }

-Icon.defaultProps = {
-    width: 20,
-    inline: false,
-}
-
 Icon.propTypes = {
    name: PropTypes.oneOf(Object.keys(icons)),
    width: PropTypes.number,
@ -75,4 +90,43 @@ Icon.propTypes = {
    className: PropTypes.string,
 }

-export default Icon
+export function replaceEmoji(cellChildren) {
+    const icons = {
+        '✅': { name: 'yes', variant: 'success', 'aria-label': 'positive' },
+        '❌': { name: 'no', variant: 'error', 'aria-label': 'negative' },
+    }
+    const iconRe = new RegExp(`^(${Object.keys(icons).join('|')})`, 'g')
+    let children = isString(cellChildren) ? [cellChildren] : cellChildren
+    let hasIcon = false
+    if (Array.isArray(children)) {
+        children = children.map((child, i) => {
+            if (isString(child)) {
+                const icon = icons[child.trim()]
+                if (icon) {
+                    hasIcon = true
+                    return (
+                        <Icon
+                            {...icon}
+                            inline={i < children.length}
+                            aria-hidden={undefined}
+                            key={i}
+                        />
+                    )
+                } else if (iconRe.test(child)) {
+                    hasIcon = true
+                    const [, iconName, text] = child.split(iconRe)
+                    return (
+                        <Fragment key={i}>
+                            <Icon {...icons[iconName]} aria-hidden={undefined} inline={true} />
+                            {text.replace(/^\s+/g, '')}
+                        </Fragment>
+                    )
+                }
+                // Work around prettier auto-escape
+                if (child.startsWith('\\')) return child.slice(1)
+            }
+            return child
+        })
+    }
+    return { content: children, hasIcon }
+}
--- a/website/src/components/navigation.js
+++ b/website/src/components/navigation.js
@ -6,7 +6,7 @@ import Link from './link'
 import Icon from './icon'
 import Dropdown from './dropdown'
 import { github } from './util'
-import Logo from '-!svg-react-loader!../images/logo.svg'
+import { ReactComponent as Logo } from '../images/logo.svg'
 import classes from '../styles/navigation.module.sass'

 const NavigationDropdown = ({ items = [], section }) => {
--- a/website/src/images/logos/index.js
+++ b/website/src/images/logos/index.js
@ -1,31 +0,0 @@
-import AirbnbLogo from '-!svg-react-loader!./airbnb.svg'
-import UberLogo from '-!svg-react-loader!./uber.svg'
-import QuoraLogo from '-!svg-react-loader!./quora.svg'
-import RetrieverLogo from '-!svg-react-loader!./retriever.svg'
-import StitchfixLogo from '-!svg-react-loader!./stitchfix.svg'
-import ChartbeatLogo from '-!svg-react-loader!./chartbeat.svg'
-import AllenAILogo from '-!svg-react-loader!./allenai.svg'
-
-import RecodeLogo from '-!svg-react-loader!./recode.svg'
-import WapoLogo from '-!svg-react-loader!./wapo.svg'
-import BBCLogo from '-!svg-react-loader!./bbc.svg'
-import MicrosoftLogo from '-!svg-react-loader!./microsoft.svg'
-import VenturebeatLogo from '-!svg-react-loader!./venturebeat.svg'
-import ThoughtworksLogo from '-!svg-react-loader!./thoughtworks.svg'
-
-export default {
-    airbnb: AirbnbLogo,
-    uber: UberLogo,
-    quora: QuoraLogo,
-    retriever: RetrieverLogo,
-    stitchfix: StitchfixLogo,
-    chartbeat: ChartbeatLogo,
-    allenai: AllenAILogo,
-
-    recode: RecodeLogo,
-    wapo: WapoLogo,
-    bbc: BBCLogo,
-    microsoft: MicrosoftLogo,
-    venturebeat: VenturebeatLogo,
-    thoughtworks: ThoughtworksLogo,
-}
--- a/website/src/widgets/styleguide.js
+++ b/website/src/widgets/styleguide.js
@ -4,7 +4,7 @@ import Grid from '../components/grid'
 import { Label } from '../components/typography'
 import Link from '../components/link'

-import Logo from '-!svg-react-loader!../images/logo.svg'
+import { ReactComponent as Logo } from '../images/logo.svg'
 import patternBlue from '../images/pattern_blue.jpg'
 import patternGreen from '../images/pattern_green.jpg'
 import patternPurple from '../images/pattern_purple.jpg'