From bf0cdae8d41f2cc458739c2512c8bdb625757770 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Sun, 17 Jan 2021 12:54:41 +0100
Subject: [PATCH] Add token_splitter component (#6726)

* Add long_token_splitter component

Add a `long_token_splitter` component for use with transformer pipelines.
This component splits up long tokens like URLs into smaller tokens. This is
particularly relevant for pretrained pipelines with `strided_spans`, since
the user can't change the length of the span `window` and may not wish to
preprocess the input texts.

The `long_token_splitter` splits tokens that are at least
`long_token_length` characters long into smaller tokens of `split_length`
size.

Notes:

* Since this is intended for use as the first component in a pipeline, the
  token splitter does not try to preserve any token annotation.

* API docs to come when the API is stable.

* Adjust API, add test

* Fix name in factory
---
 spacy/pipeline/functions.py                   | 78 +++++++++++++++++++
 spacy/tests/pipeline/test_functions.py        | 21 +++++
 website/docs/api/pipeline-functions.md        | 23 ++++++
 website/docs/usage/embeddings-transformers.md | 44 +++++++++++
 4 files changed, 166 insertions(+)

diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py
index 614608b25..d955e970d 100644
--- a/spacy/pipeline/functions.py
+++ b/spacy/pipeline/functions.py
@@ -1,7 +1,11 @@
+import srsly
+from thinc.api import Config
+from typing import Dict, Any
 from ..language import Language
 from ..matcher import Matcher
 from ..tokens import Doc
 from ..util import filter_spans
+from .. import util
 
 
 @Language.component(
@@ -65,3 +69,77 @@ def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc:
         for span in spans:
             retokenizer.merge(span)
     return doc
+
+
+@Language.factory(
+    "token_splitter",
+    default_config={"min_length": 25, "split_length": 10},
+    retokenizes=True,
+)
+def make_token_splitter(
+    nlp: Language,
+    name: str,
+    *,
+    min_length=0,
+    split_length=0,
+):
+    return TokenSplitter(
+        min_length=min_length, split_length=split_length
+    )
+
+
+class TokenSplitter:
+    def __init__(self, min_length: int = 0, split_length: int = 0):
+        self.min_length = min_length
+        self.split_length = split_length
+
+    def __call__(self, doc: Doc) -> Doc:
+        if self.min_length > 0 and self.split_length > 0:
+            with doc.retokenize() as retokenizer:
+                for t in doc:
+                    if len(t.text) >= self.min_length:
+                        orths = []
+                        heads = []
+                        attrs = {}
+                        for i in range(0, len(t.text), self.split_length):
+                            orths.append(t.text[i : i + self.split_length])
+                            heads.append((t, i / self.split_length))
+                        retokenizer.split(t, orths, heads, attrs)
+        return doc
+
+    def _get_config(self) -> Dict[str, Any]:
+        return {
+            "min_length": self.min_length,
+            "split_length": self.split_length,
+        }
+
+    def _set_config(self, config: Dict[str, Any] = {}) -> None:
+        self.min_length = config.get("min_length", 0)
+        self.split_length = config.get("split_length", 0)
+
+    def to_bytes(self, **kwargs):
+        serializers = {
+            "cfg": lambda: srsly.json_dumps(self._get_config()),
+        }
+        return util.to_bytes(serializers, [])
+
+    def from_bytes(self, data, **kwargs):
+        deserializers = {
+            "cfg": lambda b: self._set_config(srsly.json_loads(b)),
+        }
+        util.from_bytes(data, deserializers, [])
+        return self
+
+    def to_disk(self, path, **kwargs):
+        path = util.ensure_path(path)
+        serializers = {
+            "cfg": lambda p: srsly.write_json(p, self._get_config()),
+        }
+        return util.to_disk(path, serializers, [])
+
+    def from_disk(self, path, **kwargs):
+        path = util.ensure_path(path)
+        serializers = {
+            "cfg": lambda p: self._set_config(srsly.read_json(p)),
+        }
+        util.from_disk(path, serializers, [])
diff --git a/spacy/tests/pipeline/test_functions.py b/spacy/tests/pipeline/test_functions.py
index 025ac04af..454d7b08b 100644
--- a/spacy/tests/pipeline/test_functions.py
+++ b/spacy/tests/pipeline/test_functions.py
@@ -53,3 +53,24 @@ def test_factories_merge_ents(doc2):
     assert len(doc2) == 6
     assert len(list(doc2.ents)) == 1
     assert doc2[2].text == "New York"
+
+
+def test_token_splitter():
+    nlp = Language()
+    config = {"min_length": 20, "split_length": 5}
+    token_splitter = nlp.add_pipe("token_splitter", config=config)
+    doc = nlp("aaaaabbbbbcccccdddd e f g")
+    assert [t.text for t in doc] == ["aaaaabbbbbcccccdddd", "e", "f", "g"]
+    doc = nlp("aaaaabbbbbcccccdddddeeeeeff g h i")
+    assert [t.text for t in doc] == [
+        "aaaaa",
+        "bbbbb",
+        "ccccc",
+        "ddddd",
+        "eeeee",
+        "ff",
+        "g",
+        "h",
+        "i",
+    ]
+    assert all(len(t.text) <= token_splitter.split_length for t in doc)
diff --git a/website/docs/api/pipeline-functions.md b/website/docs/api/pipeline-functions.md
index 0dc03a16a..74a67f319 100644
--- a/website/docs/api/pipeline-functions.md
+++ b/website/docs/api/pipeline-functions.md
@@ -6,6 +6,7 @@ menu:
   - ['merge_noun_chunks', 'merge_noun_chunks']
   - ['merge_entities', 'merge_entities']
   - ['merge_subtokens', 'merge_subtokens']
+  - ['token_splitter', 'token_splitter']
 ---
 
 ## merge_noun_chunks {#merge_noun_chunks tag="function"}
@@ -107,3 +108,25 @@ end of the pipeline and after all other components.
 | `doc`       | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ |
 | `label`     | The subtoken dependency label. Defaults to `"subtok"`. ~~str~~       |
 | **RETURNS** | The modified `Doc` with merged subtokens. ~~Doc~~                    |
+
+## token_splitter {#token_splitter tag="function" new="3.0"}
+
+Split tokens longer than a minimum length into shorter tokens. Intended for use
+with transformer pipelines where long spaCy tokens lead to input text that
+exceeds the transformer model max length. See
+[managing transformer model max length limitations](/usage/embeddings-transformers#transformer-max-length).
+
+> #### Example
+>
+> ```python
+> config={"min_length": 20, "split_length": 5}
+> nlp.add_pipe("token_splitter", config=config, first=True)
+> doc = nlp("aaaaabbbbbcccccdddddee")
+> print([token.text for token in doc])
+> # ['aaaaa', 'bbbbb', 'ccccc', 'ddddd', 'ee']
+> ```
+
+| Setting        | Description                                                           |
+| -------------- | --------------------------------------------------------------------- |
+| `min_length`   | The minimum length for a token to be split. Defaults to `25`. ~~int~~  |
+| `split_length` | The length of the split tokens. Defaults to `10`. ~~int~~              |
diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index 7e47ac9d2..fdf15d187 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -481,6 +481,50 @@ custom learning rate for each component. Instead of a constant, you can also
 provide a schedule, allowing you to freeze the shared parameters at the start
 of training.
 
+### Managing transformer model max length limitations {#transformer-max-length}
+
+Many transformer models have a limit on the maximum number of tokens that the
+model can process; for example, BERT models are limited to 512 tokens. This
+limit refers to the number of transformer tokens (BPE, WordPiece, etc.), not
+the number of spaCy tokens.
+
+To be able to process longer texts, the spaCy [`transformer`](/api/transformer)
+component uses [`span_getters`](/api/transformer#span_getters) to convert a
+batch of [`Doc`](/api/doc) objects into lists of [`Span`](/api/span) objects. A
+span may correspond to a doc (for `doc_spans`), a sentence (for `sent_spans`) or
+a window of spaCy tokens (for `strided_spans`). If a single span corresponds to
+more transformer tokens than the transformer model supports, the spaCy pipeline
+can't process the text because some spaCy tokens would be left without an analysis.
+
+In general, it is up to the transformer pipeline user to manage the input texts
+so that the model max length is not exceeded. If you're training a **new
+pipeline**, you have a number of options to handle the max length limit:
+
+- Use `doc_spans` with short texts only
+- Use `sent_spans` with short sentences only
+- For `strided_spans`, lower the `window` size to be short enough for your input
+  texts (and don't forget to lower the `stride` correspondingly)
+- Implement a [custom span getter](#transformers-training-custom-settings)
+
+You may still run into the max length limit if a single spaCy token is very
+long, like a long URL or a noisy string, or if you're using a **pretrained
+pipeline** like `en_core_web_trf` with a fixed `window` size for
+`strided_spans`. In this case, you need to modify either your texts or your
+pipeline so that you have shorter spaCy tokens. Some options:
+
+- Preprocess your texts to clean up noise and split long tokens with whitespace
+- Add a `token_splitter` to the beginning of your pipeline to break up tokens
+  that are longer than a specified length:
+
+  ```python
+  config={"min_length": 20, "split_length": 5}
+  nlp.add_pipe("token_splitter", config=config, first=True)
+  ```
+
+  In this example, tokens that are at least 20 characters long will be split up
+  into smaller tokens of 5 characters each, resulting in strided spans that
+  correspond to fewer transformer tokens.
+
 ## Static vectors {#static-vectors}
 
 If your pipeline includes a **word vectors table**, you'll be able to use the
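
As a quick illustration of the behavior described in the commit message above (long tokens such as URLs being broken up), here is a minimal sketch using the new factory. It assumes a plain `spacy.blank("en")` pipeline with this commit installed; the text and URL are made up for illustration, and the expected output assumes the tokenizer keeps the URL as a single token.

```python
import spacy

# Minimal sketch: a blank English pipeline with only the new token_splitter.
# The settings mirror the values used in the tests and docs above.
nlp = spacy.blank("en")
nlp.add_pipe("token_splitter", config={"min_length": 20, "split_length": 5})

# The tokenizer normally keeps a URL as one long token; the splitter then
# breaks any token of 20+ characters into 5-character pieces.
doc = nlp("see https://example.com/some/long/path now")
print([t.text for t in doc])
# Expected (if the URL stays a single token before splitting):
# ['see', 'https', '://ex', 'ample', '.com/', 'some/', 'long/', 'path', 'now']
```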
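
The `TokenSplitter` class above also implements `to_bytes`/`from_bytes` and `to_disk`/`from_disk` for its two settings, so a saved pipeline restores `min_length` and `split_length` on load. A minimal round-trip sketch, again assuming spaCy v3 with this component; the output directory name is arbitrary.

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("token_splitter", config={"min_length": 20, "split_length": 5})

# Saving the pipeline writes the component's cfg (min_length, split_length)
# via TokenSplitter.to_disk; loading reads it back via from_disk.
nlp.to_disk("token_splitter_demo")  # arbitrary output directory
nlp2 = spacy.load("token_splitter_demo")

splitter = nlp2.get_pipe("token_splitter")
print(splitter.min_length, splitter.split_length)  # 20 5
```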