From bf0cdae8d41f2cc458739c2512c8bdb625757770 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Sun, 17 Jan 2021 12:54:41 +0100
Subject: [PATCH] Add token_splitter component (#6726)

* Add long_token_splitter component

Add a `long_token_splitter` component for use with transformer pipelines.
This component splits up long tokens like URLs into smaller tokens. This is
particularly relevant for pretrained pipelines with `strided_spans`, since
the user can't change the length of the span `window` and may not wish to
preprocess the input texts.

The `long_token_splitter` splits tokens that are at least
`long_token_length` characters long into smaller tokens of `split_length`
size.

Notes:

* Since this is intended for use as the first component in a pipeline, the
  token splitter does not try to preserve any token annotation.

* API docs to come when the API is stable.

* Adjust API, add test

* Fix name in factory
---
 spacy/pipeline/functions.py                   | 78 +++++++++++++++++++
 spacy/tests/pipeline/test_functions.py        | 21 +++++
 website/docs/api/pipeline-functions.md        | 23 ++++++
 website/docs/usage/embeddings-transformers.md | 44 +++++++++++
 4 files changed, 166 insertions(+)

diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py
index 614608b25..d955e970d 100644
--- a/spacy/pipeline/functions.py
+++ b/spacy/pipeline/functions.py
@@ -1,7 +1,11 @@
+import srsly
+from thinc.api import Config
+from typing import Dict, Any
 from ..language import Language
 from ..matcher import Matcher
 from ..tokens import Doc
 from ..util import filter_spans
+from .. import util
 
 
 @Language.component(
@@ -65,3 +69,77 @@ def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc:
         for span in spans:
             retokenizer.merge(span)
     return doc
+
+
+@Language.factory(
+    "token_splitter",
+    default_config={"min_length": 25, "split_length": 10},
+    retokenizes=True,
+)
+def make_token_splitter(
+    nlp: Language,
+    name: str,
+    *,
+    min_length=0,
+    split_length=0,
+):
+    return TokenSplitter(
+        min_length=min_length, split_length=split_length
+    )
+
+
+class TokenSplitter:
+    def __init__(self, min_length: int = 0, split_length: int = 0):
+        self.min_length = min_length
+        self.split_length = split_length
+
+    def __call__(self, doc: Doc) -> Doc:
+        if self.min_length > 0 and self.split_length > 0:
+            with doc.retokenize() as retokenizer:
+                for t in doc:
+                    if len(t.text) >= self.min_length:
+                        orths = []
+                        heads = []
+                        attrs = {}
+                        for i in range(0, len(t.text), self.split_length):
+                            orths.append(t.text[i : i + self.split_length])
+                            heads.append((t, i / self.split_length))
+                        retokenizer.split(t, orths, heads, attrs)
+        return doc
+
+    def _get_config(self) -> Dict[str, Any]:
+        return {
+            "min_length": self.min_length,
+            "split_length": self.split_length,
+        }
+
+    def _set_config(self, config: Dict[str, Any] = {}) -> None:
+        self.min_length = config.get("min_length", 0)
+        self.split_length = config.get("split_length", 0)
+
+    def to_bytes(self, **kwargs):
+        serializers = {
+            "cfg": lambda: srsly.json_dumps(self._get_config()),
+        }
+        return util.to_bytes(serializers, [])
+
+    def from_bytes(self, data, **kwargs):
+        deserializers = {
+            "cfg": lambda b: self._set_config(srsly.json_loads(b)),
+        }
+        util.from_bytes(data, deserializers, [])
+        return self
+
+    def to_disk(self, path, **kwargs):
+        path = util.ensure_path(path)
+        serializers = {
+            "cfg": lambda p: srsly.write_json(p, self._get_config()),
+        }
+        return util.to_disk(path, serializers, [])
+
+    def from_disk(self, path, **kwargs):
+        path = util.ensure_path(path)
+        serializers = {
+            "cfg": lambda p: self._set_config(srsly.read_json(p)),
+        }
+        util.from_disk(path, serializers, [])
diff --git a/spacy/tests/pipeline/test_functions.py b/spacy/tests/pipeline/test_functions.py
index 025ac04af..454d7b08b 100644
--- a/spacy/tests/pipeline/test_functions.py
+++ b/spacy/tests/pipeline/test_functions.py
@@ -53,3 +53,24 @@ def test_factories_merge_ents(doc2):
     assert len(doc2) == 6
     assert len(list(doc2.ents)) == 1
     assert doc2[2].text == "New York"
+
+
+def test_token_splitter():
+    nlp = Language()
+    config = {"min_length": 20, "split_length": 5}
+    token_splitter = nlp.add_pipe("token_splitter", config=config)
+    doc = nlp("aaaaabbbbbcccccdddd e f g")
+    assert [t.text for t in doc] == ["aaaaabbbbbcccccdddd", "e", "f", "g"]
+    doc = nlp("aaaaabbbbbcccccdddddeeeeeff g h i")
+    assert [t.text for t in doc] == [
+        "aaaaa",
+        "bbbbb",
+        "ccccc",
+        "ddddd",
+        "eeeee",
+        "ff",
+        "g",
+        "h",
+        "i",
+    ]
+    assert all(len(t.text) <= token_splitter.split_length for t in doc)
diff --git a/website/docs/api/pipeline-functions.md b/website/docs/api/pipeline-functions.md
index 0dc03a16a..74a67f319 100644
--- a/website/docs/api/pipeline-functions.md
+++ b/website/docs/api/pipeline-functions.md
@@ -6,6 +6,7 @@ menu:
   - ['merge_noun_chunks', 'merge_noun_chunks']
   - ['merge_entities', 'merge_entities']
   - ['merge_subtokens', 'merge_subtokens']
+  - ['token_splitter', 'token_splitter']
 ---
 
 ## merge_noun_chunks {#merge_noun_chunks tag="function"}
@@ -107,3 +108,25 @@ end of the pipeline and after all other components.
 | `doc`       | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ |
 | `label`     | The subtoken dependency label. Defaults to `"subtok"`. ~~str~~       |
 | **RETURNS** | The modified `Doc` with merged subtokens. ~~Doc~~                    |
+
+## token_splitter {#token_splitter tag="function" new="3.0"}
+
+Split tokens longer than a minimum length into shorter tokens. Intended for use
+with transformer pipelines where long spaCy tokens lead to input text that
+exceeds the transformer model max length. See
+[managing transformer model max length limitations](/usage/embeddings-transformers#transformer-max-length).
+
+> #### Example
+>
+> ```python
+> config={"min_length": 20, "split_length": 5}
+> nlp.add_pipe("token_splitter", config=config, first=True)
+> doc = nlp("aaaaabbbbbcccccdddddee")
+> print([token.text for token in doc])
+> # ['aaaaa', 'bbbbb', 'ccccc', 'ddddd', 'ee']
+> ```
+
+| Setting        | Description                                                           |
+| -------------- | --------------------------------------------------------------------- |
+| `min_length`   | The minimum length for a token to be split. Defaults to `25`. ~~int~~  |
+| `split_length` | The length of the split tokens. Defaults to `10`. ~~int~~              |
diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index 7e47ac9d2..fdf15d187 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -481,6 +481,50 @@ custom learning rate for each component. Instead of a constant, you can also
 provide a schedule, allowing you to freeze the shared parameters at the start
 of training.
 
+### Managing transformer model max length limitations {#transformer-max-length}
+
+Many transformer models have a limit on the maximum number of tokens that the
+model can process; for example, BERT models are limited to 512 tokens. This
+limit refers to the number of transformer tokens (BPE, WordPiece, etc.), not
+the number of spaCy tokens.
+
+To be able to process longer texts, the spaCy [`transformer`](/api/transformer)
+component uses [`span_getters`](/api/transformer#span_getters) to convert a
+batch of [`Doc`](/api/doc) objects into lists of [`Span`](/api/span) objects. A
+span may correspond to a doc (for `doc_spans`), a sentence (for `sent_spans`) or
+a window of spaCy tokens (for `strided_spans`). If a single span corresponds to
+more transformer tokens than the transformer model supports, the spaCy pipeline
+can't process the text because some spaCy tokens would be left without an analysis.
+
+In general, it is up to the transformer pipeline user to manage the input texts
+so that the model max length is not exceeded. If you're training a **new
+pipeline**, you have a number of options to handle the max length limit:
+
+- Use `doc_spans` with short texts only
+- Use `sent_spans` with short sentences only
+- For `strided_spans`, lower the `window` size to be short enough for your input
+  texts (and don't forget to lower the `stride` correspondingly)
+- Implement a [custom span getter](#transformers-training-custom-settings)
+
+You may still run into the max length limit if a single spaCy token is very
+long, like a long URL or a noisy string, or if you're using a **pretrained
+pipeline** like `en_core_web_trf` with a fixed `window` size for
+`strided_spans`. In this case, you need to modify either your texts or your
+pipeline so that you have shorter spaCy tokens. Some options:
+
+- Preprocess your texts to clean up noise and split long tokens with whitespace
+- Add a `token_splitter` to the beginning of your pipeline to break up tokens
+  that are longer than a specified length:
+
+  ```python
+  config={"min_length": 20, "split_length": 5}
+  nlp.add_pipe("token_splitter", config=config, first=True)
+  ```
+
+  In this example, tokens that are at least 20 characters long will be split up
+  into smaller tokens of 5 characters each, resulting in strided spans that
+  correspond to fewer transformer tokens.
+
 ## Static vectors {#static-vectors}
 
 If your pipeline includes a **word vectors table**, you'll be able to use the
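
As a quick illustration of the behavior described in the commit message above (long tokens such as URLs being broken up), here is a minimal sketch using the new factory. It assumes a plain `spacy.blank("en")` pipeline with this commit installed; the text and URL are made up for illustration, and the expected output assumes the tokenizer keeps the URL as a single token.

```python
import spacy

# Minimal sketch: a blank English pipeline with only the new token_splitter.
# The settings mirror the values used in the tests and docs above.
nlp = spacy.blank("en")
nlp.add_pipe("token_splitter", config={"min_length": 20, "split_length": 5})

# The tokenizer normally keeps a URL as one long token; the splitter then
# breaks any token of 20+ characters into 5-character pieces.
doc = nlp("see https://example.com/some/long/path now")
print([t.text for t in doc])
# Expected (if the URL stays a single token before splitting):
# ['see', 'https', '://ex', 'ample', '.com/', 'some/', 'long/', 'path', 'now']
```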
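
The `TokenSplitter` class above also implements `to_bytes`/`from_bytes` and `to_disk`/`from_disk` for its two settings, so a saved pipeline restores `min_length` and `split_length` on load. A minimal round-trip sketch, again assuming spaCy v3 with this component; the output directory name is arbitrary.

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("token_splitter", config={"min_length": 20, "split_length": 5})

# Saving the pipeline writes the component's cfg (min_length, split_length)
# via TokenSplitter.to_disk; loading reads it back via from_disk.
nlp.to_disk("token_splitter_demo")  # arbitrary output directory
nlp2 = spacy.load("token_splitter_demo")

splitter = nlp2.get_pipe("token_splitter")
print(splitter.min_length, splitter.split_length)  # 20 5
```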