Add token_splitter component (#6726)
* Add long_token_splitter component

  Add a `long_token_splitter` component for use with transformer pipelines.
  This component splits up long tokens like URLs into smaller tokens. This is
  particularly relevant for pretrained pipelines with `strided_spans`, since
  the user can't change the length of the span `window` and may not wish to
  preprocess the input texts. The `long_token_splitter` splits tokens that are
  at least `long_token_length` characters long into smaller tokens of
  `split_length` size.

  Notes:

  * Since this is intended for use as the first component in a pipeline, the
    token splitter does not try to preserve any token annotation.
  * API docs to come when the API is stable.

* Adjust API, add test
* Fix name in factory
parent 185fc62f4d
commit bf0cdae8d4
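For orientation, here is a minimal usage sketch of the component described above. This is an illustration only, not part of the commit; it assumes a blank English pipeline and example settings of `min_length=20`, `split_length=5`.

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("token_splitter", config={"min_length": 20, "split_length": 5}, first=True)
doc = nlp("See https://this-is-a-very-long-url.example.com for details")
print([t.text for t in doc])
# The URL token is at least 20 characters long, so it is split into pieces of
# at most 5 characters; "See", "for" and "details" are left untouched.
```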
@@ -1,7 +1,11 @@
import srsly
from thinc.api import Config
from typing import Dict, Any
from ..language import Language
from ..matcher import Matcher
from ..tokens import Doc
from ..util import filter_spans
from .. import util


@Language.component(

@@ -65,3 +69,77 @@ def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc:
        for span in spans:
            retokenizer.merge(span)
    return doc


@Language.factory(
    "token_splitter",
    default_config={"min_length": 25, "split_length": 10},
    retokenizes=True,
)
def make_token_splitter(
    nlp: Language,
    name: str,
    *,
    min_length=0,
    split_length=0,
):
    return TokenSplitter(
        min_length=min_length, split_length=split_length
    )


class TokenSplitter:
    def __init__(self, min_length: int = 0, split_length: int = 0):
        self.min_length = min_length
        self.split_length = split_length

    def __call__(self, doc: Doc) -> Doc:
        if self.min_length > 0 and self.split_length > 0:
            with doc.retokenize() as retokenizer:
                for t in doc:
                    if len(t.text) >= self.min_length:
                        orths = []
                        heads = []
                        attrs = {}
                        for i in range(0, len(t.text), self.split_length):
                            orths.append(t.text[i : i + self.split_length])
                            heads.append((t, i / self.split_length))
                        retokenizer.split(t, orths, heads, attrs)
        return doc

    def _get_config(self) -> Dict[str, Any]:
        return {
            "min_length": self.min_length,
            "split_length": self.split_length,
        }

    def _set_config(self, config: Dict[str, Any] = {}) -> None:
        self.min_length = config.get("min_length", 0)
        self.split_length = config.get("split_length", 0)

    def to_bytes(self, **kwargs):
        serializers = {
            "cfg": lambda: srsly.json_dumps(self._get_config()),
        }
        return util.to_bytes(serializers, [])

    def from_bytes(self, data, **kwargs):
        deserializers = {
            "cfg": lambda b: self._set_config(srsly.json_loads(b)),
        }
        util.from_bytes(data, deserializers, [])
        return self

    def to_disk(self, path, **kwargs):
        path = util.ensure_path(path)
        serializers = {
            "cfg": lambda p: srsly.write_json(p, self._get_config()),
        }
        return util.to_disk(path, serializers, [])

    def from_disk(self, path, **kwargs):
        path = util.ensure_path(path)
        serializers = {
            "cfg": lambda p: self._set_config(srsly.read_json(p)),
        }
        util.from_disk(path, serializers, [])
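As a quick illustration of the serialization hooks in the class above (a sketch, not part of the diff; the import path is inferred from the relative imports shown): only the two settings are persisted, so a bytes round trip restores them.

```python
from spacy.pipeline.functions import TokenSplitter  # path inferred, see note above

splitter = TokenSplitter(min_length=20, split_length=5)
data = splitter.to_bytes()                 # serializes only the "cfg" entry
restored = TokenSplitter().from_bytes(data)
assert (restored.min_length, restored.split_length) == (20, 5)
```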
@@ -53,3 +53,24 @@ def test_factories_merge_ents(doc2):
    assert len(doc2) == 6
    assert len(list(doc2.ents)) == 1
    assert doc2[2].text == "New York"


def test_token_splitter():
    nlp = Language()
    config = {"min_length": 20, "split_length": 5}
    token_splitter = nlp.add_pipe("token_splitter", config=config)
    doc = nlp("aaaaabbbbbcccccdddd e f g")
    assert [t.text for t in doc] == ["aaaaabbbbbcccccdddd", "e", "f", "g"]
    doc = nlp("aaaaabbbbbcccccdddddeeeeeff g h i")
    assert [t.text for t in doc] == [
        "aaaaa",
        "bbbbb",
        "ccccc",
        "ddddd",
        "eeeee",
        "ff",
        "g",
        "h",
        "i",
    ]
    assert all(len(t.text) <= token_splitter.split_length for t in doc)
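A worked check of the second test case above (illustration only): the input token is 27 characters long, which is at least `min_length=20`, so it is split into 5-character pieces with a 2-character remainder, while the 19-character token in the first case stays intact.

```python
text = "aaaaabbbbbcccccdddddeeeeeff"
assert len(text) == 27  # >= min_length 20, so the token is split
pieces = [text[i : i + 5] for i in range(0, len(text), 5)]
assert pieces == ["aaaaa", "bbbbb", "ccccc", "ddddd", "eeeee", "ff"]
assert len("aaaaabbbbbcccccdddd") == 19  # < 20, left as a single token
```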
@@ -6,6 +6,7 @@ menu:
  - ['merge_noun_chunks', 'merge_noun_chunks']
  - ['merge_entities', 'merge_entities']
  - ['merge_subtokens', 'merge_subtokens']
  - ['token_splitter', 'token_splitter']
---

## merge_noun_chunks {#merge_noun_chunks tag="function"}

@@ -107,3 +108,25 @@ end of the pipeline and after all other components.
| `doc`       | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ |
| `label`     | The subtoken dependency label. Defaults to `"subtok"`. ~~str~~       |
| **RETURNS** | The modified `Doc` with merged subtokens. ~~Doc~~                    |

## token_splitter {#token_splitter tag="function" new="3.0"}

Split tokens longer than a minimum length into shorter tokens. Intended for use
with transformer pipelines where long spaCy tokens lead to input text that
exceeds the transformer model max length. See
[managing transformer model max length limitations](/usage/embeddings-transformers#transformer-max-length).

> #### Example
>
> ```python
> config = {"min_length": 20, "split_length": 5}
> nlp.add_pipe("token_splitter", config=config, first=True)
> doc = nlp("aaaaabbbbbcccccdddddee")
> print([token.text for token in doc])
> # ['aaaaa', 'bbbbb', 'ccccc', 'ddddd', 'ee']
> ```

| Setting        | Description                                                            |
| -------------- | ---------------------------------------------------------------------- |
| `min_length`   | The minimum length for a token to be split. Defaults to `25`. ~~int~~  |
| `split_length` | The length of the split tokens. Defaults to `10`. ~~int~~              |
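A short sketch of the defaults in the table above (not from the docs; assumes a blank English pipeline): with `min_length=25` and `split_length=10`, a 30-character token is split into three 10-character pieces.

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("token_splitter")  # factory defaults: min_length=25, split_length=10
doc = nlp("a" * 30)
assert [t.text for t in doc] == ["a" * 10, "a" * 10, "a" * 10]
```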
|
@ -481,6 +481,50 @@ custom learning rate for each component. Instead of a constant, you can also
|
|||
provide a schedule, allowing you to freeze the shared parameters at the start of
|
||||
training.
|
||||
|
||||
### Managing transformer model max length limitations {#transformer-max-length}
|
||||
|
||||
Many transformer models have a limit on the maximum number of tokens that the
|
||||
model can process, for example BERT models are limited to 512 tokens. This limit
|
||||
refers to the number of transformer tokens (BPE, WordPiece, etc.), not the
|
||||
number of spaCy tokens.
|
||||
|
||||
To be able to process longer texts, the spaCy [`transformer`](/api/transformer)
|
||||
component uses [`span_getters`](/api/transformer#span_getters) to convert a
|
||||
batch of [`Doc`](/api/doc) objects into lists of [`Span`](/api/span) objects. A
|
||||
span may correspond to a doc (for `doc_spans`), a sentence (for `sent_spans`) or
|
||||
a window of spaCy tokens (`strided_spans`). If a single span corresponds to more
|
||||
transformer tokens than the transformer model supports, the spaCy pipeline can't
|
||||
process the text because some spaCy tokens would be left without an analysis.
|
||||
|
||||
In general, it is up to the transformer pipeline user to manage the input texts
|
||||
so that the model max length is not exceeded. If you're training a **new
|
||||
pipeline**, you have a number of options to handle the max length limit:
|
||||
|
||||
- Use `doc_spans` with short texts only
|
||||
- Use `sent_spans` with short sentences only
|
||||
- For `strided_spans`, lower the `window` size to be short enough for your input
|
||||
texts (and don't forget to lower the `stride` correspondingly)
|
||||
- Implement a [custom span getter](#transformers-training-custom-settings)
|
||||
|
||||
You may still run into the max length limit if a single spaCy token is very
|
||||
long, like a long URL or a noisy string, or if you're using a **pretrained
|
||||
pipeline** like `en_core_web_trf` with a fixed `window` size for
|
||||
`strided_spans`. In this case, you need to modify either your texts or your
|
||||
pipeline so that you have shorter spaCy tokens. Some options:
|
||||
|
||||
- Preprocess your texts to clean up noise and split long tokens with whitespace
|
||||
- Add a `token_splitter` to the beginning of your pipeline to break up
|
||||
tokens that are longer than a specified length:
|
||||
|
||||
```python
|
||||
config={"min_length": 20, "split_length": 5}
|
||||
nlp.add_pipe("token_splitter", config=config, first=True)
|
||||
```
|
||||
|
||||
In this example, tokens that are at least 20 characters long will be split up
|
||||
into smaller tokens of 5 characters each, resulting in strided spans that
|
||||
correspond to fewer transformer tokens.
|
||||
|
||||
## Static vectors {#static-vectors}
|
||||
|
||||
If your pipeline includes a **word vectors table**, you'll be able to use the
|
||||
|
|
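One way to decide whether the splitter is needed at all (a hypothetical helper, not from the docs): tokenize a sample of your texts and check the longest token length before changing the pipeline.

```python
def longest_token_length(nlp, texts):
    # Tokenize only (nlp.make_doc), without running the rest of the pipeline.
    return max(len(token.text) for text in texts for token in nlp.make_doc(text))

# If this reports very long tokens, add the token_splitter first in the pipeline:
# nlp.add_pipe("token_splitter", config={"min_length": 20, "split_length": 5}, first=True)
```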