diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py
index 524e3a659..162532b05 100644
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -78,6 +78,22 @@ def build_ngram_suggester(sizes: List[int]) -> Callable[[List[Doc]], Ragged]:
     return ngram_suggester
 
 
+@registry.misc("spacy.ngram_range_suggester.v1")
+def build_ngram_range_suggester(min_size: int, max_size: int) -> Callable[[List[Doc]], Ragged]:
+    """Suggest all spans whose length is between min_size and max_size, both
+    inclusive. Spans are returned as a ragged array of integers. The array has
+    two columns, indicating the start and end position.
+
+    min_size (int): The shortest span length to suggest (inclusive).
+    max_size (int): The longest span length to suggest (inclusive).
+    RETURNS (Callable[[List[Doc]], Ragged]): The suggester function.
+    """
+    # build_ngram_suggester is annotated as taking a List[int], so hand it a
+    # real list rather than a lazy range object.
+    sizes = list(range(min_size, max_size + 1))
+    return build_ngram_suggester(sizes)
+
+
 @Language.factory(
     "spancat",
     assigns=["doc.spans"],
diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py
index 5345a4749..98f499065 100644
--- a/spacy/tests/pipeline/test_spancat.py
+++ b/spacy/tests/pipeline/test_spancat.py
@@ -183,3 +183,22 @@ def test_ngram_suggester(en_tokenizer):
     docs = [en_tokenizer(text) for text in ["", "", ""]]
     ngrams = ngram_suggester(docs)
     assert_equal(ngrams.lengths, [len(doc) for doc in docs])
+
+
+def test_ngram_sizes(en_tokenizer):
+    # test that the range suggester works well
+    size_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2, 3])
+    range_suggester = registry.misc.get("spacy.ngram_range_suggester.v1")(min_size=1, max_size=3)
+    docs = [
+        en_tokenizer(text) for text in ["a", "a b", "a b c", "a b c d", "a b c d e"]
+    ]
+    ngrams_1 = size_suggester(docs)
+    ngrams_2 = range_suggester(docs)
+    assert_equal(ngrams_1.lengths, [1, 3, 6, 9, 12])
+    assert_equal(ngrams_1.lengths, ngrams_2.lengths)
+    assert_equal(ngrams_1.data, ngrams_2.data)
+
+    # one more variation
+    range_suggester = registry.misc.get("spacy.ngram_range_suggester.v1")(min_size=2, max_size=4)
+    ngrams_3 = range_suggester(docs)
+    assert_equal(ngrams_3.lengths, [0, 1, 3, 6, 9])
diff --git 
a/website/docs/api/spancategorizer.md b/website/docs/api/spancategorizer.md
index f26dba149..57395846d 100644
--- a/website/docs/api/spancategorizer.md
+++ b/website/docs/api/spancategorizer.md
@@ -451,3 +451,24 @@ integers. The array has two columns, indicating the start and end position.
 | ----------- | -------------------------------------------------------------------------------------------------------------------- |
 | `sizes` | The phrase lengths to suggest. For example, `[1, 2]` will suggest phrases consisting of 1 or 2 tokens. ~~List[int]~~ |
 | **CREATES** | The suggester function. ~~Callable[[List[Doc]], Ragged]~~ |
+
+### spacy.ngram_range_suggester.v1 {#ngram_range_suggester}
+
+> #### Example Config
+>
+> ```ini
+> [components.spancat.suggester]
+> @misc = "spacy.ngram_range_suggester.v1"
+> min_size = 2
+> max_size = 4
+> ```
+
+Suggest all spans of at least length `min_size` and at most length `max_size`
+(both inclusive). Spans are returned as a ragged array of integers. The array
+has two columns, indicating the start and end position.
+
+| Name        | Description                                               |
+| ----------- | --------------------------------------------------------- |
+| `min_size`  | The minimal phrase length to suggest (inclusive). ~~int~~ |
+| `max_size`  | The maximal phrase length to suggest (inclusive). ~~int~~ |
+| **CREATES** | The suggester function. ~~Callable[[List[Doc]], Ragged]~~ |