spacy.ngram_range_suggester.v1 (#8699)

This commit is contained in:
Sofie Van Landeghem 2021-07-15 10:01:22 +02:00 committed by GitHub
parent e117573822
commit 77859beb99
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 49 additions and 0 deletions

View File

@ -78,6 +78,15 @@ def build_ngram_suggester(sizes: List[int]) -> Callable[[List[Doc]], Ragged]:
return ngram_suggester
@registry.misc("spacy.ngram_range_suggester.v1")
def build_ngram_range_suggester(min_size: int, max_size: int) -> Callable[[List[Doc]], Ragged]:
"""Suggest all spans of the given lengths between a given min and max value - both inclusive.
Spans are returned as a ragged array of integers. The array has two columns,
indicating the start and end position."""
sizes = range(min_size, max_size+1)
return build_ngram_suggester(sizes)
@Language.factory(
"spancat",
assigns=["doc.spans"],

View File

@ -183,3 +183,22 @@ def test_ngram_suggester(en_tokenizer):
docs = [en_tokenizer(text) for text in ["", "", ""]]
ngrams = ngram_suggester(docs)
assert_equal(ngrams.lengths, [len(doc) for doc in docs])
def test_ngram_sizes(en_tokenizer):
# test that the range suggester works well
size_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2, 3])
range_suggester = registry.misc.get("spacy.ngram_range_suggester.v1")(min_size=1, max_size=3)
docs = [
en_tokenizer(text) for text in ["a", "a b", "a b c", "a b c d", "a b c d e"]
]
ngrams_1 = size_suggester(docs)
ngrams_2 = range_suggester(docs)
assert_equal(ngrams_1.lengths, [1, 3, 6, 9, 12])
assert_equal(ngrams_1.lengths, ngrams_2.lengths)
assert_equal(ngrams_1.data, ngrams_2.data)
# one more variation
range_suggester = registry.misc.get("spacy.ngram_range_suggester.v1")(min_size=2, max_size=4)
ngrams_3 = range_suggester(docs)
assert_equal(ngrams_3.lengths, [0, 1, 3, 6, 9])

View File

@ -451,3 +451,24 @@ integers. The array has two columns, indicating the start and end position.
| ----------- | -------------------------------------------------------------------------------------------------------------------- |
| `sizes` | The phrase lengths to suggest. For example, `[1, 2]` will suggest phrases consisting of 1 or 2 tokens. ~~List[int]~~ |
| **CREATES** | The suggester function. ~~Callable[[List[Doc]], Ragged]~~ |
### spacy.ngram_range_suggester.v1 {#ngram_range_suggester}
> #### Example Config
>
> ```ini
> [components.spancat.suggester]
> @misc = "spacy.ngram_range_suggester.v1"
> min_size = 2
> max_size = 4
> ```
Suggest all spans of at least length `min_size` and at most length `max_size`
(both inclusive). Spans are returned as a ragged array of integers. The array
has two columns, indicating the start and end position.
| Name | Description |
| ----------- | ------------------------------------------------------------ |
| `min_size` | The minimal phrase lengths to suggest (inclusive). ~~[int]~~ |
| `max_size` | The maximal phrase lengths to suggest (exclusive). ~~[int]~~ |
| **CREATES** | The suggester function. ~~Callable[[List[Doc]], Ragged]~~ |