mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
spacy.ngram_range_suggester.v1 (#8699)
This commit is contained in:
parent
e117573822
commit
77859beb99
|
@ -78,6 +78,15 @@ def build_ngram_suggester(sizes: List[int]) -> Callable[[List[Doc]], Ragged]:
|
|||
return ngram_suggester
|
||||
|
||||
|
||||
@registry.misc("spacy.ngram_range_suggester.v1")
def build_ngram_range_suggester(
    min_size: int, max_size: int
) -> Callable[[List[Doc]], Ragged]:
    """Suggest all spans whose length lies between a given min and max value,
    both inclusive.

    Spans are returned as a ragged array of integers. The array has two columns,
    indicating the start and end position.

    min_size (int): The shortest span length to suggest (inclusive).
    max_size (int): The longest span length to suggest (inclusive).
    RETURNS (Callable[[List[Doc]], Ragged]): The suggester produced by
        `build_ngram_suggester` for the sizes min_size, ..., max_size.
    """
    # `build_ngram_suggester` is annotated as taking a List[int], so hand it a
    # real list rather than a lazy range object. The +1 makes max_size inclusive.
    sizes = list(range(min_size, max_size + 1))
    return build_ngram_suggester(sizes)
|
||||
|
||||
|
||||
@Language.factory(
|
||||
"spancat",
|
||||
assigns=["doc.spans"],
|
||||
|
|
|
@ -183,3 +183,22 @@ def test_ngram_suggester(en_tokenizer):
|
|||
docs = [en_tokenizer(text) for text in ["", "", ""]]
|
||||
ngrams = ngram_suggester(docs)
|
||||
assert_equal(ngrams.lengths, [len(doc) for doc in docs])
|
||||
|
||||
|
||||
def test_ngram_sizes(en_tokenizer):
    """Check the range suggester against the explicit-sizes suggester."""
    # Build both suggesters for span lengths 1..3: once with the sizes spelled
    # out, once as an inclusive min/max range.
    make_sized = registry.misc.get("spacy.ngram_suggester.v1")
    by_sizes = make_sized(sizes=[1, 2, 3])
    make_ranged = registry.misc.get("spacy.ngram_range_suggester.v1")
    by_range = make_ranged(min_size=1, max_size=3)

    # Documents of 1 to 5 tokens.
    texts = ["a", "a b", "a b c", "a b c d", "a b c d e"]
    docs = [en_tokenizer(text) for text in texts]

    expected = by_sizes(docs)
    actual = by_range(docs)
    assert_equal(expected.lengths, [1, 3, 6, 9, 12])
    assert_equal(expected.lengths, actual.lengths)
    assert_equal(expected.data, actual.data)

    # one more variation: span lengths 2..4
    make_ranged = registry.misc.get("spacy.ngram_range_suggester.v1")
    shifted = make_ranged(min_size=2, max_size=4)(docs)
    assert_equal(shifted.lengths, [0, 1, 3, 6, 9])
|
||||
|
|
|
@ -451,3 +451,24 @@ integers. The array has two columns, indicating the start and end position.
|
|||
| ----------- | -------------------------------------------------------------------------------------------------------------------- |
|
||||
| `sizes` | The phrase lengths to suggest. For example, `[1, 2]` will suggest phrases consisting of 1 or 2 tokens. ~~List[int]~~ |
|
||||
| **CREATES** | The suggester function. ~~Callable[[List[Doc]], Ragged]~~ |
|
||||
|
||||
### spacy.ngram_range_suggester.v1 {#ngram_range_suggester}
|
||||
|
||||
> #### Example Config
|
||||
>
|
||||
> ```ini
|
||||
> [components.spancat.suggester]
|
||||
> @misc = "spacy.ngram_range_suggester.v1"
|
||||
> min_size = 2
|
||||
> max_size = 4
|
||||
> ```
|
||||
|
||||
Suggest all spans of at least length `min_size` and at most length `max_size`
|
||||
(both inclusive). Spans are returned as a ragged array of integers. The array
|
||||
has two columns, indicating the start and end position.
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------ |
|
||||
| `min_size`  | The minimal phrase length to suggest (inclusive). ~~int~~    |
|
||||
| `max_size`  | The maximal phrase length to suggest (inclusive). ~~int~~    |
|
||||
| **CREATES** | The suggester function. ~~Callable[[List[Doc]], Ragged]~~ |
|
||||
|
|
Loading…
Reference in New Issue
Block a user