mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Addition of min_max quantifier in matcher {n,m} (#10981)
* Min_max_operators 1. Modified API and Usage for spaCy website to include min_max operator 2. Modified matcher.pyx to include min_max function {n,m} and its variants 3. Modified schemas.py to include min_max validation error 4. Added test cases to test_matcher_api.py, test_matcher_logic.py and test_pattern_validation.py * attempt to fix mypy/pydantic compat issue * formatting * Update spacy/tests/matcher/test_pattern_validation.py Co-authored-by: Source-Shen <82353723+Source-Shen@users.noreply.github.com> Co-authored-by: svlandeg <svlandeg@github.com> Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:
parent
4581a4f53f
commit
be00db6645
|
@ -86,10 +86,14 @@ cdef class Matcher:
|
||||||
is a dictionary mapping attribute IDs to values, and optionally a
|
is a dictionary mapping attribute IDs to values, and optionally a
|
||||||
quantifier operator under the key "op". The available quantifiers are:
|
quantifier operator under the key "op". The available quantifiers are:
|
||||||
|
|
||||||
'!': Negate the pattern, by requiring it to match exactly 0 times.
|
'!': Negate the pattern, by requiring it to match exactly 0 times.
|
||||||
'?': Make the pattern optional, by allowing it to match 0 or 1 times.
|
'?': Make the pattern optional, by allowing it to match 0 or 1 times.
|
||||||
'+': Require the pattern to match 1 or more times.
|
'+': Require the pattern to match 1 or more times.
|
||||||
'*': Allow the pattern to match zero or more times.
|
'*': Allow the pattern to match zero or more times.
|
||||||
|
'{n}': Require the pattern to match exactly _n_ times.
|
||||||
|
'{n,m}': Require the pattern to match at least _n_ but not more than _m_ times.
|
||||||
|
'{n,}': Require the pattern to match at least _n_ times.
|
||||||
|
'{,m}': Require the pattern to match at most _m_ times.
|
||||||
|
|
||||||
The + and * operators return all possible matches (not just the greedy
|
The + and * operators return all possible matches (not just the greedy
|
||||||
ones). However, the "greedy" argument can filter the final matches
|
ones). However, the "greedy" argument can filter the final matches
|
||||||
|
@ -1004,8 +1008,29 @@ def _get_operators(spec):
|
||||||
return (ONE,)
|
return (ONE,)
|
||||||
elif spec["OP"] in lookup:
|
elif spec["OP"] in lookup:
|
||||||
return lookup[spec["OP"]]
|
return lookup[spec["OP"]]
|
||||||
|
#Min_max {n,m}
|
||||||
|
elif spec["OP"].startswith("{") and spec["OP"].endswith("}"):
|
||||||
|
# {n} --> {n,n} exactly n ONE,(n)
|
||||||
|
# {n,m}--> {n,m} min of n, max of m ONE,(n),ZERO_ONE,(m)
|
||||||
|
# {,m} --> {0,m} min of zero, max of m ZERO_ONE,(m)
|
||||||
|
# {n,} --> {n,∞} min of n, max of inf ONE,(n),ZERO_PLUS
|
||||||
|
|
||||||
|
min_max = spec["OP"][1:-1]
|
||||||
|
min_max = min_max if "," in min_max else f"{min_max},{min_max}"
|
||||||
|
n, m = min_max.split(",")
|
||||||
|
|
||||||
|
#1. Either n or m is a blank string and the other is numeric -->isdigit
|
||||||
|
#2. Both are numeric and n <= m
|
||||||
|
if (not n.isdecimal() and not m.isdecimal()) or (n.isdecimal() and m.isdecimal() and int(n) > int(m)):
|
||||||
|
keys = ", ".join(lookup.keys()) + ", {n}, {n,m}, {n,}, {,m} where n and m are integers and n <= m "
|
||||||
|
raise ValueError(Errors.E011.format(op=spec["OP"], opts=keys))
|
||||||
|
|
||||||
|
# if n is empty string, zero would be used
|
||||||
|
head = tuple(ONE for __ in range(int(n or 0)))
|
||||||
|
tail = tuple(ZERO_ONE for __ in range(int(m) - int(n or 0))) if m else (ZERO_PLUS,)
|
||||||
|
return head + tail
|
||||||
else:
|
else:
|
||||||
keys = ", ".join(lookup.keys())
|
keys = ", ".join(lookup.keys()) + ", {n}, {n,m}, {n,}, {,m} where n and m are integers and n <= m "
|
||||||
raise ValueError(Errors.E011.format(op=spec["OP"], opts=keys))
|
raise ValueError(Errors.E011.format(op=spec["OP"], opts=keys))
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -3,12 +3,13 @@ from typing import Iterable, TypeVar, TYPE_CHECKING
|
||||||
from .compat import Literal
|
from .compat import Literal
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from pydantic import BaseModel, Field, ValidationError, validator, create_model
|
from pydantic import BaseModel, Field, ValidationError, validator, create_model
|
||||||
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
|
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool, ConstrainedStr
|
||||||
from pydantic.main import ModelMetaclass
|
from pydantic.main import ModelMetaclass
|
||||||
from thinc.api import Optimizer, ConfigValidationError, Model
|
from thinc.api import Optimizer, ConfigValidationError, Model
|
||||||
from thinc.config import Promise
|
from thinc.config import Promise
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
import inspect
|
import inspect
|
||||||
|
import re
|
||||||
|
|
||||||
from .attrs import NAMES
|
from .attrs import NAMES
|
||||||
from .lookups import Lookups
|
from .lookups import Lookups
|
||||||
|
@ -198,13 +199,18 @@ class TokenPatternNumber(BaseModel):
|
||||||
return v
|
return v
|
||||||
|
|
||||||
|
|
||||||
class TokenPatternOperator(str, Enum):
|
class TokenPatternOperatorSimple(str, Enum):
|
||||||
plus: StrictStr = StrictStr("+")
|
plus: StrictStr = StrictStr("+")
|
||||||
start: StrictStr = StrictStr("*")
|
star: StrictStr = StrictStr("*")
|
||||||
question: StrictStr = StrictStr("?")
|
question: StrictStr = StrictStr("?")
|
||||||
exclamation: StrictStr = StrictStr("!")
|
exclamation: StrictStr = StrictStr("!")
|
||||||
|
|
||||||
|
|
||||||
|
class TokenPatternOperatorMinMax(ConstrainedStr):
    # Constrained string type for the brace quantifiers "{n}", "{n,}",
    # "{n,m}" and "{,m}", where n and m are runs of digits. The regex only
    # checks the shape; it does not verify that n <= m.
    # Raw string: "\d" must reach the regex engine verbatim rather than be
    # parsed as an (invalid) Python string escape.
    regex = re.compile(r"^({\d+}|{\d+,\d*}|{\d*,\d+})$")
|
||||||
|
|
||||||
|
|
||||||
|
TokenPatternOperator = Union[TokenPatternOperatorSimple, TokenPatternOperatorMinMax]
|
||||||
StringValue = Union[TokenPatternString, StrictStr]
|
StringValue = Union[TokenPatternString, StrictStr]
|
||||||
NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat]
|
NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat]
|
||||||
UnderscoreValue = Union[
|
UnderscoreValue = Union[
|
||||||
|
|
|
@ -680,3 +680,38 @@ def test_matcher_ent_iob_key(en_vocab):
|
||||||
assert matches[0] == "Maria"
|
assert matches[0] == "Maria"
|
||||||
assert matches[1] == "Maria Esperanza"
|
assert matches[1] == "Maria Esperanza"
|
||||||
assert matches[2] == "Esperanza"
|
assert matches[2] == "Esperanza"
|
||||||
|
|
||||||
|
|
||||||
|
def test_matcher_min_max_operator(en_vocab):
    """Check the number of matches each brace quantifier yields on one doc."""
    words = ["foo", "bar", "foo", "foo", "bar", "foo", "foo", "foo", "bar", "bar"]
    doc = Doc(en_vocab, words=words)
    # (operator, expected number of matches), covering in order:
    # exactly n, at least n, at most m, and at least n but at most m.
    expectations = [
        ("{3}", 1),
        ("{2,}", 4),
        ("{,2}", 9),
        ("{2,3}", 4),
    ]
    for op, expected_count in expectations:
        # A fresh Matcher per operator so only one pattern is active at a time.
        matcher = Matcher(en_vocab)
        matcher.add("TEST", [[{"ORTH": "foo", "OP": op}]])
        found = [doc[start:end].text for _, start, end in matcher(doc)]
        assert len(found) == expected_count
|
||||||
|
|
|
@ -699,6 +699,10 @@ def test_matcher_with_alignments_greedy_longest(en_vocab):
|
||||||
("aaaa", "a a a a a?", [0, 1, 2, 3]),
|
("aaaa", "a a a a a?", [0, 1, 2, 3]),
|
||||||
("aaab", "a+ a b", [0, 0, 1, 2]),
|
("aaab", "a+ a b", [0, 0, 1, 2]),
|
||||||
("aaab", "a+ a+ b", [0, 0, 1, 2]),
|
("aaab", "a+ a+ b", [0, 0, 1, 2]),
|
||||||
|
("aaab", "a{2,} b", [0, 0, 0, 1]),
|
||||||
|
("aaab", "a{,3} b", [0, 0, 0, 1]),
|
||||||
|
("aaab", "a{2} b", [0, 0, 1]),
|
||||||
|
("aaab", "a{2,3} b", [0, 0, 0, 1]),
|
||||||
]
|
]
|
||||||
for string, pattern_str, result in cases:
|
for string, pattern_str, result in cases:
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
|
@ -711,6 +715,8 @@ def test_matcher_with_alignments_greedy_longest(en_vocab):
|
||||||
pattern.append({"ORTH": part[0], "OP": "*"})
|
pattern.append({"ORTH": part[0], "OP": "*"})
|
||||||
elif part.endswith("?"):
|
elif part.endswith("?"):
|
||||||
pattern.append({"ORTH": part[0], "OP": "?"})
|
pattern.append({"ORTH": part[0], "OP": "?"})
|
||||||
|
elif part.endswith("}"):
|
||||||
|
pattern.append({"ORTH": part[0], "OP": part[1:]})
|
||||||
else:
|
else:
|
||||||
pattern.append({"ORTH": part})
|
pattern.append({"ORTH": part})
|
||||||
matcher.add("PATTERN", [pattern], greedy="LONGEST")
|
matcher.add("PATTERN", [pattern], greedy="LONGEST")
|
||||||
|
@ -722,7 +728,7 @@ def test_matcher_with_alignments_greedy_longest(en_vocab):
|
||||||
assert expected == result, (string, pattern_str, s, e, n_matches)
|
assert expected == result, (string, pattern_str, s, e, n_matches)
|
||||||
|
|
||||||
|
|
||||||
def test_matcher_with_alignments_nongreedy(en_vocab):
|
def test_matcher_with_alignments_non_greedy(en_vocab):
|
||||||
cases = [
|
cases = [
|
||||||
(0, "aaab", "a* b", [[0, 1], [0, 0, 1], [0, 0, 0, 1], [1]]),
|
(0, "aaab", "a* b", [[0, 1], [0, 0, 1], [0, 0, 0, 1], [1]]),
|
||||||
(1, "baab", "b a* b", [[0, 1, 1, 2]]),
|
(1, "baab", "b a* b", [[0, 1, 1, 2]]),
|
||||||
|
@ -752,6 +758,10 @@ def test_matcher_with_alignments_nongreedy(en_vocab):
|
||||||
(15, "aaaa", "a a a a a?", [[0, 1, 2, 3]]),
|
(15, "aaaa", "a a a a a?", [[0, 1, 2, 3]]),
|
||||||
(16, "aaab", "a+ a b", [[0, 1, 2], [0, 0, 1, 2]]),
|
(16, "aaab", "a+ a b", [[0, 1, 2], [0, 0, 1, 2]]),
|
||||||
(17, "aaab", "a+ a+ b", [[0, 1, 2], [0, 0, 1, 2]]),
|
(17, "aaab", "a+ a+ b", [[0, 1, 2], [0, 0, 1, 2]]),
|
||||||
|
(18, "aaab", "a{2,} b", [[0, 0, 1], [0, 0, 0, 1]]),
|
||||||
|
(19, "aaab", "a{3} b", [[0, 0, 0, 1]]),
|
||||||
|
(20, "aaab", "a{2} b", [[0, 0, 1]]),
|
||||||
|
(21, "aaab", "a{2,3} b", [[0, 0, 1], [0, 0, 0, 1]]),
|
||||||
]
|
]
|
||||||
for case_id, string, pattern_str, results in cases:
|
for case_id, string, pattern_str, results in cases:
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
|
@ -764,6 +774,8 @@ def test_matcher_with_alignments_nongreedy(en_vocab):
|
||||||
pattern.append({"ORTH": part[0], "OP": "*"})
|
pattern.append({"ORTH": part[0], "OP": "*"})
|
||||||
elif part.endswith("?"):
|
elif part.endswith("?"):
|
||||||
pattern.append({"ORTH": part[0], "OP": "?"})
|
pattern.append({"ORTH": part[0], "OP": "?"})
|
||||||
|
elif part.endswith("}"):
|
||||||
|
pattern.append({"ORTH": part[0], "OP": part[1:]})
|
||||||
else:
|
else:
|
||||||
pattern.append({"ORTH": part})
|
pattern.append({"ORTH": part})
|
||||||
|
|
||||||
|
|
|
@ -14,6 +14,14 @@ TEST_PATTERNS = [
|
||||||
('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1),
|
('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1),
|
||||||
([{"ENT_IOB": "foo"}], 1, 1),
|
([{"ENT_IOB": "foo"}], 1, 1),
|
||||||
([1, 2, 3], 3, 1),
|
([1, 2, 3], 3, 1),
|
||||||
|
([{"TEXT": "foo", "OP": "{,}"}], 1, 1),
|
||||||
|
([{"TEXT": "foo", "OP": "{,4}4"}], 1, 1),
|
||||||
|
([{"TEXT": "foo", "OP": "{a,3}"}], 1, 1),
|
||||||
|
([{"TEXT": "foo", "OP": "{a}"}], 1, 1),
|
||||||
|
([{"TEXT": "foo", "OP": "{,a}"}], 1, 1),
|
||||||
|
([{"TEXT": "foo", "OP": "{1,2,3}"}], 1, 1),
|
||||||
|
([{"TEXT": "foo", "OP": "{1, 3}"}], 1, 1),
|
||||||
|
([{"TEXT": "foo", "OP": "{-2}"}], 1, 1),
|
||||||
# Bad patterns flagged outside of Matcher
|
# Bad patterns flagged outside of Matcher
|
||||||
([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0), # prev: (1, 0)
|
([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0), # prev: (1, 0)
|
||||||
# Bad patterns not flagged with minimal checks
|
# Bad patterns not flagged with minimal checks
|
||||||
|
@ -38,6 +46,7 @@ TEST_PATTERNS = [
|
||||||
([{"SENT_START": True}], 0, 0),
|
([{"SENT_START": True}], 0, 0),
|
||||||
([{"ENT_ID": "STRING"}], 0, 0),
|
([{"ENT_ID": "STRING"}], 0, 0),
|
||||||
([{"ENT_KB_ID": "STRING"}], 0, 0),
|
([{"ENT_KB_ID": "STRING"}], 0, 0),
|
||||||
|
([{"TEXT": "ha", "OP": "{3}"}], 0, 0),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -59,15 +59,20 @@ matched:
|
||||||
> [
|
> [
|
||||||
> {"POS": "ADJ", "OP": "*"},
|
> {"POS": "ADJ", "OP": "*"},
|
||||||
> {"POS": "NOUN", "OP": "+"}
|
> {"POS": "NOUN", "OP": "+"},
|
||||||
|
> {"POS": "PROPN", "OP": "{2}"}
|
||||||
> ]
|
> ]
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| OP | Description |
|
| OP | Description |
|
||||||
| --- | ---------------------------------------------------------------- |
|
|---------|------------------------------------------------------------------------|
|
||||||
| `!` | Negate the pattern, by requiring it to match exactly 0 times. |
|
| `!` | Negate the pattern, by requiring it to match exactly 0 times. |
|
||||||
| `?` | Make the pattern optional, by allowing it to match 0 or 1 times. |
|
| `?` | Make the pattern optional, by allowing it to match 0 or 1 times. |
|
||||||
| `+` | Require the pattern to match 1 or more times. |
|
| `+` | Require the pattern to match 1 or more times. |
|
||||||
| `*` | Allow the pattern to match 0 or more times. |
|
| `*` | Allow the pattern to match 0 or more times. |
|
||||||
|
| `{n}` | Require the pattern to match exactly _n_ times. |
|
||||||
|
| `{n,m}` | Require the pattern to match at least _n_ but not more than _m_ times. |
|
||||||
|
| `{n,}` | Require the pattern to match at least _n_ times. |
|
||||||
|
| `{,m}` | Require the pattern to match at most _m_ times. |
|
||||||
|
|
||||||
Token patterns can also map to a **dictionary of properties** instead of a
|
Token patterns can also map to a **dictionary of properties** instead of a
|
||||||
single value to indicate whether the expected value is a member of a list or how
|
single value to indicate whether the expected value is a member of a list or how
|
||||||
|
|
|
@ -374,12 +374,16 @@ punctuation marks, or specify optional tokens. Note that there are no nested or
|
||||||
scoped quantifiers – instead, you can build those behaviors with `on_match`
|
scoped quantifiers – instead, you can build those behaviors with `on_match`
|
||||||
callbacks.
|
callbacks.
|
||||||
|
|
||||||
| OP | Description |
|
| OP | Description |
|
||||||
| --- | ---------------------------------------------------------------- |
|
|---------|------------------------------------------------------------------------|
|
||||||
| `!` | Negate the pattern, by requiring it to match exactly 0 times. |
|
| `!` | Negate the pattern, by requiring it to match exactly 0 times. |
|
||||||
| `?` | Make the pattern optional, by allowing it to match 0 or 1 times. |
|
| `?` | Make the pattern optional, by allowing it to match 0 or 1 times. |
|
||||||
| `+` | Require the pattern to match 1 or more times. |
|
| `+` | Require the pattern to match 1 or more times. |
|
||||||
| `*` | Allow the pattern to match zero or more times. |
|
| `*` | Allow the pattern to match zero or more times. |
|
||||||
|
| `{n}` | Require the pattern to match exactly _n_ times. |
|
||||||
|
| `{n,m}` | Require the pattern to match at least _n_ but not more than _m_ times. |
|
||||||
|
| `{n,}` | Require the pattern to match at least _n_ times. |
|
||||||
|
| `{,m}` | Require the pattern to match at most _m_ times. |
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
|
|
Loading…
Reference in New Issue
Block a user