mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Addition of min_max quantifier in matcher {n,m} (#10981)
* Min_max_operators 1. Modified API and Usage for spaCy website to include min_max operator 2. Modified matcher.pyx to include min_max function {n,m} and its variants 3. Modified schemas.py to include min_max validation error 4. Added test cases to test_matcher_api.py, test_matcher_logic.py and test_pattern_validation.py * attempt to fix mypy/pydantic compat issue * formatting * Update spacy/tests/matcher/test_pattern_validation.py Co-authored-by: Source-Shen <82353723+Source-Shen@users.noreply.github.com> Co-authored-by: svlandeg <svlandeg@github.com> Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:
parent
4581a4f53f
commit
be00db6645
|
@ -90,6 +90,10 @@ cdef class Matcher:
|
|||
'?': Make the pattern optional, by allowing it to match 0 or 1 times.
|
||||
'+': Require the pattern to match 1 or more times.
|
||||
'*': Allow the pattern to match zero or more times.
|
||||
'{n}': Require the pattern to match exactly _n_ times.
|
||||
'{n,m}': Require the pattern to match at least _n_ but not more than _m_ times.
|
||||
'{n,}': Require the pattern to match at least _n_ times.
|
||||
'{,m}': Require the pattern to match at most _m_ times.
|
||||
|
||||
The + and * operators return all possible matches (not just the greedy
|
||||
ones). However, the "greedy" argument can filter the final matches
|
||||
|
@ -1004,8 +1008,29 @@ def _get_operators(spec):
|
|||
return (ONE,)
|
||||
elif spec["OP"] in lookup:
|
||||
return lookup[spec["OP"]]
|
||||
#Min_max {n,m}
|
||||
elif spec["OP"].startswith("{") and spec["OP"].endswith("}"):
|
||||
# {n} --> {n,n} exactly n ONE,(n)
|
||||
# {n,m}--> {n,m} min of n, max of m ONE,(n),ZERO_ONE,(m)
|
||||
# {,m} --> {0,m} min of zero, max of m ZERO_ONE,(m)
|
||||
# {n,} --> {n,∞} min of n, max of inf ONE,(n),ZERO_PLUS
|
||||
|
||||
min_max = spec["OP"][1:-1]
|
||||
min_max = min_max if "," in min_max else f"{min_max},{min_max}"
|
||||
n, m = min_max.split(",")
|
||||
|
||||
#1. Either n or m is a blank string and the other is numeric -->isdigit
|
||||
#2. Both are numeric and n <= m
|
||||
if (not n.isdecimal() and not m.isdecimal()) or (n.isdecimal() and m.isdecimal() and int(n) > int(m)):
|
||||
keys = ", ".join(lookup.keys()) + ", {n}, {n,m}, {n,}, {,m} where n and m are integers and n <= m "
|
||||
raise ValueError(Errors.E011.format(op=spec["OP"], opts=keys))
|
||||
|
||||
# if n is empty string, zero would be used
|
||||
head = tuple(ONE for __ in range(int(n or 0)))
|
||||
tail = tuple(ZERO_ONE for __ in range(int(m) - int(n or 0))) if m else (ZERO_PLUS,)
|
||||
return head + tail
|
||||
else:
|
||||
keys = ", ".join(lookup.keys())
|
||||
keys = ", ".join(lookup.keys()) + ", {n}, {n,m}, {n,}, {,m} where n and m are integers and n <= m "
|
||||
raise ValueError(Errors.E011.format(op=spec["OP"], opts=keys))
|
||||
|
||||
|
||||
|
|
|
@ -3,12 +3,13 @@ from typing import Iterable, TypeVar, TYPE_CHECKING
|
|||
from .compat import Literal
|
||||
from enum import Enum
|
||||
from pydantic import BaseModel, Field, ValidationError, validator, create_model
|
||||
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
|
||||
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool, ConstrainedStr
|
||||
from pydantic.main import ModelMetaclass
|
||||
from thinc.api import Optimizer, ConfigValidationError, Model
|
||||
from thinc.config import Promise
|
||||
from collections import defaultdict
|
||||
import inspect
|
||||
import re
|
||||
|
||||
from .attrs import NAMES
|
||||
from .lookups import Lookups
|
||||
|
@ -198,13 +199,18 @@ class TokenPatternNumber(BaseModel):
|
|||
return v
|
||||
|
||||
|
||||
class TokenPatternOperator(str, Enum):
|
||||
class TokenPatternOperatorSimple(str, Enum):
|
||||
plus: StrictStr = StrictStr("+")
|
||||
start: StrictStr = StrictStr("*")
|
||||
star: StrictStr = StrictStr("*")
|
||||
question: StrictStr = StrictStr("?")
|
||||
exclamation: StrictStr = StrictStr("!")
|
||||
|
||||
|
||||
class TokenPatternOperatorMinMax(ConstrainedStr):
    """String type for the brace-style quantifiers of a token pattern "OP".

    Accepted spellings are "{n}", "{n,}", "{,m}" and "{n,m}", where n and m
    are runs of digits with no sign and no surrounding whitespace. "{,}" is
    rejected because at least one bound must be present. The pattern only
    checks the shape of the string; it does not compare n against m.
    """

    # Raw string: "\d" in a normal string literal is an invalid escape
    # sequence (DeprecationWarning, SyntaxWarning from Python 3.12). The
    # compiled pattern is unchanged.
    regex = re.compile(r"^({\d+}|{\d+,\d*}|{\d*,\d+})$")
|
||||
|
||||
|
||||
TokenPatternOperator = Union[TokenPatternOperatorSimple, TokenPatternOperatorMinMax]
|
||||
StringValue = Union[TokenPatternString, StrictStr]
|
||||
NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat]
|
||||
UnderscoreValue = Union[
|
||||
|
|
|
@ -680,3 +680,38 @@ def test_matcher_ent_iob_key(en_vocab):
|
|||
assert matches[0] == "Maria"
|
||||
assert matches[1] == "Maria Esperanza"
|
||||
assert matches[2] == "Esperanza"
|
||||
|
||||
|
||||
def test_matcher_min_max_operator(en_vocab):
    """Check the number of matches produced by each brace quantifier form."""
    words = ["foo", "bar", "foo", "foo", "bar", "foo", "foo", "foo", "bar", "bar"]
    doc = Doc(en_vocab, words=words)

    # (operator, expected number of matches of "foo" in the doc above)
    expectations = (
        ("{3}", 1),  # exactly n matches {n}
        ("{2,}", 4),  # at least n matches {n,}
        ("{,2}", 9),  # at most m matches {,m}
        ("{2,3}", 4),  # at least n and at most m matches {n,m}
    )
    for operator, expected_count in expectations:
        # A fresh matcher per case so that only one pattern is active.
        matcher = Matcher(en_vocab)
        matcher.add("TEST", [[{"ORTH": "foo", "OP": operator}]])
        spans = [doc[start:end].text for _, start, end in matcher(doc)]
        assert len(spans) == expected_count
|
||||
|
|
|
@ -699,6 +699,10 @@ def test_matcher_with_alignments_greedy_longest(en_vocab):
|
|||
("aaaa", "a a a a a?", [0, 1, 2, 3]),
|
||||
("aaab", "a+ a b", [0, 0, 1, 2]),
|
||||
("aaab", "a+ a+ b", [0, 0, 1, 2]),
|
||||
("aaab", "a{2,} b", [0, 0, 0, 1]),
|
||||
("aaab", "a{,3} b", [0, 0, 0, 1]),
|
||||
("aaab", "a{2} b", [0, 0, 1]),
|
||||
("aaab", "a{2,3} b", [0, 0, 0, 1]),
|
||||
]
|
||||
for string, pattern_str, result in cases:
|
||||
matcher = Matcher(en_vocab)
|
||||
|
@ -711,6 +715,8 @@ def test_matcher_with_alignments_greedy_longest(en_vocab):
|
|||
pattern.append({"ORTH": part[0], "OP": "*"})
|
||||
elif part.endswith("?"):
|
||||
pattern.append({"ORTH": part[0], "OP": "?"})
|
||||
elif part.endswith("}"):
|
||||
pattern.append({"ORTH": part[0], "OP": part[1:]})
|
||||
else:
|
||||
pattern.append({"ORTH": part})
|
||||
matcher.add("PATTERN", [pattern], greedy="LONGEST")
|
||||
|
@ -722,7 +728,7 @@ def test_matcher_with_alignments_greedy_longest(en_vocab):
|
|||
assert expected == result, (string, pattern_str, s, e, n_matches)
|
||||
|
||||
|
||||
def test_matcher_with_alignments_nongreedy(en_vocab):
|
||||
def test_matcher_with_alignments_non_greedy(en_vocab):
|
||||
cases = [
|
||||
(0, "aaab", "a* b", [[0, 1], [0, 0, 1], [0, 0, 0, 1], [1]]),
|
||||
(1, "baab", "b a* b", [[0, 1, 1, 2]]),
|
||||
|
@ -752,6 +758,10 @@ def test_matcher_with_alignments_nongreedy(en_vocab):
|
|||
(15, "aaaa", "a a a a a?", [[0, 1, 2, 3]]),
|
||||
(16, "aaab", "a+ a b", [[0, 1, 2], [0, 0, 1, 2]]),
|
||||
(17, "aaab", "a+ a+ b", [[0, 1, 2], [0, 0, 1, 2]]),
|
||||
(18, "aaab", "a{2,} b", [[0, 0, 1], [0, 0, 0, 1]]),
|
||||
(19, "aaab", "a{3} b", [[0, 0, 0, 1]]),
|
||||
(20, "aaab", "a{2} b", [[0, 0, 1]]),
|
||||
(21, "aaab", "a{2,3} b", [[0, 0, 1], [0, 0, 0, 1]]),
|
||||
]
|
||||
for case_id, string, pattern_str, results in cases:
|
||||
matcher = Matcher(en_vocab)
|
||||
|
@ -764,6 +774,8 @@ def test_matcher_with_alignments_nongreedy(en_vocab):
|
|||
pattern.append({"ORTH": part[0], "OP": "*"})
|
||||
elif part.endswith("?"):
|
||||
pattern.append({"ORTH": part[0], "OP": "?"})
|
||||
elif part.endswith("}"):
|
||||
pattern.append({"ORTH": part[0], "OP": part[1:]})
|
||||
else:
|
||||
pattern.append({"ORTH": part})
|
||||
|
||||
|
|
|
@ -14,6 +14,14 @@ TEST_PATTERNS = [
|
|||
('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1),
|
||||
([{"ENT_IOB": "foo"}], 1, 1),
|
||||
([1, 2, 3], 3, 1),
|
||||
([{"TEXT": "foo", "OP": "{,}"}], 1, 1),
|
||||
([{"TEXT": "foo", "OP": "{,4}4"}], 1, 1),
|
||||
([{"TEXT": "foo", "OP": "{a,3}"}], 1, 1),
|
||||
([{"TEXT": "foo", "OP": "{a}"}], 1, 1),
|
||||
([{"TEXT": "foo", "OP": "{,a}"}], 1, 1),
|
||||
([{"TEXT": "foo", "OP": "{1,2,3}"}], 1, 1),
|
||||
([{"TEXT": "foo", "OP": "{1, 3}"}], 1, 1),
|
||||
([{"TEXT": "foo", "OP": "{-2}"}], 1, 1),
|
||||
# Bad patterns flagged outside of Matcher
|
||||
([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0), # prev: (1, 0)
|
||||
# Bad patterns not flagged with minimal checks
|
||||
|
@ -38,6 +46,7 @@ TEST_PATTERNS = [
|
|||
([{"SENT_START": True}], 0, 0),
|
||||
([{"ENT_ID": "STRING"}], 0, 0),
|
||||
([{"ENT_KB_ID": "STRING"}], 0, 0),
|
||||
([{"TEXT": "ha", "OP": "{3}"}], 0, 0),
|
||||
]
|
||||
|
||||
|
||||
|
|
|
@ -59,15 +59,20 @@ matched:
|
|||
> [
|
||||
> {"POS": "ADJ", "OP": "*"},
|
||||
> {"POS": "NOUN", "OP": "+"},
|
||||
> {"POS": "PROPN", "OP": "{2}"}
|
||||
> ]
|
||||
> ```
|
||||
|
||||
| OP | Description |
|
||||
| --- | ---------------------------------------------------------------- |
|
||||
|---------|------------------------------------------------------------------------|
|
||||
| `!` | Negate the pattern, by requiring it to match exactly 0 times. |
|
||||
| `?` | Make the pattern optional, by allowing it to match 0 or 1 times. |
|
||||
| `+` | Require the pattern to match 1 or more times. |
|
||||
| `*` | Allow the pattern to match 0 or more times. |
|
||||
| `{n}` | Require the pattern to match exactly _n_ times. |
|
||||
| `{n,m}` | Require the pattern to match at least _n_ but not more than _m_ times. |
|
||||
| `{n,}` | Require the pattern to match at least _n_ times. |
|
||||
| `{,m}` | Require the pattern to match at most _m_ times. |
|
||||
|
||||
Token patterns can also map to a **dictionary of properties** instead of a
|
||||
single value to indicate whether the expected value is a member of a list or how
|
||||
|
|
|
@ -375,11 +375,15 @@ scoped quantifiers – instead, you can build those behaviors with `on_match`
|
|||
callbacks.
|
||||
|
||||
| OP | Description |
|
||||
| --- | ---------------------------------------------------------------- |
|
||||
|---------|------------------------------------------------------------------------|
|
||||
| `!` | Negate the pattern, by requiring it to match exactly 0 times. |
|
||||
| `?` | Make the pattern optional, by allowing it to match 0 or 1 times. |
|
||||
| `+` | Require the pattern to match 1 or more times. |
|
||||
| `*` | Allow the pattern to match zero or more times. |
|
||||
| `{n}` | Require the pattern to match exactly _n_ times. |
|
||||
| `{n,m}` | Require the pattern to match at least _n_ but not more than _m_ times. |
|
||||
| `{n,}` | Require the pattern to match at least _n_ times. |
|
||||
| `{,m}` | Require the pattern to match at most _m_ times. |
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
|
Loading…
Reference in New Issue
Block a user