Mirror of https://github.com/explosion/spaCy.git (synced 2025-08-08 14:14:57 +03:00)

Commit 9accdbdbad
```diff
@@ -103,6 +103,10 @@ cuda114 =
     cupy-cuda114>=5.0.0b4,<11.0.0
 cuda115 =
     cupy-cuda115>=5.0.0b4,<11.0.0
+cuda116 =
+    cupy-cuda116>=5.0.0b4,<11.0.0
+cuda117 =
+    cupy-cuda117>=5.0.0b4,<11.0.0
 apple =
     thinc-apple-ops>=0.1.0.dev0,<1.0.0
 # Language tokenizers with external dependencies
```
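The new `cuda116` and `cuda117` extras follow the existing pattern: each pins the matching CuPy wheel, so GPU support for the new CUDA versions can be installed with, e.g., `pip install -U 'spacy[cuda117]'`.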
```diff
@@ -209,6 +209,9 @@ class Warnings(metaclass=ErrorsWithCodes):
             "Only the last span group will be loaded under "
             "Doc.spans['{group_name}']. Skipping span group with values: "
             "{group_values}")
+    W121 = ("Attempting to trace non-existent method '{method}' in pipe '{pipe}'")
+    W122 = ("Couldn't trace method '{method}' in pipe '{pipe}'. This can happen if the pipe class "
+            "is a Cython extension type.")


 class Errors(metaclass=ErrorsWithCodes):
```
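The two new warning templates are plain format strings; the callbacks diff further down fills them in before emitting a standard Python warning. A minimal sketch of that call pattern:

```python
# Minimal sketch of how the new templates are consumed (mirrors the
# callbacks diff below); Warnings.W121 is an ordinary format string.
import warnings

from spacy.errors import Warnings

warnings.warn(Warnings.W121.format(method="predict", pipe="ner"))
```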
```diff
@@ -90,6 +90,10 @@ cdef class Matcher:
         '?': Make the pattern optional, by allowing it to match 0 or 1 times.
         '+': Require the pattern to match 1 or more times.
         '*': Allow the pattern to match zero or more times.
+        '{n}': Require the pattern to match exactly _n_ times.
+        '{n,m}': Require the pattern to match at least _n_ but not more than _m_ times.
+        '{n,}': Require the pattern to match at least _n_ times.
+        '{,m}': Require the pattern to match at most _m_ times.

         The + and * operators return all possible matches (not just the greedy
         ones). However, the "greedy" argument can filter the final matches
```
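To make the new quantifiers concrete, here is a minimal usage sketch (the pattern name and text are invented for illustration):

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# "{2,3}" requires "very" to match two or three consecutive times.
matcher.add("VERY_GOOD", [[{"LOWER": "very", "OP": "{2,3}"}, {"LOWER": "good"}]])

doc = nlp("This is very very very good.")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)
# Prints both "very very good" and "very very very good"; pass
# greedy="LONGEST" to Matcher.add to keep only the longest match.
```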
```diff
@@ -1004,8 +1008,29 @@ def _get_operators(spec):
         return (ONE,)
     elif spec["OP"] in lookup:
         return lookup[spec["OP"]]
+    # Min/max quantifier, e.g. {n,m}
+    elif spec["OP"].startswith("{") and spec["OP"].endswith("}"):
+        # {n}   --> {n,n}  exactly n:              ONE * n
+        # {n,m} --> {n,m}  min n, max m:           ONE * n, ZERO_ONE * (m - n)
+        # {,m}  --> {0,m}  min zero, max m:        ZERO_ONE * m
+        # {n,}  --> {n,∞}  min n, no upper bound:  ONE * n, ZERO_PLUS
+
+        min_max = spec["OP"][1:-1]
+        min_max = min_max if "," in min_max else f"{min_max},{min_max}"
+        n, m = min_max.split(",")
+
+        # Valid forms: either n or m is a blank string and the other is
+        # numeric, or both are numeric with n <= m (checked via isdecimal).
+        if (not n.isdecimal() and not m.isdecimal()) or (n.isdecimal() and m.isdecimal() and int(n) > int(m)):
+            keys = ", ".join(lookup.keys()) + ", {n}, {n,m}, {n,}, {,m} where n and m are integers and n <= m "
+            raise ValueError(Errors.E011.format(op=spec["OP"], opts=keys))
+
+        # If n is the empty string, zero is used.
+        head = tuple(ONE for __ in range(int(n or 0)))
+        tail = tuple(ZERO_ONE for __ in range(int(m) - int(n or 0))) if m else (ZERO_PLUS,)
+        return head + tail
     else:
-        keys = ", ".join(lookup.keys())
+        keys = ", ".join(lookup.keys()) + ", {n}, {n,m}, {n,}, {,m} where n and m are integers and n <= m "
         raise ValueError(Errors.E011.format(op=spec["OP"], opts=keys))
```
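To see what the expansion produces, here is a standalone sketch of the same translation, using string stand-ins for the internal `ONE`/`ZERO_ONE`/`ZERO_PLUS` codes (the helper name and stand-in values are invented for illustration):

```python
ONE, ZERO_ONE, ZERO_PLUS = "1", "?", "*"  # stand-ins for the Matcher constants

def expand_min_max(op: str):
    """Mirror of the {n,m} expansion above, for illustration only."""
    min_max = op[1:-1]
    min_max = min_max if "," in min_max else f"{min_max},{min_max}"
    n, m = min_max.split(",")
    head = tuple(ONE for _ in range(int(n or 0)))
    tail = tuple(ZERO_ONE for _ in range(int(m) - int(n or 0))) if m else (ZERO_PLUS,)
    return head + tail

print(expand_min_max("{2,4}"))  # ('1', '1', '?', '?')
print(expand_min_max("{3}"))    # ('1', '1', '1')
print(expand_min_max("{,2}"))   # ('?', '?')
print(expand_min_max("{2,}"))   # ('1', '1', '*')
```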
```diff
@@ -1,9 +1,14 @@
-from functools import partial
-from typing import Type, Callable, TYPE_CHECKING
+from typing import Type, Callable, Dict, TYPE_CHECKING, List, Optional, Set
+import functools
+import inspect
+import types
+import warnings

 from thinc.layers import with_nvtx_range
 from thinc.model import Model, wrap_model_recursive
+from thinc.util import use_nvtx_range

+from ..errors import Warnings
 from ..util import registry

 if TYPE_CHECKING:
```
```diff
@@ -11,29 +16,106 @@ if TYPE_CHECKING:
     from ..language import Language  # noqa: F401


-@registry.callbacks("spacy.models_with_nvtx_range.v1")
-def create_models_with_nvtx_range(
-    forward_color: int = -1, backprop_color: int = -1
-) -> Callable[["Language"], "Language"]:
-    def models_with_nvtx_range(nlp):
-        pipes = [
-            pipe
-            for _, pipe in nlp.components
-            if hasattr(pipe, "is_trainable") and pipe.is_trainable
-        ]
-
-        # We need process all models jointly to avoid wrapping callbacks twice.
-        models = Model(
-            "wrap_with_nvtx_range",
-            forward=lambda model, X, is_train: ...,
-            layers=[pipe.model for pipe in pipes],
-        )
-
-        for node in models.walk():
-            with_nvtx_range(
-                node, forward_color=forward_color, backprop_color=backprop_color
-            )
-
-        return nlp
-
-    return models_with_nvtx_range
+DEFAULT_NVTX_ANNOTATABLE_PIPE_METHODS = [
+    "pipe",
+    "predict",
+    "set_annotations",
+    "update",
+    "rehearse",
+    "get_loss",
+    "initialize",
+    "begin_update",
+    "finish_update",
+]
+
+
+def models_with_nvtx_range(nlp, forward_color: int, backprop_color: int):
+    pipes = [
+        pipe
+        for _, pipe in nlp.components
+        if hasattr(pipe, "is_trainable") and pipe.is_trainable
+    ]
+
+    # Walk the models of all trainable pipes jointly so that shared nodes
+    # are only wrapped once.
+    seen_models: Set[int] = set()
+    for pipe in pipes:
+        for node in pipe.model.walk():
+            if id(node) in seen_models:
+                continue
+            seen_models.add(id(node))
+            with_nvtx_range(
+                node, forward_color=forward_color, backprop_color=backprop_color
+            )
+
+    return nlp
+
+
+@registry.callbacks("spacy.models_with_nvtx_range.v1")
+def create_models_with_nvtx_range(
+    forward_color: int = -1, backprop_color: int = -1
+) -> Callable[["Language"], "Language"]:
+    return functools.partial(
+        models_with_nvtx_range,
+        forward_color=forward_color,
+        backprop_color=backprop_color,
+    )
+
+
+def nvtx_range_wrapper_for_pipe_method(self, func, *args, **kwargs):
+    if isinstance(func, functools.partial):
+        # Already wrapped: call straight through.
+        return func(*args, **kwargs)
+    else:
+        with use_nvtx_range(f"{self.name} {func.__name__}"):
+            return func(*args, **kwargs)
+
+
+def pipes_with_nvtx_range(
+    nlp, additional_pipe_functions: Optional[Dict[str, List[str]]]
+):
+    for _, pipe in nlp.components:
+        if additional_pipe_functions:
+            extra_funcs = additional_pipe_functions.get(pipe.name, [])
+        else:
+            extra_funcs = []
+
+        for name in DEFAULT_NVTX_ANNOTATABLE_PIPE_METHODS + extra_funcs:
+            func = getattr(pipe, name, None)
+            if func is None:
+                if name in extra_funcs:
+                    warnings.warn(Warnings.W121.format(method=name, pipe=pipe.name))
+                continue
+
+            wrapped_func = functools.partial(
+                types.MethodType(nvtx_range_wrapper_for_pipe_method, pipe), func
+            )
+
+            # Try to preserve the original function signature.
+            try:
+                wrapped_func.__signature__ = inspect.signature(func)  # type: ignore
+            except Exception:
+                pass
+
+            try:
+                setattr(pipe, name, wrapped_func)
+            except AttributeError:
+                warnings.warn(Warnings.W122.format(method=name, pipe=pipe.name))
+
+    return nlp
+
+
+@registry.callbacks("spacy.models_and_pipes_with_nvtx_range.v1")
+def create_models_and_pipes_with_nvtx_range(
+    forward_color: int = -1,
+    backprop_color: int = -1,
+    additional_pipe_functions: Optional[Dict[str, List[str]]] = None,
+) -> Callable[["Language"], "Language"]:
+    def inner(nlp):
+        nlp = models_with_nvtx_range(nlp, forward_color, backprop_color)
+        nlp = pipes_with_nvtx_range(nlp, additional_pipe_functions)
+        return nlp
+
+    return inner
```
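Both factories are registered callbacks, so they are normally enabled from the training config (for example under `[nlp.after_pipeline_creation]`). The sketch below instead resolves the registered factory from Python and applies the resulting callback directly, assuming a loaded pipeline:

```python
import spacy

nlp = spacy.blank("en")  # stand-in; any loaded pipeline works

# Resolve the registered factory and apply the callback it creates to `nlp`.
create_callback = spacy.registry.callbacks.get(
    "spacy.models_and_pipes_with_nvtx_range.v1"
)
nlp = create_callback()(nlp)  # wraps trainable models and pipe methods
```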
```diff
@@ -3,12 +3,13 @@ from typing import Iterable, TypeVar, TYPE_CHECKING
 from .compat import Literal
 from enum import Enum
 from pydantic import BaseModel, Field, ValidationError, validator, create_model
-from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool
+from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool, ConstrainedStr
 from pydantic.main import ModelMetaclass
 from thinc.api import Optimizer, ConfigValidationError, Model
 from thinc.config import Promise
 from collections import defaultdict
 import inspect
+import re

 from .attrs import NAMES
 from .lookups import Lookups
```
```diff
@@ -198,13 +199,18 @@ class TokenPatternNumber(BaseModel):
         return v


-class TokenPatternOperator(str, Enum):
+class TokenPatternOperatorSimple(str, Enum):
     plus: StrictStr = StrictStr("+")
-    start: StrictStr = StrictStr("*")
+    star: StrictStr = StrictStr("*")
     question: StrictStr = StrictStr("?")
     exclamation: StrictStr = StrictStr("!")


+class TokenPatternOperatorMinMax(ConstrainedStr):
+    regex = re.compile(r"^({\d+}|{\d+,\d*}|{\d*,\d+})$")
+
+
+TokenPatternOperator = Union[TokenPatternOperatorSimple, TokenPatternOperatorMinMax]
 StringValue = Union[TokenPatternString, StrictStr]
 NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat]
 UnderscoreValue = Union[
```
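The `ConstrainedStr` subclass accepts exactly the four bracketed forms and nothing else; a quick standalone check of the regex:

```python
import re

op_regex = re.compile(r"^({\d+}|{\d+,\d*}|{\d*,\d+})$")

assert op_regex.match("{3}")       # {n}
assert op_regex.match("{2,5}")     # {n,m}
assert op_regex.match("{2,}")      # {n,}
assert op_regex.match("{,5}")      # {,m}
assert not op_regex.match("{,}")   # at least one bound is required
assert not op_regex.match("{a,3}")
```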
```diff
@@ -680,3 +680,38 @@ def test_matcher_ent_iob_key(en_vocab):
     assert matches[0] == "Maria"
     assert matches[1] == "Maria Esperanza"
     assert matches[2] == "Esperanza"
+
+
+def test_matcher_min_max_operator(en_vocab):
+    # Exactly n matches: {n}
+    doc = Doc(
+        en_vocab, words=["foo", "bar", "foo", "foo", "bar",
+                         "foo", "foo", "foo", "bar", "bar"]
+    )
+    matcher = Matcher(en_vocab)
+    pattern = [{"ORTH": "foo", "OP": "{3}"}]
+    matcher.add("TEST", [pattern])
+
+    matches1 = [doc[start:end].text for _, start, end in matcher(doc)]
+    assert len(matches1) == 1
+
+    # At least n matches: {n,}
+    matcher = Matcher(en_vocab)
+    pattern = [{"ORTH": "foo", "OP": "{2,}"}]
+    matcher.add("TEST", [pattern])
+    matches2 = [doc[start:end].text for _, start, end in matcher(doc)]
+    assert len(matches2) == 4
+
+    # At most m matches: {,m}
+    matcher = Matcher(en_vocab)
+    pattern = [{"ORTH": "foo", "OP": "{,2}"}]
+    matcher.add("TEST", [pattern])
+    matches3 = [doc[start:end].text for _, start, end in matcher(doc)]
+    assert len(matches3) == 9
+
+    # At least n and at most m matches: {n,m}
+    matcher = Matcher(en_vocab)
+    pattern = [{"ORTH": "foo", "OP": "{2,3}"}]
+    matcher.add("TEST", [pattern])
+    matches4 = [doc[start:end].text for _, start, end in matcher(doc)]
+    assert len(matches4) == 4
```
```diff
@@ -699,6 +699,10 @@ def test_matcher_with_alignments_greedy_longest(en_vocab):
         ("aaaa", "a a a a a?", [0, 1, 2, 3]),
         ("aaab", "a+ a b", [0, 0, 1, 2]),
         ("aaab", "a+ a+ b", [0, 0, 1, 2]),
+        ("aaab", "a{2,} b", [0, 0, 0, 1]),
+        ("aaab", "a{,3} b", [0, 0, 0, 1]),
+        ("aaab", "a{2} b", [0, 0, 1]),
+        ("aaab", "a{2,3} b", [0, 0, 0, 1]),
     ]
     for string, pattern_str, result in cases:
         matcher = Matcher(en_vocab)
@@ -711,6 +715,8 @@ def test_matcher_with_alignments_greedy_longest(en_vocab):
                 pattern.append({"ORTH": part[0], "OP": "*"})
             elif part.endswith("?"):
                 pattern.append({"ORTH": part[0], "OP": "?"})
+            elif part.endswith("}"):
+                pattern.append({"ORTH": part[0], "OP": part[1:]})
             else:
                 pattern.append({"ORTH": part})
         matcher.add("PATTERN", [pattern], greedy="LONGEST")
@@ -722,7 +728,7 @@ def test_matcher_with_alignments_greedy_longest(en_vocab):
             assert expected == result, (string, pattern_str, s, e, n_matches)


-def test_matcher_with_alignments_nongreedy(en_vocab):
+def test_matcher_with_alignments_non_greedy(en_vocab):
     cases = [
         (0, "aaab", "a* b", [[0, 1], [0, 0, 1], [0, 0, 0, 1], [1]]),
         (1, "baab", "b a* b", [[0, 1, 1, 2]]),
```
```diff
@@ -752,6 +758,10 @@ def test_matcher_with_alignments_nongreedy(en_vocab):
         (15, "aaaa", "a a a a a?", [[0, 1, 2, 3]]),
         (16, "aaab", "a+ a b", [[0, 1, 2], [0, 0, 1, 2]]),
         (17, "aaab", "a+ a+ b", [[0, 1, 2], [0, 0, 1, 2]]),
+        (18, "aaab", "a{2,} b", [[0, 0, 1], [0, 0, 0, 1]]),
+        (19, "aaab", "a{3} b", [[0, 0, 0, 1]]),
+        (20, "aaab", "a{2} b", [[0, 0, 1]]),
+        (21, "aaab", "a{2,3} b", [[0, 0, 1], [0, 0, 0, 1]]),
     ]
     for case_id, string, pattern_str, results in cases:
         matcher = Matcher(en_vocab)
@@ -764,6 +774,8 @@ def test_matcher_with_alignments_nongreedy(en_vocab):
                 pattern.append({"ORTH": part[0], "OP": "*"})
             elif part.endswith("?"):
                 pattern.append({"ORTH": part[0], "OP": "?"})
+            elif part.endswith("}"):
+                pattern.append({"ORTH": part[0], "OP": part[1:]})
             else:
                 pattern.append({"ORTH": part})
```
@ -14,6 +14,14 @@ TEST_PATTERNS = [
|
||||||
('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1),
|
('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1),
|
||||||
([{"ENT_IOB": "foo"}], 1, 1),
|
([{"ENT_IOB": "foo"}], 1, 1),
|
||||||
([1, 2, 3], 3, 1),
|
([1, 2, 3], 3, 1),
|
||||||
|
([{"TEXT": "foo", "OP": "{,}"}], 1, 1),
|
||||||
|
([{"TEXT": "foo", "OP": "{,4}4"}], 1, 1),
|
||||||
|
([{"TEXT": "foo", "OP": "{a,3}"}], 1, 1),
|
||||||
|
([{"TEXT": "foo", "OP": "{a}"}], 1, 1),
|
||||||
|
([{"TEXT": "foo", "OP": "{,a}"}], 1, 1),
|
||||||
|
([{"TEXT": "foo", "OP": "{1,2,3}"}], 1, 1),
|
||||||
|
([{"TEXT": "foo", "OP": "{1, 3}"}], 1, 1),
|
||||||
|
([{"TEXT": "foo", "OP": "{-2}"}], 1, 1),
|
||||||
# Bad patterns flagged outside of Matcher
|
# Bad patterns flagged outside of Matcher
|
||||||
([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0), # prev: (1, 0)
|
([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 2, 0), # prev: (1, 0)
|
||||||
# Bad patterns not flagged with minimal checks
|
# Bad patterns not flagged with minimal checks
|
||||||
|
@ -38,6 +46,7 @@ TEST_PATTERNS = [
|
||||||
([{"SENT_START": True}], 0, 0),
|
([{"SENT_START": True}], 0, 0),
|
||||||
([{"ENT_ID": "STRING"}], 0, 0),
|
([{"ENT_ID": "STRING"}], 0, 0),
|
||||||
([{"ENT_KB_ID": "STRING"}], 0, 0),
|
([{"ENT_KB_ID": "STRING"}], 0, 0),
|
||||||
|
([{"TEXT": "ha", "OP": "{3}"}], 0, 0),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
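These malformed quantifiers are rejected by the schema above when pattern validation is enabled; a minimal sketch:

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab, validate=True)
try:
    matcher.add("BAD", [[{"TEXT": "foo", "OP": "{a,3}"}]])
except Exception as err:  # spaCy raises a MatchPatternError here
    print(err)
```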
```diff
@@ -60,12 +60,11 @@ def test_readers():
     assert isinstance(extra_corpus, Callable)


-# TODO: enable IMDB test once Stanford servers are back up and running
 @pytest.mark.slow
 @pytest.mark.parametrize(
     "reader,additional_config",
     [
-        # ("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 10}),
+        ("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 10}),
         ("ml_datasets.dbpedia.v1", {"train_limit": 10, "dev_limit": 10}),
         ("ml_datasets.cmu_movies.v1", {"limit": 10, "freq_cutoff": 200, "split": 0.8}),
     ],
```
```diff
@@ -59,15 +59,20 @@ matched:
 > [
 >   {"POS": "ADJ", "OP": "*"},
 >   {"POS": "NOUN", "OP": "+"},
+>   {"POS": "PROPN", "OP": "{2}"}
 > ]
 > ```

-| OP  | Description                                                        |
-| --- | ------------------------------------------------------------------ |
+| OP      | Description                                                              |
+| ------- | ------------------------------------------------------------------------ |
 | `!`     | Negate the pattern, by requiring it to match exactly 0 times.            |
 | `?`     | Make the pattern optional, by allowing it to match 0 or 1 times.         |
 | `+`     | Require the pattern to match 1 or more times.                            |
 | `*`     | Allow the pattern to match 0 or more times.                              |
+| `{n}`   | Require the pattern to match exactly _n_ times.                          |
+| `{n,m}` | Require the pattern to match at least _n_ but not more than _m_ times.   |
+| `{n,}`  | Require the pattern to match at least _n_ times.                         |
+| `{,m}`  | Require the pattern to match at most _m_ times.                          |

 Token patterns can also map to a **dictionary of properties** instead of a
 single value to indicate whether the expected value is a member of a list or how
```
@ -375,11 +375,15 @@ scoped quantifiers – instead, you can build those behaviors with `on_match`
|
||||||
callbacks.
|
callbacks.
|
||||||
|
|
||||||
| OP | Description |
|
| OP | Description |
|
||||||
| --- | ---------------------------------------------------------------- |
|
|---------|------------------------------------------------------------------------|
|
||||||
| `!` | Negate the pattern, by requiring it to match exactly 0 times. |
|
| `!` | Negate the pattern, by requiring it to match exactly 0 times. |
|
||||||
| `?` | Make the pattern optional, by allowing it to match 0 or 1 times. |
|
| `?` | Make the pattern optional, by allowing it to match 0 or 1 times. |
|
||||||
| `+` | Require the pattern to match 1 or more times. |
|
| `+` | Require the pattern to match 1 or more times. |
|
||||||
| `*` | Allow the pattern to match zero or more times. |
|
| `*` | Allow the pattern to match zero or more times. |
|
||||||
|
| `{n}` | Require the pattern to match exactly _n_ times. |
|
||||||
|
| `{n,m}` | Require the pattern to match at least _n_ but not more than _m_ times. |
|
||||||
|
| `{n,}` | Require the pattern to match at least _n_ times. |
|
||||||
|
| `{,m}` | Require the pattern to match at most _m_ times. |
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
|
|
```diff
@@ -24,6 +24,8 @@ const CUDA = {
     '11.3': 'cuda113',
     '11.4': 'cuda114',
     '11.5': 'cuda115',
+    '11.6': 'cuda116',
+    '11.7': 'cuda117',
 }
 const LANG_EXTRAS = ['ja'] // only for languages with models
```