Update v2.2.x for bugfix release (#6384)

* Fix on_match callback and remove empty patterns (#6312)

For the `DependencyMatcher`:

* Fix on_match callback so that it is called once per matched pattern
* Fix results so that patterns with empty match lists are not returned

* Add --prefer-binary for Python 3.5

* Add version pins for pyrsistent

* Use backwards-compatible super()

* Try to fix tests on Travis (2.7)

* Fix naming conflict and formatting

* Update pkuseg version in Chinese tokenizer warnings

* Some changes for Armenian (#5616)

* Fix numerals

* We need an Armenian question mark to make the sentence a question

* Update lex_attrs.py (#5608)

* Fix compat

* Update Armenian from v2.3.x

Co-authored-by: Ines Montani <ines@ines.io>
Co-authored-by: Karen Hambardzumyan <mahnerak@gmail.com>
Co-authored-by: Marat M. Yavrumyan <myavrum@ysu.am>
Adriane Boyd committed 2020-11-14 09:20:42 +01:00
parent a41e28ceba
commit ada4fc0f09
15 changed files with 209 additions and 383 deletions


@@ -80,8 +80,8 @@ jobs:
     architecture: 'x64'
   - script: |
-      python -m pip install -U setuptools
-      pip install -r requirements.txt
+      python -m pip install -U pip setuptools
+      pip install -r requirements.txt --prefer-binary
     displayName: 'Install dependencies'
   - script: |
@@ -96,7 +96,7 @@ jobs:
   - bash: |
      SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
-      pip install dist/$SDIST
+      pip install dist/$SDIST --prefer-binary
    displayName: 'Install from sdist'
  - script: python -m pytest --pyargs spacy


@@ -14,6 +14,7 @@ plac>=0.9.6,<1.2.0
 pathlib==1.0.1; python_version < "3.4"
 tqdm>=4.38.0,<5.0.0
 # Optional dependencies
+pyrsistent<0.17.0
 jsonschema>=2.6.0,<3.1.0
 # Development dependencies
 cython>=0.25


@@ -7,8 +7,8 @@ def add_codes(err_cls):
     class ErrorsWithCodes(err_cls):
         def __getattribute__(self, code):
-            msg = super().__getattribute__(code)
-            if code.startswith('__'):  # python system attributes like __class__
+            msg = super(ErrorsWithCodes, self).__getattribute__(code)
+            if code.startswith("__"):  # python system attributes like __class__
                 return msg
             else:
                 return "[{code}] {msg}".format(code=code, msg=msg)


@@ -1,11 +1,12 @@
 # coding: utf8
 from __future__ import unicode_literals

 from .stop_words import STOP_WORDS
+from .lex_attrs import LEX_ATTRS
 from .tag_map import TAG_MAP

 from ...attrs import LANG
 from ...language import Language
 from ...tokens import Doc


 class ArmenianDefaults(Language.Defaults):


@@ -1,6 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals

 """
 Example sentences to test spaCy and its language models.

 >>> from spacy.lang.hy.examples import sentences
@@ -11,6 +11,6 @@ Example sentences to test spaCy and its language models.
 sentences = [
     "Լոնդոնը Միացյալ Թագավորության մեծ քաղաք է։",
     "Ո՞վ է Ֆրանսիայի նախագահը։",
-    "Որն է Միացյալ Նահանգների մայրաքաղաքը։",
+    "Ո՞րն է Միացյալ Նահանգների մայրաքաղաքը։",
     "Ե՞րբ է ծնվել Բարաք Օբաման։",
 ]


@@ -1,11 +1,12 @@
 # coding: utf8
 from __future__ import unicode_literals

 from ...attrs import LIKE_NUM


 _num_words = [
-    "զրօ",
-    "մէկ",
+    "զրո",
+    "մեկ",
     "երկու",
     "երեք",
     "չորս",
@@ -17,20 +18,21 @@ _num_words = [
     "տասը",
     "տասնմեկ",
     "տասներկու",
-    "տասն­երեք",
-    "տասն­չորս",
-    "տասն­հինգ",
-    "տասն­վեց",
-    "տասն­յոթ",
-    "տասն­ութ",
-    "տասն­ինը",
-    "քսան" "երեսուն",
+    "տասներեք",
+    "տասնչորս",
+    "տասնհինգ",
+    "տասնվեց",
+    "տասնյոթ",
+    "տասնութ",
+    "տասնինը",
+    "քսան",
+    "երեսուն",
     "քառասուն",
     "հիսուն",
-    "վաթցսուն",
+    "վաթսուն",
     "յոթանասուն",
     "ութսուն",
-    "ինիսուն",
+    "իննսուն",
     "հարյուր",
     "հազար",
     "միլիոն",


@@ -1,6 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals

 STOP_WORDS = set(
     """
 նա
@@ -105,6 +105,6 @@ STOP_WORDS = set(
 յուրաքանչյուր
 այս
 մեջ
-թ
+թ
 """.split()
 )

File diff suppressed because it is too large.


@@ -14,7 +14,7 @@ class PolishLemmatizer(Lemmatizer):
     # lemmatization for nouns
     def __init__(self, lookups, *args, **kwargs):
         # this lemmatizer is lookup based, so it does not require an index, exceptionlist, or rules
-        super().__init__(lookups)
+        super(PolishLemmatizer, self).__init__(lookups)
         self.lemma_lookups = {}
         for tag in [
             "ADJ",


@@ -16,7 +16,7 @@ from .tag_map import TAG_MAP
 from ... import util

-_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.22` or from https://github.com/lancopku/pkuseg-python"
+_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.25` or from https://github.com/lancopku/pkuseg-python"

 def try_jieba_import(use_jieba):
@@ -109,6 +109,7 @@ class ChineseTokenizer(DummyTokenizer):
         if reset:
             try:
                 import pkuseg
+                self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
             except ImportError:
                 if self.use_pkuseg:
@@ -118,7 +119,7 @@ class ChineseTokenizer(DummyTokenizer):
                     )
                     raise ImportError(msg)
         for word in words:
-            self.pkuseg_seg.preprocesser.insert(word.strip(), '')
+            self.pkuseg_seg.preprocesser.insert(word.strip(), "")

     def _get_config(self):
         config = OrderedDict(
@@ -168,21 +169,16 @@ class ChineseTokenizer(DummyTokenizer):
         return util.to_bytes(serializers, [])

     def from_bytes(self, data, **kwargs):
-        pkuseg_features_b = b""
-        pkuseg_weights_b = b""
-        pkuseg_processors_data = None
+        pkuseg_data = {"features_b": b"", "weights_b": b"", "processors_data": None}

         def deserialize_pkuseg_features(b):
-            nonlocal pkuseg_features_b
-            pkuseg_features_b = b
+            pkuseg_data["features_b"] = b

         def deserialize_pkuseg_weights(b):
-            nonlocal pkuseg_weights_b
-            pkuseg_weights_b = b
+            pkuseg_data["weights_b"] = b

         def deserialize_pkuseg_processors(b):
-            nonlocal pkuseg_processors_data
-            pkuseg_processors_data = srsly.msgpack_loads(b)
+            pkuseg_data["processors_data"] = srsly.msgpack_loads(b)

         deserializers = OrderedDict(
             (
@@ -194,13 +190,13 @@ class ChineseTokenizer(DummyTokenizer):
         )
         util.from_bytes(data, deserializers, [])

-        if pkuseg_features_b and pkuseg_weights_b:
+        if pkuseg_data["features_b"] and pkuseg_data["weights_b"]:
             with tempfile.TemporaryDirectory() as tempdir:
                 tempdir = Path(tempdir)
                 with open(tempdir / "features.pkl", "wb") as fileh:
-                    fileh.write(pkuseg_features_b)
+                    fileh.write(pkuseg_data["features_b"])
                 with open(tempdir / "weights.npz", "wb") as fileh:
-                    fileh.write(pkuseg_weights_b)
+                    fileh.write(pkuseg_data["weights_b"])
                 try:
                     import pkuseg
                 except ImportError:
@@ -209,13 +205,9 @@ class ChineseTokenizer(DummyTokenizer):
                         + _PKUSEG_INSTALL_MSG
                     )
                 self.pkuseg_seg = pkuseg.pkuseg(str(tempdir))
-            if pkuseg_processors_data:
-                (
-                    user_dict,
-                    do_process,
-                    common_words,
-                    other_words,
-                ) = pkuseg_processors_data
+            if pkuseg_data["processors_data"]:
+                processors_data = pkuseg_data["processors_data"]
+                (user_dict, do_process, common_words, other_words) = processors_data
                 self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
                 self.pkuseg_seg.postprocesser.do_process = do_process
                 self.pkuseg_seg.postprocesser.common_words = set(common_words)
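The from_bytes() rewrite above has the same motivation as the super() changes: nonlocal is Python 3-only syntax, so the deserializer closures now write into a shared dict, which also works on Python 2.7. The pattern in isolation (standalone sketch, not from the diff):

    # A mutable dict gives inner functions write access without nonlocal
    state = {"value": None}


    def set_value(v):
        state["value"] = v  # mutating the dict needs no nonlocal declaration


    set_value(42)
    assert state["value"] == 42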


@@ -235,12 +235,12 @@ cdef class DependencyMatcher:
                 matched_trees = []
                 self.recurse(_tree,id_to_position,_node_operator_map,0,[],matched_trees)
-                matched_key_trees.append((key,matched_trees))
-
-                for i, (ent_id, nodes) in enumerate(matched_key_trees):
-                    on_match = self._callbacks.get(ent_id)
-                    if on_match is not None:
-                        on_match(self, doc, i, matched_key_trees)
+                if len(matched_trees) > 0:
+                    matched_key_trees.append((key,matched_trees))
+
+        for i, (ent_id, nodes) in enumerate(matched_key_trees):
+            on_match = self._callbacks.get(ent_id)
+            if on_match is not None:
+                on_match(self, doc, i, matched_key_trees)
         return matched_key_trees

     def recurse(self,tree,id_to_position,_node_operator_map,int patternLength,visitedNodes,matched_trees):
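Net effect of this hunk: a pattern whose match list is empty is no longer appended to matched_key_trees, and the callback loop has moved out of the per-pattern loop, so each on_match fires once over the final results. A hedged usage sketch (assumes a parser-enabled model such as en_core_web_sm is installed; the key name and callback are illustrative, the SPEC/PATTERN format is the one used in the tests below):

    import spacy
    from spacy.matcher import DependencyMatcher

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("The quick brown fox jumped over the lazy fox")

    pattern = [
        {"SPEC": {"NODE_NAME": "fox"}, "PATTERN": {"ORTH": "fox"}},
        {
            "SPEC": {"NODE_NAME": "q", "NBOR_RELOP": ">", "NBOR_NAME": "fox"},
            "PATTERN": {"ORTH": "quick"},
        },
    ]

    def on_match(matcher, doc, i, matches):
        # after this fix: called once per matched pattern, and `matches`
        # contains no (key, []) entries for patterns that did not match
        print("match", i, matches[i])

    matcher = DependencyMatcher(nlp.vocab)
    matcher.add("quick_fox", [pattern], on_match=on_match)
    matches = matcher(doc)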


@@ -1,3 +1,4 @@
+# coding: utf8
 from __future__ import unicode_literals

 import pytest


@@ -1,3 +1,4 @@
+# coding: utf-8
 from __future__ import unicode_literals

 import pytest


@@ -7,6 +7,7 @@ from mock import Mock
 from spacy.matcher import Matcher, DependencyMatcher
 from spacy.tokens import Doc, Token
 from ..doc.test_underscore import clean_underscore  # noqa: F401
+from ..util import get_doc


 @pytest.fixture
@@ -301,22 +302,6 @@ def test_matcher_extension_set_membership(en_vocab):
     assert len(matches) == 0


-@pytest.fixture
-def text():
-    return "The quick brown fox jumped over the lazy fox"
-
-
-@pytest.fixture
-def heads():
-    return [3, 2, 1, 1, 0, -1, 2, 1, -3]
-
-
-@pytest.fixture
-def deps():
-    return ["det", "amod", "amod", "nsubj", "prep", "pobj", "det", "amod"]
-
-
 @pytest.fixture
 def dependency_matcher(en_vocab):
     def is_brown_yellow(text):
         return bool(re.compile(r"brown|yellow|over").match(text))
@@ -359,24 +344,40 @@ def dependency_matcher(en_vocab):
         },
     ]

+    # pattern that doesn't match
+    pattern4 = [
+        {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "NOMATCH"}},
+        {
+            "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
+            "PATTERN": {"ORTH": "fox"},
+        },
+        {
+            "SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">>", "NBOR_NAME": "fox"},
+            "PATTERN": {"ORTH": "brown"},
+        },
+    ]
+
     matcher = DependencyMatcher(en_vocab)
-    matcher.add("pattern1", [pattern1])
-    matcher.add("pattern2", [pattern2])
-    matcher.add("pattern3", [pattern3])
+    on_match = Mock()
+    matcher.add("pattern1", [pattern1], on_match=on_match)
+    matcher.add("pattern2", [pattern2], on_match=on_match)
+    matcher.add("pattern3", [pattern3], on_match=on_match)
+    matcher.add("pattern4", [pattern4], on_match=on_match)
     return matcher


 def test_dependency_matcher_compile(dependency_matcher):
-    assert len(dependency_matcher) == 3
-
-
-# def test_dependency_matcher(dependency_matcher, text, heads, deps):
-#     doc = get_doc(dependency_matcher.vocab, text.split(), heads=heads, deps=deps)
-#     matches = dependency_matcher(doc)
-#     assert matches[0][1] == [[3, 1, 2]]
-#     assert matches[1][1] == [[4, 3, 3]]
-#     assert matches[2][1] == [[4, 3, 2]]
+    assert len(dependency_matcher) == 4
+    text = "The quick brown fox jumped over the lazy fox"
+    heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
+    deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "pobj", "det", "amod"]
+    doc = get_doc(dependency_matcher.vocab, text.split(), heads=heads, deps=deps)
+    matches = dependency_matcher(doc)
+    assert len(matches) == 3
+    assert matches[0][1] == [[3, 1, 2]]
+    assert matches[1][1] == [[4, 3, 3]]
+    assert matches[2][1] == [[4, 3, 2]]
+    assert on_match.call_count == 3


 def test_matcher_basic_check(en_vocab):


@@ -1,3 +1,5 @@
+# coding: utf8
+from __future__ import unicode_literals

 from spacy.lang.en import English