Update v2.2.x for bugfix release (#6384)

* Fix on_match callback and remove empty patterns (#6312)

For the `DependencyMatcher`:

* Fix on_match callback so that it is called once per matched pattern
* Fix results so that patterns with empty match lists are not returned

* Add --prefer-binary for python 3.5

* Add version pins for pyrsistent

* Use backwards-compatible super()

* Try to fix tests on Travis (2.7)

* Fix naming conflict and formatting

* Update pkuseg version in Chinese tokenizer warnings

* Some changes for Armenian (#5616)

* Fixing numerals

* We need an Armenian question mark to make the sentence a question

* Update lex_attrs.py (#5608)

* Fix compat

* Update Armenian from v2.3.x

Co-authored-by: Ines Montani <ines@ines.io>
Co-authored-by: Karen Hambardzumyan <mahnerak@gmail.com>
Co-authored-by: Marat M. Yavrumyan <myavrum@ysu.am>
This commit is contained in:
Adriane Boyd 2020-11-14 09:20:42 +01:00 committed by GitHub
parent a41e28ceba
commit ada4fc0f09
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 209 additions and 383 deletions

View File

@ -80,8 +80,8 @@ jobs:
architecture: 'x64' architecture: 'x64'
- script: | - script: |
python -m pip install -U setuptools python -m pip install -U pip setuptools
pip install -r requirements.txt pip install -r requirements.txt --prefer-binary
displayName: 'Install dependencies' displayName: 'Install dependencies'
- script: | - script: |
@ -96,7 +96,7 @@ jobs:
- bash: | - bash: |
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
pip install dist/$SDIST pip install dist/$SDIST --prefer-binary
displayName: 'Install from sdist' displayName: 'Install from sdist'
- script: python -m pytest --pyargs spacy - script: python -m pytest --pyargs spacy

View File

@ -14,6 +14,7 @@ plac>=0.9.6,<1.2.0
pathlib==1.0.1; python_version < "3.4" pathlib==1.0.1; python_version < "3.4"
tqdm>=4.38.0,<5.0.0 tqdm>=4.38.0,<5.0.0
# Optional dependencies # Optional dependencies
pyrsistent<0.17.0
jsonschema>=2.6.0,<3.1.0 jsonschema>=2.6.0,<3.1.0
# Development dependencies # Development dependencies
cython>=0.25 cython>=0.25

View File

@ -7,8 +7,8 @@ def add_codes(err_cls):
class ErrorsWithCodes(err_cls): class ErrorsWithCodes(err_cls):
def __getattribute__(self, code): def __getattribute__(self, code):
msg = super().__getattribute__(code) msg = super(ErrorsWithCodes, self).__getattribute__(code)
if code.startswith('__'): # python system attributes like __class__ if code.startswith("__"): # python system attributes like __class__
return msg return msg
else: else:
return "[{code}] {msg}".format(code=code, msg=msg) return "[{code}] {msg}".format(code=code, msg=msg)

View File

@ -1,11 +1,12 @@
# coding: utf8
from __future__ import unicode_literals
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .tag_map import TAG_MAP from .tag_map import TAG_MAP
from ...attrs import LANG from ...attrs import LANG
from ...language import Language from ...language import Language
from ...tokens import Doc
class ArmenianDefaults(Language.Defaults): class ArmenianDefaults(Language.Defaults):

View File

@ -1,6 +1,6 @@
# coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
""" """
Example sentences to test spaCy and its language models. Example sentences to test spaCy and its language models.
>>> from spacy.lang.hy.examples import sentences >>> from spacy.lang.hy.examples import sentences
@ -11,6 +11,6 @@ Example sentences to test spaCy and its language models.
sentences = [ sentences = [
"Լոնդոնը Միացյալ Թագավորության մեծ քաղաք է։", "Լոնդոնը Միացյալ Թագավորության մեծ քաղաք է։",
"Ո՞վ է Ֆրանսիայի նախագահը։", "Ո՞վ է Ֆրանսիայի նախագահը։",
"Որն է Միացյալ Նահանգների մայրաքաղաքը։", "Ո՞րն է Միացյալ Նահանգների մայրաքաղաքը։",
"Ե՞րբ է ծնվել Բարաք Օբաման։", "Ե՞րբ է ծնվել Բարաք Օբաման։",
] ]

View File

@ -1,11 +1,12 @@
# coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from ...attrs import LIKE_NUM from ...attrs import LIKE_NUM
_num_words = [ _num_words = [
"զրօ", "զրո",
"մէկ", "մեկ",
"երկու", "երկու",
"երեք", "երեք",
"չորս", "չորս",
@ -17,20 +18,21 @@ _num_words = [
"տասը", "տասը",
"տասնմեկ", "տասնմեկ",
"տասներկու", "տասներկու",
"տասն­երեք", "տասներեք",
"տասն­չորս", "տասնչորս",
"տասն­հինգ", "տասնհինգ",
"տասն­վեց", "տասնվեց",
"տասն­յոթ", "տասնյոթ",
"տասն­ութ", "տասնութ",
"տասն­ինը", "տասնինը",
"քսան" "երեսուն", "քսան",
"երեսուն",
"քառասուն", "քառասուն",
"հիսուն", "հիսուն",
"վաթցսուն", "վաթսուն",
"յոթանասուն", "յոթանասուն",
"ութսուն", "ութսուն",
"ինիսուն", "իննսուն",
"հարյուր", "հարյուր",
"հազար", "հազար",
"միլիոն", "միլիոն",

View File

@ -1,6 +1,6 @@
# coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
STOP_WORDS = set( STOP_WORDS = set(
""" """
նա նա

File diff suppressed because it is too large Load Diff

View File

@ -14,7 +14,7 @@ class PolishLemmatizer(Lemmatizer):
# lemmatization for nouns # lemmatization for nouns
def __init__(self, lookups, *args, **kwargs): def __init__(self, lookups, *args, **kwargs):
# this lemmatizer is lookup based, so it does not require an index, exceptionlist, or rules # this lemmatizer is lookup based, so it does not require an index, exceptionlist, or rules
super().__init__(lookups) super(PolishLemmatizer, self).__init__(lookups)
self.lemma_lookups = {} self.lemma_lookups = {}
for tag in [ for tag in [
"ADJ", "ADJ",

View File

@ -16,7 +16,7 @@ from .tag_map import TAG_MAP
from ... import util from ... import util
_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.22` or from https://github.com/lancopku/pkuseg-python" _PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.25` or from https://github.com/lancopku/pkuseg-python"
def try_jieba_import(use_jieba): def try_jieba_import(use_jieba):
@ -109,6 +109,7 @@ class ChineseTokenizer(DummyTokenizer):
if reset: if reset:
try: try:
import pkuseg import pkuseg
self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None) self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
except ImportError: except ImportError:
if self.use_pkuseg: if self.use_pkuseg:
@ -118,7 +119,7 @@ class ChineseTokenizer(DummyTokenizer):
) )
raise ImportError(msg) raise ImportError(msg)
for word in words: for word in words:
self.pkuseg_seg.preprocesser.insert(word.strip(), '') self.pkuseg_seg.preprocesser.insert(word.strip(), "")
def _get_config(self): def _get_config(self):
config = OrderedDict( config = OrderedDict(
@ -168,21 +169,16 @@ class ChineseTokenizer(DummyTokenizer):
return util.to_bytes(serializers, []) return util.to_bytes(serializers, [])
def from_bytes(self, data, **kwargs): def from_bytes(self, data, **kwargs):
pkuseg_features_b = b"" pkuseg_data = {"features_b": b"", "weights_b": b"", "processors_data": None}
pkuseg_weights_b = b""
pkuseg_processors_data = None
def deserialize_pkuseg_features(b): def deserialize_pkuseg_features(b):
nonlocal pkuseg_features_b pkuseg_data["features_b"] = b
pkuseg_features_b = b
def deserialize_pkuseg_weights(b): def deserialize_pkuseg_weights(b):
nonlocal pkuseg_weights_b pkuseg_data["weights_b"] = b
pkuseg_weights_b = b
def deserialize_pkuseg_processors(b): def deserialize_pkuseg_processors(b):
nonlocal pkuseg_processors_data pkuseg_data["processors_data"] = srsly.msgpack_loads(b)
pkuseg_processors_data = srsly.msgpack_loads(b)
deserializers = OrderedDict( deserializers = OrderedDict(
( (
@ -194,13 +190,13 @@ class ChineseTokenizer(DummyTokenizer):
) )
util.from_bytes(data, deserializers, []) util.from_bytes(data, deserializers, [])
if pkuseg_features_b and pkuseg_weights_b: if pkuseg_data["features_b"] and pkuseg_data["weights_b"]:
with tempfile.TemporaryDirectory() as tempdir: with tempfile.TemporaryDirectory() as tempdir:
tempdir = Path(tempdir) tempdir = Path(tempdir)
with open(tempdir / "features.pkl", "wb") as fileh: with open(tempdir / "features.pkl", "wb") as fileh:
fileh.write(pkuseg_features_b) fileh.write(pkuseg_data["features_b"])
with open(tempdir / "weights.npz", "wb") as fileh: with open(tempdir / "weights.npz", "wb") as fileh:
fileh.write(pkuseg_weights_b) fileh.write(pkuseg_data["weights_b"])
try: try:
import pkuseg import pkuseg
except ImportError: except ImportError:
@ -209,13 +205,9 @@ class ChineseTokenizer(DummyTokenizer):
+ _PKUSEG_INSTALL_MSG + _PKUSEG_INSTALL_MSG
) )
self.pkuseg_seg = pkuseg.pkuseg(str(tempdir)) self.pkuseg_seg = pkuseg.pkuseg(str(tempdir))
if pkuseg_processors_data: if pkuseg_data["processors_data"]:
( processors_data = pkuseg_data["processors_data"]
user_dict, (user_dict, do_process, common_words, other_words) = processors_data
do_process,
common_words,
other_words,
) = pkuseg_processors_data
self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict) self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
self.pkuseg_seg.postprocesser.do_process = do_process self.pkuseg_seg.postprocesser.do_process = do_process
self.pkuseg_seg.postprocesser.common_words = set(common_words) self.pkuseg_seg.postprocesser.common_words = set(common_words)

View File

@ -235,8 +235,8 @@ cdef class DependencyMatcher:
matched_trees = [] matched_trees = []
self.recurse(_tree,id_to_position,_node_operator_map,0,[],matched_trees) self.recurse(_tree,id_to_position,_node_operator_map,0,[],matched_trees)
if len(matched_trees) > 0:
matched_key_trees.append((key,matched_trees)) matched_key_trees.append((key,matched_trees))
for i, (ent_id, nodes) in enumerate(matched_key_trees): for i, (ent_id, nodes) in enumerate(matched_key_trees):
on_match = self._callbacks.get(ent_id) on_match = self._callbacks.get(ent_id)
if on_match is not None: if on_match is not None:

View File

@ -1,3 +1,4 @@
# coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest import pytest

View File

@ -1,3 +1,4 @@
# coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest import pytest

View File

@ -7,6 +7,7 @@ from mock import Mock
from spacy.matcher import Matcher, DependencyMatcher from spacy.matcher import Matcher, DependencyMatcher
from spacy.tokens import Doc, Token from spacy.tokens import Doc, Token
from ..doc.test_underscore import clean_underscore # noqa: F401 from ..doc.test_underscore import clean_underscore # noqa: F401
from ..util import get_doc
@pytest.fixture @pytest.fixture
@ -301,22 +302,6 @@ def test_matcher_extension_set_membership(en_vocab):
assert len(matches) == 0 assert len(matches) == 0
@pytest.fixture
def text():
return "The quick brown fox jumped over the lazy fox"
@pytest.fixture
def heads():
return [3, 2, 1, 1, 0, -1, 2, 1, -3]
@pytest.fixture
def deps():
return ["det", "amod", "amod", "nsubj", "prep", "pobj", "det", "amod"]
@pytest.fixture
def dependency_matcher(en_vocab): def dependency_matcher(en_vocab):
def is_brown_yellow(text): def is_brown_yellow(text):
return bool(re.compile(r"brown|yellow|over").match(text)) return bool(re.compile(r"brown|yellow|over").match(text))
@ -359,24 +344,40 @@ def dependency_matcher(en_vocab):
}, },
] ]
# pattern that doesn't match
pattern4 = [
{"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "NOMATCH"}},
{
"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
"PATTERN": {"ORTH": "fox"},
},
{
"SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">>", "NBOR_NAME": "fox"},
"PATTERN": {"ORTH": "brown"},
},
]
matcher = DependencyMatcher(en_vocab) matcher = DependencyMatcher(en_vocab)
matcher.add("pattern1", [pattern1]) on_match = Mock()
matcher.add("pattern2", [pattern2]) matcher.add("pattern1", [pattern1], on_match=on_match)
matcher.add("pattern3", [pattern3]) matcher.add("pattern2", [pattern2], on_match=on_match)
matcher.add("pattern3", [pattern3], on_match=on_match)
matcher.add("pattern4", [pattern4], on_match=on_match)
return matcher assert len(dependency_matcher) == 4
text = "The quick brown fox jumped over the lazy fox"
heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "pobj", "det", "amod"]
def test_dependency_matcher_compile(dependency_matcher): doc = get_doc(dependency_matcher.vocab, text.split(), heads=heads, deps=deps)
assert len(dependency_matcher) == 3 matches = dependency_matcher(doc)
assert len(matches) == 3
# def test_dependency_matcher(dependency_matcher, text, heads, deps): assert matches[0][1] == [[3, 1, 2]]
# doc = get_doc(dependency_matcher.vocab, text.split(), heads=heads, deps=deps) assert matches[1][1] == [[4, 3, 3]]
# matches = dependency_matcher(doc) assert matches[2][1] == [[4, 3, 2]]
# assert matches[0][1] == [[3, 1, 2]] assert on_match.call_count == 3
# assert matches[1][1] == [[4, 3, 3]]
# assert matches[2][1] == [[4, 3, 2]]
def test_matcher_basic_check(en_vocab): def test_matcher_basic_check(en_vocab):

View File

@ -1,3 +1,5 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.lang.en import English from spacy.lang.en import English