mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-28 02:04:07 +03:00
Update v2.2.x for bugfix release (#6384)
* Fix on_match callback and remove empty patterns (#6312) For the `DependencyMatcher`: * Fix on_match callback so that it is called once per matched pattern * Fix results so that patterns with empty match lists are not returned * Add --prefer-binary for python 3.5 * Add version pins for pyrsistent * Use backwards-compatible super() * Try to fix tests on Travis (2.7) * Fix naming conflict and formatting * Update pkuseg version in Chinese tokenizer warnings * Some changes for Armenian (#5616) * Fixing numerals * We need an Armenian question sign to make the sentence a question * Update lex_attrs.py (#5608) * Fix compat * Update Armenian from v2.3.x Co-authored-by: Ines Montani <ines@ines.io> Co-authored-by: Karen Hambardzumyan <mahnerak@gmail.com> Co-authored-by: Marat M. Yavrumyan <myavrum@ysu.am>
This commit is contained in:
parent
a41e28ceba
commit
ada4fc0f09
|
@ -80,8 +80,8 @@ jobs:
|
||||||
architecture: 'x64'
|
architecture: 'x64'
|
||||||
|
|
||||||
- script: |
|
- script: |
|
||||||
python -m pip install -U setuptools
|
python -m pip install -U pip setuptools
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt --prefer-binary
|
||||||
displayName: 'Install dependencies'
|
displayName: 'Install dependencies'
|
||||||
|
|
||||||
- script: |
|
- script: |
|
||||||
|
@ -96,7 +96,7 @@ jobs:
|
||||||
|
|
||||||
- bash: |
|
- bash: |
|
||||||
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
|
SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
|
||||||
pip install dist/$SDIST
|
pip install dist/$SDIST --prefer-binary
|
||||||
displayName: 'Install from sdist'
|
displayName: 'Install from sdist'
|
||||||
|
|
||||||
- script: python -m pytest --pyargs spacy
|
- script: python -m pytest --pyargs spacy
|
||||||
|
|
|
@ -14,6 +14,7 @@ plac>=0.9.6,<1.2.0
|
||||||
pathlib==1.0.1; python_version < "3.4"
|
pathlib==1.0.1; python_version < "3.4"
|
||||||
tqdm>=4.38.0,<5.0.0
|
tqdm>=4.38.0,<5.0.0
|
||||||
# Optional dependencies
|
# Optional dependencies
|
||||||
|
pyrsistent<0.17.0
|
||||||
jsonschema>=2.6.0,<3.1.0
|
jsonschema>=2.6.0,<3.1.0
|
||||||
# Development dependencies
|
# Development dependencies
|
||||||
cython>=0.25
|
cython>=0.25
|
||||||
|
|
|
@ -7,8 +7,8 @@ def add_codes(err_cls):
|
||||||
|
|
||||||
class ErrorsWithCodes(err_cls):
|
class ErrorsWithCodes(err_cls):
|
||||||
def __getattribute__(self, code):
|
def __getattribute__(self, code):
|
||||||
msg = super().__getattribute__(code)
|
msg = super(ErrorsWithCodes, self).__getattribute__(code)
|
||||||
if code.startswith('__'): # python system attributes like __class__
|
if code.startswith("__"): # python system attributes like __class__
|
||||||
return msg
|
return msg
|
||||||
else:
|
else:
|
||||||
return "[{code}] {msg}".format(code=code, msg=msg)
|
return "[{code}] {msg}".format(code=code, msg=msg)
|
||||||
|
|
|
@ -1,11 +1,12 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from .stop_words import STOP_WORDS
|
from .stop_words import STOP_WORDS
|
||||||
from .lex_attrs import LEX_ATTRS
|
from .lex_attrs import LEX_ATTRS
|
||||||
from .tag_map import TAG_MAP
|
from .tag_map import TAG_MAP
|
||||||
|
|
||||||
|
|
||||||
from ...attrs import LANG
|
from ...attrs import LANG
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...tokens import Doc
|
|
||||||
|
|
||||||
|
|
||||||
class ArmenianDefaults(Language.Defaults):
|
class ArmenianDefaults(Language.Defaults):
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Example sentences to test spaCy and its language models.
|
Example sentences to test spaCy and its language models.
|
||||||
>>> from spacy.lang.hy.examples import sentences
|
>>> from spacy.lang.hy.examples import sentences
|
||||||
|
@ -11,6 +11,6 @@ Example sentences to test spaCy and its language models.
|
||||||
sentences = [
|
sentences = [
|
||||||
"Լոնդոնը Միացյալ Թագավորության մեծ քաղաք է։",
|
"Լոնդոնը Միացյալ Թագավորության մեծ քաղաք է։",
|
||||||
"Ո՞վ է Ֆրանսիայի նախագահը։",
|
"Ո՞վ է Ֆրանսիայի նախագահը։",
|
||||||
"Որն է Միացյալ Նահանգների մայրաքաղաքը։",
|
"Ո՞րն է Միացյալ Նահանգների մայրաքաղաքը։",
|
||||||
"Ե՞րբ է ծնվել Բարաք Օբաման։",
|
"Ե՞րբ է ծնվել Բարաք Օբաման։",
|
||||||
]
|
]
|
||||||
|
|
|
@ -1,11 +1,12 @@
|
||||||
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from ...attrs import LIKE_NUM
|
from ...attrs import LIKE_NUM
|
||||||
|
|
||||||
|
|
||||||
_num_words = [
|
_num_words = [
|
||||||
"զրօ",
|
"զրո",
|
||||||
"մէկ",
|
"մեկ",
|
||||||
"երկու",
|
"երկու",
|
||||||
"երեք",
|
"երեք",
|
||||||
"չորս",
|
"չորս",
|
||||||
|
@ -17,20 +18,21 @@ _num_words = [
|
||||||
"տասը",
|
"տասը",
|
||||||
"տասնմեկ",
|
"տասնմեկ",
|
||||||
"տասներկու",
|
"տասներկու",
|
||||||
"տասներեք",
|
"տասներեք",
|
||||||
"տասնչորս",
|
"տասնչորս",
|
||||||
"տասնհինգ",
|
"տասնհինգ",
|
||||||
"տասնվեց",
|
"տասնվեց",
|
||||||
"տասնյոթ",
|
"տասնյոթ",
|
||||||
"տասնութ",
|
"տասնութ",
|
||||||
"տասնինը",
|
"տասնինը",
|
||||||
"քսան" "երեսուն",
|
"քսան",
|
||||||
|
"երեսուն",
|
||||||
"քառասուն",
|
"քառասուն",
|
||||||
"հիսուն",
|
"հիսուն",
|
||||||
"վաթցսուն",
|
"վաթսուն",
|
||||||
"յոթանասուն",
|
"յոթանասուն",
|
||||||
"ութսուն",
|
"ութսուն",
|
||||||
"ինիսուն",
|
"իննսուն",
|
||||||
"հարյուր",
|
"հարյուր",
|
||||||
"հազար",
|
"հազար",
|
||||||
"միլիոն",
|
"միլիոն",
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
STOP_WORDS = set(
|
STOP_WORDS = set(
|
||||||
"""
|
"""
|
||||||
նա
|
նա
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -14,7 +14,7 @@ class PolishLemmatizer(Lemmatizer):
|
||||||
# lemmatization for nouns
|
# lemmatization for nouns
|
||||||
def __init__(self, lookups, *args, **kwargs):
|
def __init__(self, lookups, *args, **kwargs):
|
||||||
# this lemmatizer is lookup based, so it does not require an index, exceptionlist, or rules
|
# this lemmatizer is lookup based, so it does not require an index, exceptionlist, or rules
|
||||||
super().__init__(lookups)
|
super(PolishLemmatizer, self).__init__(lookups)
|
||||||
self.lemma_lookups = {}
|
self.lemma_lookups = {}
|
||||||
for tag in [
|
for tag in [
|
||||||
"ADJ",
|
"ADJ",
|
||||||
|
|
|
@ -16,7 +16,7 @@ from .tag_map import TAG_MAP
|
||||||
from ... import util
|
from ... import util
|
||||||
|
|
||||||
|
|
||||||
_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.22` or from https://github.com/lancopku/pkuseg-python"
|
_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.25` or from https://github.com/lancopku/pkuseg-python"
|
||||||
|
|
||||||
|
|
||||||
def try_jieba_import(use_jieba):
|
def try_jieba_import(use_jieba):
|
||||||
|
@ -109,6 +109,7 @@ class ChineseTokenizer(DummyTokenizer):
|
||||||
if reset:
|
if reset:
|
||||||
try:
|
try:
|
||||||
import pkuseg
|
import pkuseg
|
||||||
|
|
||||||
self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
|
self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
|
||||||
except ImportError:
|
except ImportError:
|
||||||
if self.use_pkuseg:
|
if self.use_pkuseg:
|
||||||
|
@ -118,7 +119,7 @@ class ChineseTokenizer(DummyTokenizer):
|
||||||
)
|
)
|
||||||
raise ImportError(msg)
|
raise ImportError(msg)
|
||||||
for word in words:
|
for word in words:
|
||||||
self.pkuseg_seg.preprocesser.insert(word.strip(), '')
|
self.pkuseg_seg.preprocesser.insert(word.strip(), "")
|
||||||
|
|
||||||
def _get_config(self):
|
def _get_config(self):
|
||||||
config = OrderedDict(
|
config = OrderedDict(
|
||||||
|
@ -168,21 +169,16 @@ class ChineseTokenizer(DummyTokenizer):
|
||||||
return util.to_bytes(serializers, [])
|
return util.to_bytes(serializers, [])
|
||||||
|
|
||||||
def from_bytes(self, data, **kwargs):
|
def from_bytes(self, data, **kwargs):
|
||||||
pkuseg_features_b = b""
|
pkuseg_data = {"features_b": b"", "weights_b": b"", "processors_data": None}
|
||||||
pkuseg_weights_b = b""
|
|
||||||
pkuseg_processors_data = None
|
|
||||||
|
|
||||||
def deserialize_pkuseg_features(b):
|
def deserialize_pkuseg_features(b):
|
||||||
nonlocal pkuseg_features_b
|
pkuseg_data["features_b"] = b
|
||||||
pkuseg_features_b = b
|
|
||||||
|
|
||||||
def deserialize_pkuseg_weights(b):
|
def deserialize_pkuseg_weights(b):
|
||||||
nonlocal pkuseg_weights_b
|
pkuseg_data["weights_b"] = b
|
||||||
pkuseg_weights_b = b
|
|
||||||
|
|
||||||
def deserialize_pkuseg_processors(b):
|
def deserialize_pkuseg_processors(b):
|
||||||
nonlocal pkuseg_processors_data
|
pkuseg_data["processors_data"] = srsly.msgpack_loads(b)
|
||||||
pkuseg_processors_data = srsly.msgpack_loads(b)
|
|
||||||
|
|
||||||
deserializers = OrderedDict(
|
deserializers = OrderedDict(
|
||||||
(
|
(
|
||||||
|
@ -194,13 +190,13 @@ class ChineseTokenizer(DummyTokenizer):
|
||||||
)
|
)
|
||||||
util.from_bytes(data, deserializers, [])
|
util.from_bytes(data, deserializers, [])
|
||||||
|
|
||||||
if pkuseg_features_b and pkuseg_weights_b:
|
if pkuseg_data["features_b"] and pkuseg_data["weights_b"]:
|
||||||
with tempfile.TemporaryDirectory() as tempdir:
|
with tempfile.TemporaryDirectory() as tempdir:
|
||||||
tempdir = Path(tempdir)
|
tempdir = Path(tempdir)
|
||||||
with open(tempdir / "features.pkl", "wb") as fileh:
|
with open(tempdir / "features.pkl", "wb") as fileh:
|
||||||
fileh.write(pkuseg_features_b)
|
fileh.write(pkuseg_data["features_b"])
|
||||||
with open(tempdir / "weights.npz", "wb") as fileh:
|
with open(tempdir / "weights.npz", "wb") as fileh:
|
||||||
fileh.write(pkuseg_weights_b)
|
fileh.write(pkuseg_data["weights_b"])
|
||||||
try:
|
try:
|
||||||
import pkuseg
|
import pkuseg
|
||||||
except ImportError:
|
except ImportError:
|
||||||
|
@ -209,13 +205,9 @@ class ChineseTokenizer(DummyTokenizer):
|
||||||
+ _PKUSEG_INSTALL_MSG
|
+ _PKUSEG_INSTALL_MSG
|
||||||
)
|
)
|
||||||
self.pkuseg_seg = pkuseg.pkuseg(str(tempdir))
|
self.pkuseg_seg = pkuseg.pkuseg(str(tempdir))
|
||||||
if pkuseg_processors_data:
|
if pkuseg_data["processors_data"]:
|
||||||
(
|
processors_data = pkuseg_data["processors_data"]
|
||||||
user_dict,
|
(user_dict, do_process, common_words, other_words) = processors_data
|
||||||
do_process,
|
|
||||||
common_words,
|
|
||||||
other_words,
|
|
||||||
) = pkuseg_processors_data
|
|
||||||
self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
|
self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
|
||||||
self.pkuseg_seg.postprocesser.do_process = do_process
|
self.pkuseg_seg.postprocesser.do_process = do_process
|
||||||
self.pkuseg_seg.postprocesser.common_words = set(common_words)
|
self.pkuseg_seg.postprocesser.common_words = set(common_words)
|
||||||
|
|
|
@ -235,8 +235,8 @@ cdef class DependencyMatcher:
|
||||||
|
|
||||||
matched_trees = []
|
matched_trees = []
|
||||||
self.recurse(_tree,id_to_position,_node_operator_map,0,[],matched_trees)
|
self.recurse(_tree,id_to_position,_node_operator_map,0,[],matched_trees)
|
||||||
|
if len(matched_trees) > 0:
|
||||||
matched_key_trees.append((key,matched_trees))
|
matched_key_trees.append((key,matched_trees))
|
||||||
|
|
||||||
for i, (ent_id, nodes) in enumerate(matched_key_trees):
|
for i, (ent_id, nodes) in enumerate(matched_key_trees):
|
||||||
on_match = self._callbacks.get(ent_id)
|
on_match = self._callbacks.get(ent_id)
|
||||||
if on_match is not None:
|
if on_match is not None:
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
# coding: utf-8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
|
@ -7,6 +7,7 @@ from mock import Mock
|
||||||
from spacy.matcher import Matcher, DependencyMatcher
|
from spacy.matcher import Matcher, DependencyMatcher
|
||||||
from spacy.tokens import Doc, Token
|
from spacy.tokens import Doc, Token
|
||||||
from ..doc.test_underscore import clean_underscore # noqa: F401
|
from ..doc.test_underscore import clean_underscore # noqa: F401
|
||||||
|
from ..util import get_doc
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
@ -301,22 +302,6 @@ def test_matcher_extension_set_membership(en_vocab):
|
||||||
assert len(matches) == 0
|
assert len(matches) == 0
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def text():
|
|
||||||
return "The quick brown fox jumped over the lazy fox"
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def heads():
|
|
||||||
return [3, 2, 1, 1, 0, -1, 2, 1, -3]
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def deps():
|
|
||||||
return ["det", "amod", "amod", "nsubj", "prep", "pobj", "det", "amod"]
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def dependency_matcher(en_vocab):
|
def dependency_matcher(en_vocab):
|
||||||
def is_brown_yellow(text):
|
def is_brown_yellow(text):
|
||||||
return bool(re.compile(r"brown|yellow|over").match(text))
|
return bool(re.compile(r"brown|yellow|over").match(text))
|
||||||
|
@ -359,24 +344,40 @@ def dependency_matcher(en_vocab):
|
||||||
},
|
},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# pattern that doesn't match
|
||||||
|
pattern4 = [
|
||||||
|
{"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "NOMATCH"}},
|
||||||
|
{
|
||||||
|
"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
|
||||||
|
"PATTERN": {"ORTH": "fox"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">>", "NBOR_NAME": "fox"},
|
||||||
|
"PATTERN": {"ORTH": "brown"},
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
matcher = DependencyMatcher(en_vocab)
|
matcher = DependencyMatcher(en_vocab)
|
||||||
matcher.add("pattern1", [pattern1])
|
on_match = Mock()
|
||||||
matcher.add("pattern2", [pattern2])
|
matcher.add("pattern1", [pattern1], on_match=on_match)
|
||||||
matcher.add("pattern3", [pattern3])
|
matcher.add("pattern2", [pattern2], on_match=on_match)
|
||||||
|
matcher.add("pattern3", [pattern3], on_match=on_match)
|
||||||
|
matcher.add("pattern4", [pattern4], on_match=on_match)
|
||||||
|
|
||||||
return matcher
|
assert len(dependency_matcher) == 4
|
||||||
|
|
||||||
|
text = "The quick brown fox jumped over the lazy fox"
|
||||||
|
heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
|
||||||
|
deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "pobj", "det", "amod"]
|
||||||
|
|
||||||
def test_dependency_matcher_compile(dependency_matcher):
|
doc = get_doc(dependency_matcher.vocab, text.split(), heads=heads, deps=deps)
|
||||||
assert len(dependency_matcher) == 3
|
matches = dependency_matcher(doc)
|
||||||
|
|
||||||
|
assert len(matches) == 3
|
||||||
# def test_dependency_matcher(dependency_matcher, text, heads, deps):
|
assert matches[0][1] == [[3, 1, 2]]
|
||||||
# doc = get_doc(dependency_matcher.vocab, text.split(), heads=heads, deps=deps)
|
assert matches[1][1] == [[4, 3, 3]]
|
||||||
# matches = dependency_matcher(doc)
|
assert matches[2][1] == [[4, 3, 2]]
|
||||||
# assert matches[0][1] == [[3, 1, 2]]
|
assert on_match.call_count == 3
|
||||||
# assert matches[1][1] == [[4, 3, 3]]
|
|
||||||
# assert matches[2][1] == [[4, 3, 2]]
|
|
||||||
|
|
||||||
|
|
||||||
def test_matcher_basic_check(en_vocab):
|
def test_matcher_basic_check(en_vocab):
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user