Mirror of https://github.com/explosion/spaCy.git
Update v2.2.x for bugfix release (#6384)
* Fix on_match callback and remove empty patterns (#6312)

  For the `DependencyMatcher`:
  * Fix on_match callback so that it is called once per matched pattern
  * Fix results so that patterns with empty match lists are not returned

* Add --prefer-binary for python 3.5
* Add version pins for pyrsistent
* Use backwards-compatible super()
* Try to fix tests on Travis (2.7)
* Fix naming conflict and formatting
* Update pkuseg version in Chinese tokenizer warnings
* Some changes for Armenian (#5616)
  * Fixing numericals
  * We need an Armenian question sign to make the sentence a question
* Update lex_attrs.py (#5608)
* Fix compat
* Update Armenian from v2.3.x

Co-authored-by: Ines Montani <ines@ines.io>
Co-authored-by: Karen Hambardzumyan <mahnerak@gmail.com>
Co-authored-by: Marat M. Yavrumyan <myavrum@ysu.am>
This commit is contained in:
Parent: a41e28ceba
Commit: ada4fc0f09
azure-pipelines.yml
@@ -80,8 +80,8 @@ jobs:
     architecture: 'x64'
 
   - script: |
-      python -m pip install -U setuptools
-      pip install -r requirements.txt
+      python -m pip install -U pip setuptools
+      pip install -r requirements.txt --prefer-binary
     displayName: 'Install dependencies'
 
   - script: |
@@ -96,7 +96,7 @@ jobs:
 
   - bash: |
       SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
-      pip install dist/$SDIST
+      pip install dist/$SDIST --prefer-binary
     displayName: 'Install from sdist'
 
   - script: python -m pytest --pyargs spacy
requirements.txt
@@ -14,6 +14,7 @@ plac>=0.9.6,<1.2.0
 pathlib==1.0.1; python_version < "3.4"
 tqdm>=4.38.0,<5.0.0
 # Optional dependencies
+pyrsistent<0.17.0
 jsonschema>=2.6.0,<3.1.0
 # Development dependencies
 cython>=0.25
spacy/errors.py
@@ -7,8 +7,8 @@ def add_codes(err_cls):
 
     class ErrorsWithCodes(err_cls):
         def __getattribute__(self, code):
-            msg = super().__getattribute__(code)
-            if code.startswith('__'):  # python system attributes like __class__
+            msg = super(ErrorsWithCodes, self).__getattribute__(code)
+            if code.startswith("__"):  # python system attributes like __class__
                 return msg
             else:
                 return "[{code}] {msg}".format(code=code, msg=msg)
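This change (and the matching one in the Polish lemmatizer further down) swaps the zero-argument super(), which only exists on Python 3, for the explicit two-argument form that also runs on Python 2.7, which this v2.2.x branch still supports. A minimal sketch of the difference, with hypothetical class names:

# Zero-argument super() raises a TypeError at runtime on Python 2;
# the explicit two-argument form behaves identically on 2 and 3.
class Base(object):
    def greet(self):
        return "hello"


class Child(Base):
    def greet(self):
        # Equivalent to super().greet() on Python 3, but also valid
        # on Python 2:
        return super(Child, self).greet().upper()


print(Child().greet())  # HELLO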
spacy/lang/hy/__init__.py
@@ -1,11 +1,12 @@
 # coding: utf8
 from __future__ import unicode_literals
 
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
+from .tag_map import TAG_MAP
 
 from ...attrs import LANG
 from ...language import Language
 from ...tokens import Doc
 
 
 class ArmenianDefaults(Language.Defaults):
spacy/lang/hy/examples.py
@@ -1,6 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals
 
 
 """
 Example sentences to test spaCy and its language models.
 >>> from spacy.lang.hy.examples import sentences
@@ -11,6 +11,6 @@ Example sentences to test spaCy and its language models.
 sentences = [
     "Լոնդոնը Միացյալ Թագավորության մեծ քաղաք է։",
     "Ո՞վ է Ֆրանսիայի նախագահը։",
-    "Որն է Միացյալ Նահանգների մայրաքաղաքը։",
+    "Ո՞րն է Միացյալ Նահանգների մայրաքաղաքը։",
     "Ե՞րբ է ծնվել Բարաք Օբաման։",
 ]
spacy/lang/hy/lex_attrs.py
@@ -1,11 +1,12 @@
 # coding: utf8
 from __future__ import unicode_literals
 
 from ...attrs import LIKE_NUM
 
 
 _num_words = [
-    "զրօ",
-    "մէկ",
+    "զրո",
+    "մեկ",
     "երկու",
     "երեք",
     "չորս",
@@ -17,20 +18,21 @@ _num_words = [
     "տասը",
     "տասնմեկ",
     "տասներկու",
     "տասներեք",
     "տասնչորս",
     "տասնհինգ",
     "տասնվեց",
     "տասնյոթ",
     "տասնութ",
     "տասնինը",
-    "քսան" "երեսուն",
+    "քսան",
+    "երեսուն",
     "քառասուն",
     "հիսուն",
-    "վաթցսուն",
+    "վաթսուն",
     "յոթանասուն",
     "ութսուն",
-    "ինիսուն",
+    "իննսուն",
     "հարյուր",
     "հազար",
     "միլիոն",
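The `"քսան" "երեսուն",` line fixed here is a missing-comma bug: adjacent string literals are concatenated implicitly in Python, so the old list held one merged entry instead of the two number words ("twenty" and "thirty"), and neither word passed the LIKE_NUM membership check. A minimal sketch with English placeholders:

# Adjacent string literals concatenate implicitly, so a missing comma
# silently merges two list entries into one.
broken = ["twenty" "thirty"]   # == ["twentythirty"], one element
fixed = ["twenty", "thirty"]   # two elements, as intended
assert len(broken) == 1
assert len(fixed) == 2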
spacy/lang/hy/stop_words.py
@@ -1,6 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals
 
 
 STOP_WORDS = set(
     """
 նա
@@ -105,6 +105,6 @@ STOP_WORDS = set(
 յուրաքանչյուր
 այս
 մեջ
 թ
 """.split()
 )
File diff suppressed because it is too large
spacy/lang/pl/lemmatizer.py
@@ -14,7 +14,7 @@ class PolishLemmatizer(Lemmatizer):
     # lemmatization for nouns
     def __init__(self, lookups, *args, **kwargs):
         # this lemmatizer is lookup based, so it does not require an index, exceptionlist, or rules
-        super().__init__(lookups)
+        super(PolishLemmatizer, self).__init__(lookups)
         self.lemma_lookups = {}
         for tag in [
             "ADJ",
spacy/lang/zh/__init__.py
@@ -16,7 +16,7 @@ from .tag_map import TAG_MAP
 from ... import util
 
 
-_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.22` or from https://github.com/lancopku/pkuseg-python"
+_PKUSEG_INSTALL_MSG = "install it with `pip install pkuseg==0.0.25` or from https://github.com/lancopku/pkuseg-python"
 
 
 def try_jieba_import(use_jieba):
@@ -109,6 +109,7 @@ class ChineseTokenizer(DummyTokenizer):
         if reset:
             try:
                 import pkuseg
+
                 self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None)
             except ImportError:
                 if self.use_pkuseg:
@@ -118,7 +119,7 @@ class ChineseTokenizer(DummyTokenizer):
                 )
                 raise ImportError(msg)
         for word in words:
-            self.pkuseg_seg.preprocesser.insert(word.strip(), '')
+            self.pkuseg_seg.preprocesser.insert(word.strip(), "")
 
     def _get_config(self):
         config = OrderedDict(
@@ -168,21 +169,16 @@ class ChineseTokenizer(DummyTokenizer):
         return util.to_bytes(serializers, [])
 
     def from_bytes(self, data, **kwargs):
-        pkuseg_features_b = b""
-        pkuseg_weights_b = b""
-        pkuseg_processors_data = None
+        pkuseg_data = {"features_b": b"", "weights_b": b"", "processors_data": None}
 
         def deserialize_pkuseg_features(b):
-            nonlocal pkuseg_features_b
-            pkuseg_features_b = b
+            pkuseg_data["features_b"] = b
 
         def deserialize_pkuseg_weights(b):
-            nonlocal pkuseg_weights_b
-            pkuseg_weights_b = b
+            pkuseg_data["weights_b"] = b
 
         def deserialize_pkuseg_processors(b):
-            nonlocal pkuseg_processors_data
-            pkuseg_processors_data = srsly.msgpack_loads(b)
+            pkuseg_data["processors_data"] = srsly.msgpack_loads(b)
 
         deserializers = OrderedDict(
             (
@@ -194,13 +190,13 @@ class ChineseTokenizer(DummyTokenizer):
         )
         util.from_bytes(data, deserializers, [])
 
-        if pkuseg_features_b and pkuseg_weights_b:
+        if pkuseg_data["features_b"] and pkuseg_data["weights_b"]:
             with tempfile.TemporaryDirectory() as tempdir:
                 tempdir = Path(tempdir)
                 with open(tempdir / "features.pkl", "wb") as fileh:
-                    fileh.write(pkuseg_features_b)
+                    fileh.write(pkuseg_data["features_b"])
                 with open(tempdir / "weights.npz", "wb") as fileh:
-                    fileh.write(pkuseg_weights_b)
+                    fileh.write(pkuseg_data["weights_b"])
                 try:
                     import pkuseg
                 except ImportError:
@@ -209,13 +205,9 @@ class ChineseTokenizer(DummyTokenizer):
                         + _PKUSEG_INSTALL_MSG
                     )
                 self.pkuseg_seg = pkuseg.pkuseg(str(tempdir))
-        if pkuseg_processors_data:
-            (
-                user_dict,
-                do_process,
-                common_words,
-                other_words,
-            ) = pkuseg_processors_data
+        if pkuseg_data["processors_data"]:
+            processors_data = pkuseg_data["processors_data"]
+            (user_dict, do_process, common_words, other_words) = processors_data
             self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict)
             self.pkuseg_seg.postprocesser.do_process = do_process
             self.pkuseg_seg.postprocesser.common_words = set(common_words)
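The `from_bytes` refactor above replaces `nonlocal`, which is a syntax error on Python 2, with a mutable dict that the deserializer closures share. A minimal sketch of the pattern, with hypothetical names:

# Closures cannot rebind an enclosing local on Python 2 (no
# `nonlocal` keyword), but they can mutate a shared dict in place,
# which works identically on Python 2 and 3.
def make_reader():
    state = {"payload": None}

    def read(value):
        state["payload"] = value  # no nonlocal needed

    read(b"data")
    return state["payload"]


print(make_reader())  # b'data'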
spacy/matcher/dependencymatcher.pyx
@@ -235,12 +235,12 @@ cdef class DependencyMatcher:
 
             matched_trees = []
             self.recurse(_tree,id_to_position,_node_operator_map,0,[],matched_trees)
-            matched_key_trees.append((key,matched_trees))
-
-        for i, (ent_id, nodes) in enumerate(matched_key_trees):
-            on_match = self._callbacks.get(ent_id)
-            if on_match is not None:
-                on_match(self, doc, i, matched_key_trees)
+            if len(matched_trees) > 0:
+                matched_key_trees.append((key,matched_trees))
+        for i, (ent_id, nodes) in enumerate(matched_key_trees):
+            on_match = self._callbacks.get(ent_id)
+            if on_match is not None:
+                on_match(self, doc, i, matched_key_trees)
         return matched_key_trees
 
     def recurse(self,tree,id_to_position,_node_operator_map,int patternLength,visitedNodes,matched_trees):
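The guard added above means a pattern key is only appended when it produced at least one matched tree, so empty match lists no longer appear in the results and `on_match` fires once per pattern that actually matched. A hedged sketch of the observable behaviour, using the v2.2.x pattern syntax from the tests below; the model name and the single-node pattern are assumptions for illustration:

import spacy
from spacy.matcher import DependencyMatcher

nlp = spacy.load("en_core_web_sm")  # assumption: this model is installed
matcher = DependencyMatcher(nlp.vocab)

# A pattern whose root token cannot match anything in the text.
pattern = [{"SPEC": {"NODE_NAME": "root"}, "PATTERN": {"ORTH": "NOMATCH"}}]

def on_match(matcher, doc, i, matches):
    print("pattern", i, "matched:", matches[i])

matcher.add("no_match", [pattern], on_match=on_match)
doc = nlp("The quick brown fox jumped over the lazy fox")
matches = matcher(doc)

# After the fix: no empty entry for "no_match" in the results, and
# its on_match callback is never invoked.
assert matches == []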
@@ -1,3 +1,4 @@
 # coding: utf8
 from __future__ import unicode_literals
 
+import pytest
@@ -1,3 +1,4 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import pytest
spacy/tests/matcher/test_matcher_api.py
@@ -7,6 +7,7 @@ from mock import Mock
 from spacy.matcher import Matcher, DependencyMatcher
 from spacy.tokens import Doc, Token
+from ..doc.test_underscore import clean_underscore  # noqa: F401
 from ..util import get_doc
 
 
 @pytest.fixture
@@ -301,22 +302,6 @@ def test_matcher_extension_set_membership(en_vocab):
     assert len(matches) == 0
 
 
-@pytest.fixture
-def text():
-    return "The quick brown fox jumped over the lazy fox"
-
-
-@pytest.fixture
-def heads():
-    return [3, 2, 1, 1, 0, -1, 2, 1, -3]
-
-
-@pytest.fixture
-def deps():
-    return ["det", "amod", "amod", "nsubj", "prep", "pobj", "det", "amod"]
-
-
 @pytest.fixture
 def dependency_matcher(en_vocab):
     def is_brown_yellow(text):
         return bool(re.compile(r"brown|yellow|over").match(text))
@@ -359,24 +344,40 @@ def dependency_matcher(en_vocab):
         },
     ]
 
+    # pattern that doesn't match
+    pattern4 = [
+        {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "NOMATCH"}},
+        {
+            "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
+            "PATTERN": {"ORTH": "fox"},
+        },
+        {
+            "SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">>", "NBOR_NAME": "fox"},
+            "PATTERN": {"ORTH": "brown"},
+        },
+    ]
+
     matcher = DependencyMatcher(en_vocab)
-    matcher.add("pattern1", [pattern1])
-    matcher.add("pattern2", [pattern2])
-    matcher.add("pattern3", [pattern3])
+    on_match = Mock()
+    matcher.add("pattern1", [pattern1], on_match=on_match)
+    matcher.add("pattern2", [pattern2], on_match=on_match)
+    matcher.add("pattern3", [pattern3], on_match=on_match)
+    matcher.add("pattern4", [pattern4], on_match=on_match)
 
     return matcher
 
 
 def test_dependency_matcher_compile(dependency_matcher):
-    assert len(dependency_matcher) == 3
+    assert len(dependency_matcher) == 4
+
+    text = "The quick brown fox jumped over the lazy fox"
+    heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
+    deps = ["det", "amod", "amod", "nsubj", "ROOT", "prep", "pobj", "det", "amod"]
+
+    doc = get_doc(dependency_matcher.vocab, text.split(), heads=heads, deps=deps)
+    matches = dependency_matcher(doc)
 
-
-# def test_dependency_matcher(dependency_matcher, text, heads, deps):
-#     doc = get_doc(dependency_matcher.vocab, text.split(), heads=heads, deps=deps)
-#     matches = dependency_matcher(doc)
-#     assert matches[0][1] == [[3, 1, 2]]
-#     assert matches[1][1] == [[4, 3, 3]]
-#     assert matches[2][1] == [[4, 3, 2]]
+    assert len(matches) == 3
+    assert matches[0][1] == [[3, 1, 2]]
+    assert matches[1][1] == [[4, 3, 3]]
+    assert matches[2][1] == [[4, 3, 2]]
+    assert on_match.call_count == 3
 
 
 def test_matcher_basic_check(en_vocab):
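The new assertions lean on `mock.Mock` recording every invocation: `call_count == 3` pins down that the callback ran once per matching pattern, since pattern4 matches nothing. A minimal sketch of that mechanism, with hypothetical pattern names:

from mock import Mock  # same import the test module uses

on_match = Mock()
for key in ["pattern1", "pattern2", "pattern3"]:
    on_match(key)

assert on_match.call_count == 3
on_match.assert_any_call("pattern2")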
@@ -1,3 +1,5 @@
 # coding: utf8
 from __future__ import unicode_literals
+
+from spacy.lang.en import English
 