From 56de520afd2276e80f634ceb01e8c5a51ea64bb5 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 May 2020 14:04:57 +0200 Subject: [PATCH 1/2] Try to fix tests on Travis (2.7) --- spacy/lang/hy/examples.py | 1 + spacy/lang/hy/lex_attrs.py | 1 + spacy/lang/hy/stop_words.py | 3 ++- spacy/lang/zh/__init__.py | 36 ++++++++++++++------------------ spacy/tests/lang/hy/test_text.py | 1 + 5 files changed, 21 insertions(+), 21 deletions(-) diff --git a/spacy/lang/hy/examples.py b/spacy/lang/hy/examples.py index b0df31aae..d04204c55 100644 --- a/spacy/lang/hy/examples.py +++ b/spacy/lang/hy/examples.py @@ -1,3 +1,4 @@ +# coding: utf8 from __future__ import unicode_literals diff --git a/spacy/lang/hy/lex_attrs.py b/spacy/lang/hy/lex_attrs.py index 7c1b9592f..910625fb8 100644 --- a/spacy/lang/hy/lex_attrs.py +++ b/spacy/lang/hy/lex_attrs.py @@ -1,3 +1,4 @@ +# coding: utf8 from __future__ import unicode_literals from ...attrs import LIKE_NUM diff --git a/spacy/lang/hy/stop_words.py b/spacy/lang/hy/stop_words.py index c671956a4..3f2f7bb15 100644 --- a/spacy/lang/hy/stop_words.py +++ b/spacy/lang/hy/stop_words.py @@ -1,3 +1,4 @@ +# coding: utf8 from __future__ import unicode_literals @@ -105,6 +106,6 @@ STOP_WORDS = set( յուրաքանչյուր այս մեջ -թ +թ """.split() ) diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index ed0b3eb74..508c5a03f 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -109,6 +109,7 @@ class ChineseTokenizer(DummyTokenizer): if reset: try: import pkuseg + self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(None) except ImportError: if self.use_pkuseg: @@ -118,7 +119,7 @@ class ChineseTokenizer(DummyTokenizer): ) raise ImportError(msg) for word in words: - self.pkuseg_seg.preprocesser.insert(word.strip(), '') + self.pkuseg_seg.preprocesser.insert(word.strip(), "") def _get_config(self): config = OrderedDict( @@ -168,21 +169,19 @@ class ChineseTokenizer(DummyTokenizer): return util.to_bytes(serializers, []) def from_bytes(self, data, **kwargs): - pkuseg_features_b = b"" - pkuseg_weights_b = b"" - pkuseg_processors_data = None + data = {"features_b": b"", "weights_b": b"", "processors_data": None} + # pkuseg_features_b = b"" + # pkuseg_weights_b = b"" + # pkuseg_processors_data = None def deserialize_pkuseg_features(b): - nonlocal pkuseg_features_b - pkuseg_features_b = b + data["features_b"] = b def deserialize_pkuseg_weights(b): - nonlocal pkuseg_weights_b - pkuseg_weights_b = b + data["weights_b"] = b def deserialize_pkuseg_processors(b): - nonlocal pkuseg_processors_data - pkuseg_processors_data = srsly.msgpack_loads(b) + data["processors_data"] = srsly.msgpack_loads(b) deserializers = OrderedDict( ( @@ -194,13 +193,13 @@ class ChineseTokenizer(DummyTokenizer): ) util.from_bytes(data, deserializers, []) - if pkuseg_features_b and pkuseg_weights_b: + if data["features_b"] and data["weights_b"]: with tempfile.TemporaryDirectory() as tempdir: tempdir = Path(tempdir) with open(tempdir / "features.pkl", "wb") as fileh: - fileh.write(pkuseg_features_b) + fileh.write(data["features_b"]) with open(tempdir / "weights.npz", "wb") as fileh: - fileh.write(pkuseg_weights_b) + fileh.write(data["weights_b"]) try: import pkuseg except ImportError: @@ -209,13 +208,10 @@ class ChineseTokenizer(DummyTokenizer): + _PKUSEG_INSTALL_MSG ) self.pkuseg_seg = pkuseg.pkuseg(str(tempdir)) - if pkuseg_processors_data: - ( - user_dict, - do_process, - common_words, - other_words, - ) = pkuseg_processors_data + if data["processors_data"]: + (user_dict, do_process, common_words, other_words) = data[ + "processors_data" + ] self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict) self.pkuseg_seg.postprocesser.do_process = do_process self.pkuseg_seg.postprocesser.common_words = set(common_words) diff --git a/spacy/tests/lang/hy/test_text.py b/spacy/tests/lang/hy/test_text.py index 6b785bdfc..cbdb77e4e 100644 --- a/spacy/tests/lang/hy/test_text.py +++ b/spacy/tests/lang/hy/test_text.py @@ -1,3 +1,4 @@ +# coding: utf8 from __future__ import unicode_literals import pytest From bea863acd255407887806d1089c1f63896cdf084 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 May 2020 14:24:38 +0200 Subject: [PATCH 2/2] Fix naming conflict and formatting --- spacy/lang/zh/__init__.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 508c5a03f..9d1cb71a7 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -169,19 +169,16 @@ class ChineseTokenizer(DummyTokenizer): return util.to_bytes(serializers, []) def from_bytes(self, data, **kwargs): - data = {"features_b": b"", "weights_b": b"", "processors_data": None} - # pkuseg_features_b = b"" - # pkuseg_weights_b = b"" - # pkuseg_processors_data = None + pkuseg_data = {"features_b": b"", "weights_b": b"", "processors_data": None} def deserialize_pkuseg_features(b): - data["features_b"] = b + pkuseg_data["features_b"] = b def deserialize_pkuseg_weights(b): - data["weights_b"] = b + pkuseg_data["weights_b"] = b def deserialize_pkuseg_processors(b): - data["processors_data"] = srsly.msgpack_loads(b) + pkuseg_data["processors_data"] = srsly.msgpack_loads(b) deserializers = OrderedDict( ( @@ -193,13 +190,13 @@ class ChineseTokenizer(DummyTokenizer): ) util.from_bytes(data, deserializers, []) - if data["features_b"] and data["weights_b"]: + if pkuseg_data["features_b"] and pkuseg_data["weights_b"]: with tempfile.TemporaryDirectory() as tempdir: tempdir = Path(tempdir) with open(tempdir / "features.pkl", "wb") as fileh: - fileh.write(data["features_b"]) + fileh.write(pkuseg_data["features_b"]) with open(tempdir / "weights.npz", "wb") as fileh: - fileh.write(data["weights_b"]) + fileh.write(pkuseg_data["weights_b"]) try: import pkuseg except ImportError: @@ -208,10 +205,9 @@ class ChineseTokenizer(DummyTokenizer): + _PKUSEG_INSTALL_MSG ) self.pkuseg_seg = pkuseg.pkuseg(str(tempdir)) - if data["processors_data"]: - (user_dict, do_process, common_words, other_words) = data[ - "processors_data" - ] + if pkuseg_data["processors_data"]: + processors_data = pkuseg_data["processors_data"] + (user_dict, do_process, common_words, other_words) = processors_data self.pkuseg_seg.preprocesser = pkuseg.Preprocesser(user_dict) self.pkuseg_seg.postprocesser.do_process = do_process self.pkuseg_seg.postprocesser.common_words = set(common_words)