Remove unicode declarations and tidy up

Ines Montani 2020-06-21 22:34:10 +02:00
parent 689600e17d
commit 40bb918a4c
35 changed files with 76 additions and 147 deletions
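
For context: both removed lines are Python 2 compatibility boilerplate. Python 3 already treats source files as UTF-8 (PEP 3120) and makes string literals unicode by default, so once Python 2 support is dropped the header does nothing. A minimal before/after sketch (file contents are illustrative, not taken from this diff):

```python
# Before (Python 2 compatible): the encoding comment and the
# __future__ import were needed for "héllo" to be a unicode string.
# coding: utf8
from __future__ import unicode_literals

greeting = "héllo"

# After (Python 3 only): identical behavior with no boilerplate;
# str is unicode and the source encoding defaults to UTF-8.
greeting = "héllo"
```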

View File

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
 from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT
 from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA

View File

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS
 from ...language import Language

View File

@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.

View File

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 STOP_WORDS = set(
 """
 એમ

View File

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .tag_map import TAG_MAP

View File

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 """
 Example sentences to test spaCy and its language models.
 >>> from spacy.lang.hy.examples import sentences

View File

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...attrs import LIKE_NUM

View File

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 STOP_WORDS = set(
 """
 նա

View File

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import POS, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN
 from ...symbols import PROPN, PART, INTJ, PRON, SCONJ, AUX, CCONJ

View File

@@ -1,21 +1,11 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from .stop_words import STOP_WORDS
 POS_PHRASE_MAP = {
     "NOUN": "NP",
     "NUM": "NP",
     "PRON": "NP",
     "PROPN": "NP",
     "VERB": "VP",
     "ADJ": "ADJP",
     "ADV": "ADVP",
     "CCONJ": "CCONJP",
 }
@@ -37,7 +27,18 @@ def yield_bunsetu(doc, debug=False):
         dep = t.dep_
         head = t.head.i
         if debug:
-            print(t.i, t.orth_, pos, pos_type, dep, head, bunsetu_may_end, phrase_type, phrase, bunsetu)
+            print(
+                t.i,
+                t.orth_,
+                pos,
+                pos_type,
+                dep,
+                head,
+                bunsetu_may_end,
+                phrase_type,
+                phrase,
+                bunsetu,
+            )
 
         # DET is always an individual bunsetu
         if pos == "DET":
@@ -75,19 +76,31 @@ def yield_bunsetu(doc, debug=False):
         # entering new bunsetu
         elif pos_type and (
-            pos_type != phrase_type or  # different phrase type arises
-            bunsetu_may_end  # same phrase type but bunsetu already ended
+            pos_type != phrase_type
+            or bunsetu_may_end  # different phrase type arises  # same phrase type but bunsetu already ended
         ):
             # exceptional case: NOUN to VERB
-            if phrase_type == "NP" and pos_type == "VP" and prev_dep == 'compound' and prev_head == t.i:
+            if (
+                phrase_type == "NP"
+                and pos_type == "VP"
+                and prev_dep == "compound"
+                and prev_head == t.i
+            ):
                 bunsetu.append(t)
                 phrase_type = "VP"
                 phrase.append(t)
             # exceptional case: VERB to NOUN
-            elif phrase_type == "VP" and pos_type == "NP" and (
-                prev_dep == 'compound' and prev_head == t.i or
-                dep == 'compound' and prev == head or
-                prev_dep == 'nmod' and prev_head == t.i
+            elif (
+                phrase_type == "VP"
+                and pos_type == "NP"
+                and (
+                    prev_dep == "compound"
+                    and prev_head == t.i
+                    or dep == "compound"
+                    and prev == head
+                    or prev_dep == "nmod"
+                    and prev_head == t.i
+                )
             ):
                 bunsetu.append(t)
                 phrase_type = "NP"
@@ -102,11 +115,18 @@ def yield_bunsetu(doc, debug=False):
         # NOUN bunsetu
         elif phrase_type == "NP":
             bunsetu.append(t)
-            if not bunsetu_may_end and ((
-                (pos_type == "NP" or pos == "SYM") and (prev_head == t.i or prev_head == head) and prev_dep in {'compound', 'nummod'}
-            ) or (
-                pos == "PART" and (prev == head or prev_head == head) and dep == 'mark'
-            )):
+            if not bunsetu_may_end and (
+                (
+                    (pos_type == "NP" or pos == "SYM")
+                    and (prev_head == t.i or prev_head == head)
+                    and prev_dep in {"compound", "nummod"}
+                )
+                or (
+                    pos == "PART"
+                    and (prev == head or prev_head == head)
+                    and dep == "mark"
+                )
+            ):
                 phrase.append(t)
             else:
                 bunsetu_may_end = True
@@ -114,19 +134,31 @@ def yield_bunsetu(doc, debug=False):
         # VERB bunsetu
         elif phrase_type == "VP":
             bunsetu.append(t)
-            if not bunsetu_may_end and pos == "VERB" and prev_head == t.i and prev_dep == 'compound':
+            if (
+                not bunsetu_may_end
+                and pos == "VERB"
+                and prev_head == t.i
+                and prev_dep == "compound"
+            ):
                 phrase.append(t)
             else:
                 bunsetu_may_end = True
         # ADJ bunsetu
-        elif phrase_type == "ADJP" and tag != '連体詞':
+        elif phrase_type == "ADJP" and tag != "連体詞":
             bunsetu.append(t)
-            if not bunsetu_may_end and ((
-                pos == "NOUN" and (prev_head == t.i or prev_head == head) and prev_dep in {'amod', 'compound'}
-            ) or (
-                pos == "PART" and (prev == head or prev_head == head) and dep == 'mark'
-            )):
+            if not bunsetu_may_end and (
+                (
+                    pos == "NOUN"
+                    and (prev_head == t.i or prev_head == head)
+                    and prev_dep in {"amod", "compound"}
+                )
+                or (
+                    pos == "PART"
+                    and (prev == head or prev_head == head)
+                    and dep == "mark"
+                )
+            ):
                 phrase.append(t)
             else:
                 bunsetu_may_end = True
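
A note on reading the reflowed conditions above: this is a pure formatting pass (Black-style wrapping and double quotes), so the logic still relies on `and` binding tighter than `or`, and the multi-line chains group exactly as the old one-liners did, i.e. as (A and B) or (C and D) or (E and F). A quick self-contained check with stand-in booleans:

```python
# `and` binds tighter than `or`, so the unparenthesized chain groups as
# (a and b) or (c and d) or (e and f). Stand-in flags, exhaustively checked.
for bits in range(64):
    a, b, c, d, e, f = (bool(bits >> i & 1) for i in range(6))
    assert (a and b or c and d or e and f) == (
        (a and b) or (c and d) or (e and f)
    )
```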

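The function being reformatted here, yield_bunsetu, walks a parsed Doc token by token, using POS_PHRASE_MAP to map coarse POS tags to phrase types while accumulating tokens into bunsetu (Japanese phrasal units). A hypothetical usage sketch; the import path and pipeline name are assumptions for illustration, and a Japanese pipeline with a dependency parser is required for the dep_/head attributes the function reads:

```python
import spacy
from spacy.lang.ja.bunsetu import yield_bunsetu  # module path assumed

nlp = spacy.load("ja_core_news_sm")  # assumed; any parsed ja pipeline works
doc = nlp("私は東京に住んでいます。")
# Judging from the bunsetu.append(t) calls above, each yielded bunsetu
# is a list of tokens covering one phrasal unit.
for bunsetu in yield_bunsetu(doc):
    print("".join(t.orth_ for t in bunsetu))
```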
View File

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import NOUN, PROPN, PRON, VERB
 
 # XXX this can probably be pruned a bit
@@ -19,6 +16,7 @@ labels = [
     "ROOT",
 ]
 
+
 def noun_chunks(obj):
     """
     Detect base noun phrases from a dependency parse. Works on both Doc and Span.
@@ -52,4 +50,5 @@ def noun_chunks(obj):
         seen.update(w.i for w in word.head.rights)
         yield unseen[0], word.i + 1, np_label
 
+
 SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
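
The SYNTAX_ITERATORS mapping at the end is how a language module plugs its noun_chunks iterator into Doc.noun_chunks and Span.noun_chunks. A minimal sketch of the consumer side, assuming an English pipeline such as en_core_web_sm is installed (the iterator in this file is language-specific, but the public API is the same):

```python
import spacy

nlp = spacy.load("en_core_web_sm")  # assumed; any pipeline with a parser
doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")
for chunk in doc.noun_chunks:  # backed by the registered syntax iterator
    print(chunk.text, chunk.root.dep_)
```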

View File

@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.

View File

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS
 from ...language import Language

View File

@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.

View File

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...attrs import LIKE_NUM

View File

@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 STOP_WORDS = set(
 """
 അത

View File

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 from ...lemmatizer import Lemmatizer
 from ...parts_of_speech import NAMES

View File

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...attrs import LIKE_NUM

View File

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest

View File

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest

View File

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest

View File

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest

View File

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest

View File

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest

View File

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import pytest
 from spacy.lang.hy.lex_attrs import like_num

View File

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import pytest

View File

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest

View File

@@ -1,7 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import pytest
 from spacy.lang.ja import Japanese
 from ...util import make_tempdir

View File

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest

View File

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest

View File

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 from spacy.lang.sv.lex_attrs import like_num

View File

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 from spacy.lang.zh import Chinese
 from ...util import make_tempdir

View File

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from spacy.lang.en import English

View File

@@ -1,4 +1,3 @@
-# coding: utf8
 import warnings
 from unittest import TestCase
 import pytest

View File

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 from spacy.lang.en import English
 from spacy.lang.en.syntax_iterators import noun_chunks
 from spacy.tests.util import get_doc