Remove unicode declarations and tidy up

2025-08-02 19:30:19 +03:00 · 2020-06-21 22:34:10 +02:00 · 2020-06-21 22:34:10 +02:00 · 40bb918a4c
commit 40bb918a4c
parent 689600e17d
35 changed files with 76 additions and 147 deletions
--- a/spacy/lang/es/punctuation.py
+++ b/spacy/lang/es/punctuation.py
@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
 from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT
 from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
--- a/spacy/lang/gu/__init__.py
+++ b/spacy/lang/gu/__init__.py
@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS

 from ...language import Language
--- a/spacy/lang/gu/examples.py
+++ b/spacy/lang/gu/examples.py
@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.

--- a/spacy/lang/gu/stop_words.py
+++ b/spacy/lang/gu/stop_words.py
@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 STOP_WORDS = set(
    """
 એમ
--- a/spacy/lang/hy/__init__.py
+++ b/spacy/lang/hy/__init__.py
@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .tag_map import TAG_MAP
--- a/spacy/lang/hy/examples.py
+++ b/spacy/lang/hy/examples.py
@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 """
 Example sentences to test spaCy and its language models.
 >>> from spacy.lang.hy.examples import sentences
--- a/spacy/lang/hy/lex_attrs.py
+++ b/spacy/lang/hy/lex_attrs.py
@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...attrs import LIKE_NUM


--- a/spacy/lang/hy/stop_words.py
+++ b/spacy/lang/hy/stop_words.py
@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 STOP_WORDS = set(
    """
 նա
--- a/spacy/lang/hy/tag_map.py
+++ b/spacy/lang/hy/tag_map.py
@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import POS, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN
 from ...symbols import PROPN, PART, INTJ, PRON, SCONJ, AUX, CCONJ

--- a/spacy/lang/ja/bunsetu.py
+++ b/spacy/lang/ja/bunsetu.py
@ -1,21 +1,11 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from .stop_words import STOP_WORDS
-
-
 POS_PHRASE_MAP = {
    "NOUN": "NP",
    "NUM": "NP",
    "PRON": "NP",
    "PROPN": "NP",
-
    "VERB": "VP",
-
    "ADJ": "ADJP",
-
    "ADV": "ADVP",
-
    "CCONJ": "CCONJP",
 }

@ -37,7 +27,18 @@ def yield_bunsetu(doc, debug=False):
        dep = t.dep_
        head = t.head.i
        if debug:
-            print(t.i, t.orth_, pos, pos_type, dep, head, bunsetu_may_end, phrase_type, phrase, bunsetu)
+            print(
+                t.i,
+                t.orth_,
+                pos,
+                pos_type,
+                dep,
+                head,
+                bunsetu_may_end,
+                phrase_type,
+                phrase,
+                bunsetu,
+            )

        # DET is always an individual bunsetu
        if pos == "DET":
@ -75,19 +76,31 @@ def yield_bunsetu(doc, debug=False):

        # entering new bunsetu
        elif pos_type and (
-            pos_type != phrase_type or  # different phrase type arises
-            bunsetu_may_end  # same phrase type but bunsetu already ended
+            pos_type != phrase_type  # different phrase type arises
+            or bunsetu_may_end  # same phrase type but bunsetu already ended
        ):
            # exceptional case: NOUN to VERB
-            if phrase_type == "NP" and pos_type == "VP" and prev_dep == 'compound' and prev_head == t.i:
+            if (
+                phrase_type == "NP"
+                and pos_type == "VP"
+                and prev_dep == "compound"
+                and prev_head == t.i
+            ):
                bunsetu.append(t)
                phrase_type = "VP"
                phrase.append(t)
            # exceptional case: VERB to NOUN
-            elif phrase_type == "VP" and pos_type == "NP" and (
-                    prev_dep == 'compound' and prev_head == t.i or
-                    dep == 'compound' and prev == head or
-                    prev_dep == 'nmod' and prev_head == t.i
+            elif (
+                phrase_type == "VP"
+                and pos_type == "NP"
+                and (
+                    prev_dep == "compound"
+                    and prev_head == t.i
+                    or dep == "compound"
+                    and prev == head
+                    or prev_dep == "nmod"
+                    and prev_head == t.i
+                )
            ):
                bunsetu.append(t)
                phrase_type = "NP"
@ -102,11 +115,18 @@ def yield_bunsetu(doc, debug=False):
        # NOUN bunsetu
        elif phrase_type == "NP":
            bunsetu.append(t)
-            if not bunsetu_may_end and ((
-                (pos_type == "NP" or pos == "SYM") and (prev_head == t.i or prev_head == head) and prev_dep in {'compound', 'nummod'}
-            ) or (
-                pos == "PART" and (prev == head or prev_head == head) and dep == 'mark'
-            )):
+            if not bunsetu_may_end and (
+                (
+                    (pos_type == "NP" or pos == "SYM")
+                    and (prev_head == t.i or prev_head == head)
+                    and prev_dep in {"compound", "nummod"}
+                )
+                or (
+                    pos == "PART"
+                    and (prev == head or prev_head == head)
+                    and dep == "mark"
+                )
+            ):
                phrase.append(t)
            else:
                bunsetu_may_end = True
@ -114,19 +134,31 @@ def yield_bunsetu(doc, debug=False):
        # VERB bunsetu
        elif phrase_type == "VP":
            bunsetu.append(t)
-            if not bunsetu_may_end and pos == "VERB" and prev_head == t.i and prev_dep == 'compound':
+            if (
+                not bunsetu_may_end
+                and pos == "VERB"
+                and prev_head == t.i
+                and prev_dep == "compound"
+            ):
                phrase.append(t)
            else:
                bunsetu_may_end = True

        # ADJ bunsetu
-        elif phrase_type == "ADJP" and tag != '連体詞':
+        elif phrase_type == "ADJP" and tag != "連体詞":
            bunsetu.append(t)
-            if not bunsetu_may_end and ((
-                pos == "NOUN" and (prev_head == t.i or prev_head == head) and prev_dep in {'amod', 'compound'}
-            ) or (
-                pos == "PART" and (prev == head or prev_head == head) and dep == 'mark'
-            )):
+            if not bunsetu_may_end and (
+                (
+                    pos == "NOUN"
+                    and (prev_head == t.i or prev_head == head)
+                    and prev_dep in {"amod", "compound"}
+                )
+                or (
+                    pos == "PART"
+                    and (prev == head or prev_head == head)
+                    and dep == "mark"
+                )
+            ):
                phrase.append(t)
            else:
                bunsetu_may_end = True
--- a/spacy/lang/ja/syntax_iterators.py
+++ b/spacy/lang/ja/syntax_iterators.py
@ -1,24 +1,22 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import NOUN, PROPN, PRON, VERB

 # XXX this can probably be pruned a bit
 labels = [
-        "nsubj",
-        "nmod",
-        "dobj",
-        "nsubjpass",
-        "pcomp",
-        "pobj",
-        "obj",
-        "obl",
-        "dative",
-        "appos",
-        "attr",
-        "ROOT",
+    "nsubj",
+    "nmod",
+    "dobj",
+    "nsubjpass",
+    "pcomp",
+    "pobj",
+    "obj",
+    "obl",
+    "dative",
+    "appos",
+    "attr",
+    "ROOT",
 ]

+
 def noun_chunks(obj):
    """
    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
@ -52,4 +50,5 @@ def noun_chunks(obj):
                seen.update(w.i for w in word.head.rights)
            yield unseen[0], word.i + 1, np_label

+
 SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
--- a/spacy/lang/kn/examples.py
+++ b/spacy/lang/kn/examples.py
@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.

--- a/spacy/lang/ml/__init__.py
+++ b/spacy/lang/ml/__init__.py
@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS

 from ...language import Language
--- a/spacy/lang/ml/examples.py
+++ b/spacy/lang/ml/examples.py
@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.

--- a/spacy/lang/ml/lex_attrs.py
+++ b/spacy/lang/ml/lex_attrs.py
@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...attrs import LIKE_NUM


--- a/spacy/lang/ml/stop_words.py
+++ b/spacy/lang/ml/stop_words.py
@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 STOP_WORDS = set(
    """
 അത്
--- a/spacy/lang/pl/lemmatizer.py
+++ b/spacy/lang/pl/lemmatizer.py
@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 from ...lemmatizer import Lemmatizer
 from ...parts_of_speech import NAMES

--- a/spacy/lang/sv/lex_attrs.py
+++ b/spacy/lang/sv/lex_attrs.py
@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...attrs import LIKE_NUM


--- a/spacy/tests/lang/de/test_noun_chunks.py
+++ b/spacy/tests/lang/de/test_noun_chunks.py
@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest


--- a/spacy/tests/lang/el/test_noun_chunks.py
+++ b/spacy/tests/lang/el/test_noun_chunks.py
@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest


--- a/spacy/tests/lang/es/test_noun_chunks.py
+++ b/spacy/tests/lang/es/test_noun_chunks.py
@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest


--- a/spacy/tests/lang/fa/test_noun_chunks.py
+++ b/spacy/tests/lang/fa/test_noun_chunks.py
@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest


--- a/spacy/tests/lang/fr/test_noun_chunks.py
+++ b/spacy/tests/lang/fr/test_noun_chunks.py
@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest


--- a/spacy/tests/lang/gu/test_text.py
+++ b/spacy/tests/lang/gu/test_text.py
@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest


--- a/spacy/tests/lang/hy/test_text.py
+++ b/spacy/tests/lang/hy/test_text.py
@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import pytest
 from spacy.lang.hy.lex_attrs import like_num

--- a/spacy/tests/lang/hy/test_tokenizer.py
+++ b/spacy/tests/lang/hy/test_tokenizer.py
@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import pytest


--- a/spacy/tests/lang/id/test_noun_chunks.py
+++ b/spacy/tests/lang/id/test_noun_chunks.py
@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest


--- a/spacy/tests/lang/ja/test_serialize.py
+++ b/spacy/tests/lang/ja/test_serialize.py
@ -1,7 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import pytest
 from spacy.lang.ja import Japanese
 from ...util import make_tempdir

--- a/spacy/tests/lang/ml/test_text.py
+++ b/spacy/tests/lang/ml/test_text.py
@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest


--- a/spacy/tests/lang/nb/test_noun_chunks.py
+++ b/spacy/tests/lang/nb/test_noun_chunks.py
@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest


--- a/spacy/tests/lang/sv/test_lex_attrs.py
+++ b/spacy/tests/lang/sv/test_lex_attrs.py
@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 from spacy.lang.sv.lex_attrs import like_num

--- a/spacy/tests/lang/zh/test_serialize.py
+++ b/spacy/tests/lang/zh/test_serialize.py
@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 from spacy.lang.zh import Chinese
 from ...util import make_tempdir
--- a/spacy/tests/regression/test_issue5152.py
+++ b/spacy/tests/regression/test_issue5152.py
@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from spacy.lang.en import English


--- a/spacy/tests/regression/test_issue5230.py
+++ b/spacy/tests/regression/test_issue5230.py
@ -1,4 +1,3 @@
-# coding: utf8
 import warnings
 from unittest import TestCase
 import pytest
--- a/spacy/tests/regression/test_issue5458.py
+++ b/spacy/tests/regression/test_issue5458.py
@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 from spacy.lang.en import English
 from spacy.lang.en.syntax_iterators import noun_chunks
 from spacy.tests.util import get_doc