Remove unicode declarations and tidy up

Ines Montani 2020-06-21 22:34:10 +02:00
parent 689600e17d
commit 40bb918a4c
35 changed files with 76 additions and 147 deletions
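
For context: both removed lines are Python 2 compatibility boilerplate. Python 3 already treats source files as UTF-8 (PEP 3120) and makes string literals unicode by default, so once Python 2 support is dropped the header does nothing. A minimal before/after sketch (file contents are illustrative, not taken from this diff):

```python
# Before (Python 2 compatible): the encoding comment and the
# __future__ import were needed for "héllo" to be a unicode string.
# coding: utf8
from __future__ import unicode_literals

greeting = "héllo"

# After (Python 3 only): identical behavior with no boilerplate;
# str is unicode and the source encoding defaults to UTF-8.
greeting = "héllo"
```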

View File

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
 from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT
 from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA

View File

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS
 from ...language import Language

View File

@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.

View File

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 STOP_WORDS = set(
 """
 એમ

View File

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .tag_map import TAG_MAP

View File

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 """
 Example sentences to test spaCy and its language models.
 >>> from spacy.lang.hy.examples import sentences

View File

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...attrs import LIKE_NUM

View File

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 STOP_WORDS = set(
 """
 նա

View File

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import POS, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN
 from ...symbols import PROPN, PART, INTJ, PRON, SCONJ, AUX, CCONJ

View File

@@ -1,21 +1,11 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from .stop_words import STOP_WORDS
 POS_PHRASE_MAP = {
     "NOUN": "NP",
     "NUM": "NP",
     "PRON": "NP",
     "PROPN": "NP",
     "VERB": "VP",
     "ADJ": "ADJP",
     "ADV": "ADVP",
     "CCONJ": "CCONJP",
 }
@@ -37,7 +27,18 @@ def yield_bunsetu(doc, debug=False):
         dep = t.dep_
         head = t.head.i
         if debug:
-            print(t.i, t.orth_, pos, pos_type, dep, head, bunsetu_may_end, phrase_type, phrase, bunsetu)
+            print(
+                t.i,
+                t.orth_,
+                pos,
+                pos_type,
+                dep,
+                head,
+                bunsetu_may_end,
+                phrase_type,
+                phrase,
+                bunsetu,
+            )
 
         # DET is always an individual bunsetu
         if pos == "DET":
@@ -75,19 +76,31 @@ def yield_bunsetu(doc, debug=False):
         # entering new bunsetu
         elif pos_type and (
-            pos_type != phrase_type or  # different phrase type arises
-            bunsetu_may_end  # same phrase type but bunsetu already ended
+            pos_type != phrase_type
+            or bunsetu_may_end  # different phrase type arises  # same phrase type but bunsetu already ended
         ):
             # exceptional case: NOUN to VERB
-            if phrase_type == "NP" and pos_type == "VP" and prev_dep == 'compound' and prev_head == t.i:
+            if (
+                phrase_type == "NP"
+                and pos_type == "VP"
+                and prev_dep == "compound"
+                and prev_head == t.i
+            ):
                 bunsetu.append(t)
                 phrase_type = "VP"
                 phrase.append(t)
             # exceptional case: VERB to NOUN
-            elif phrase_type == "VP" and pos_type == "NP" and (
-                prev_dep == 'compound' and prev_head == t.i or
-                dep == 'compound' and prev == head or
-                prev_dep == 'nmod' and prev_head == t.i
+            elif (
+                phrase_type == "VP"
+                and pos_type == "NP"
+                and (
+                    prev_dep == "compound"
+                    and prev_head == t.i
+                    or dep == "compound"
+                    and prev == head
+                    or prev_dep == "nmod"
+                    and prev_head == t.i
+                )
             ):
                 bunsetu.append(t)
                 phrase_type = "NP"
@@ -102,11 +115,18 @@ def yield_bunsetu(doc, debug=False):
         # NOUN bunsetu
         elif phrase_type == "NP":
             bunsetu.append(t)
-            if not bunsetu_may_end and ((
-                (pos_type == "NP" or pos == "SYM") and (prev_head == t.i or prev_head == head) and prev_dep in {'compound', 'nummod'}
-            ) or (
-                pos == "PART" and (prev == head or prev_head == head) and dep == 'mark'
-            )):
+            if not bunsetu_may_end and (
+                (
+                    (pos_type == "NP" or pos == "SYM")
+                    and (prev_head == t.i or prev_head == head)
+                    and prev_dep in {"compound", "nummod"}
+                )
+                or (
+                    pos == "PART"
+                    and (prev == head or prev_head == head)
+                    and dep == "mark"
+                )
+            ):
                 phrase.append(t)
             else:
                 bunsetu_may_end = True
@@ -114,19 +134,31 @@ def yield_bunsetu(doc, debug=False):
         # VERB bunsetu
         elif phrase_type == "VP":
             bunsetu.append(t)
-            if not bunsetu_may_end and pos == "VERB" and prev_head == t.i and prev_dep == 'compound':
+            if (
+                not bunsetu_may_end
+                and pos == "VERB"
+                and prev_head == t.i
+                and prev_dep == "compound"
+            ):
                 phrase.append(t)
             else:
                 bunsetu_may_end = True
         # ADJ bunsetu
-        elif phrase_type == "ADJP" and tag != '連体詞':
+        elif phrase_type == "ADJP" and tag != "連体詞":
             bunsetu.append(t)
-            if not bunsetu_may_end and ((
-                pos == "NOUN" and (prev_head == t.i or prev_head == head) and prev_dep in {'amod', 'compound'}
-            ) or (
-                pos == "PART" and (prev == head or prev_head == head) and dep == 'mark'
-            )):
+            if not bunsetu_may_end and (
+                (
+                    pos == "NOUN"
+                    and (prev_head == t.i or prev_head == head)
+                    and prev_dep in {"amod", "compound"}
+                )
+                or (
+                    pos == "PART"
+                    and (prev == head or prev_head == head)
+                    and dep == "mark"
+                )
+            ):
                 phrase.append(t)
             else:
                 bunsetu_may_end = True
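
A note on reading the reflowed conditions above: this is a pure formatting pass (Black-style wrapping and double quotes), so the logic still relies on `and` binding tighter than `or`, and the multi-line chains group exactly as the old one-liners did, i.e. as (A and B) or (C and D) or (E and F). A quick self-contained check with stand-in booleans:

```python
# `and` binds tighter than `or`, so the unparenthesized chain groups as
# (a and b) or (c and d) or (e and f). Stand-in flags, exhaustively checked.
for bits in range(64):
    a, b, c, d, e, f = (bool(bits >> i & 1) for i in range(6))
    assert (a and b or c and d or e and f) == (
        (a and b) or (c and d) or (e and f)
    )
```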

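The function being reformatted here, yield_bunsetu, walks a parsed Doc token by token, using POS_PHRASE_MAP to map coarse POS tags to phrase types while accumulating tokens into bunsetu (Japanese phrasal units). A hypothetical usage sketch; the import path and pipeline name are assumptions for illustration, and a Japanese pipeline with a dependency parser is required for the dep_/head attributes the function reads:

```python
import spacy
from spacy.lang.ja.bunsetu import yield_bunsetu  # module path assumed

nlp = spacy.load("ja_core_news_sm")  # assumed; any parsed ja pipeline works
doc = nlp("私は東京に住んでいます。")
# Judging from the bunsetu.append(t) calls above, each yielded bunsetu
# is a list of tokens covering one phrasal unit.
for bunsetu in yield_bunsetu(doc):
    print("".join(t.orth_ for t in bunsetu))
```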
View File

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import NOUN, PROPN, PRON, VERB
 
 # XXX this can probably be pruned a bit
@@ -19,6 +16,7 @@ labels = [
     "ROOT",
 ]
 
+
 def noun_chunks(obj):
     """
     Detect base noun phrases from a dependency parse. Works on both Doc and Span.
@@ -52,4 +50,5 @@ def noun_chunks(obj):
         seen.update(w.i for w in word.head.rights)
         yield unseen[0], word.i + 1, np_label
 
+
 SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
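
The SYNTAX_ITERATORS mapping at the end is how a language module plugs its noun_chunks iterator into Doc.noun_chunks and Span.noun_chunks. A minimal sketch of the consumer side, assuming an English pipeline such as en_core_web_sm is installed (the iterator in this file is language-specific, but the public API is the same):

```python
import spacy

nlp = spacy.load("en_core_web_sm")  # assumed; any pipeline with a parser
doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")
for chunk in doc.noun_chunks:  # backed by the registered syntax iterator
    print(chunk.text, chunk.root.dep_)
```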

View File

@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.

View File

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS
 from ...language import Language

View File

@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.

View File

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...attrs import LIKE_NUM

View File

@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 STOP_WORDS = set(
 """
 അത

View File

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 from ...lemmatizer import Lemmatizer
 from ...parts_of_speech import NAMES

View File

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...attrs import LIKE_NUM

View File

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest

View File

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest

View File

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest

View File

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest

View File

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest

View File

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest

View File

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import pytest
 from spacy.lang.hy.lex_attrs import like_num

View File

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import pytest

View File

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest

View File

@@ -1,7 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import pytest
 from spacy.lang.ja import Japanese
 from ...util import make_tempdir

View File

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest

View File

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest

View File

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 from spacy.lang.sv.lex_attrs import like_num

View File

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 from spacy.lang.zh import Chinese
 from ...util import make_tempdir

View File

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from spacy.lang.en import English

View File

@@ -1,4 +1,3 @@
-# coding: utf8
 import warnings
 from unittest import TestCase
 import pytest

View File

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 from spacy.lang.en import English
 from spacy.lang.en.syntax_iterators import noun_chunks
 from spacy.tests.util import get_doc