Remove unicode declarations and tidy up

This commit is contained in:
Ines Montani 2020-06-21 22:34:10 +02:00
parent 689600e17d
commit 40bb918a4c
35 changed files with 76 additions and 147 deletions

View File

@@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA

View File

@@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from ...language import Language

View File

@@ -1,7 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.

View File

@@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
STOP_WORDS = set(
"""
એમ

View File

@@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .tag_map import TAG_MAP

View File

@@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.hy.examples import sentences

View File

@@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from ...attrs import LIKE_NUM

View File

@@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
STOP_WORDS = set(
"""
նա

View File

@@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import POS, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN
from ...symbols import PROPN, PART, INTJ, PRON, SCONJ, AUX, CCONJ

View File

@@ -1,21 +1,11 @@
# coding: utf8
from __future__ import unicode_literals
from .stop_words import STOP_WORDS
POS_PHRASE_MAP = {
"NOUN": "NP",
"NUM": "NP",
"PRON": "NP",
"PROPN": "NP",
"VERB": "VP",
"ADJ": "ADJP",
"ADV": "ADVP",
"CCONJ": "CCONJP",
}
@@ -37,7 +27,18 @@ def yield_bunsetu(doc, debug=False):
dep = t.dep_
head = t.head.i
if debug:
print(t.i, t.orth_, pos, pos_type, dep, head, bunsetu_may_end, phrase_type, phrase, bunsetu)
print(
t.i,
t.orth_,
pos,
pos_type,
dep,
head,
bunsetu_may_end,
phrase_type,
phrase,
bunsetu,
)
# DET is always an individual bunsetu
if pos == "DET":
@@ -75,19 +76,31 @@ def yield_bunsetu(doc, debug=False):
# entering new bunsetu
elif pos_type and (
pos_type != phrase_type or # different phrase type arises
bunsetu_may_end # same phrase type but bunsetu already ended
pos_type != phrase_type
or bunsetu_may_end # different phrase type arises # same phrase type but bunsetu already ended
):
# exceptional case: NOUN to VERB
if phrase_type == "NP" and pos_type == "VP" and prev_dep == 'compound' and prev_head == t.i:
if (
phrase_type == "NP"
and pos_type == "VP"
and prev_dep == "compound"
and prev_head == t.i
):
bunsetu.append(t)
phrase_type = "VP"
phrase.append(t)
# exceptional case: VERB to NOUN
elif phrase_type == "VP" and pos_type == "NP" and (
prev_dep == 'compound' and prev_head == t.i or
dep == 'compound' and prev == head or
prev_dep == 'nmod' and prev_head == t.i
elif (
phrase_type == "VP"
and pos_type == "NP"
and (
prev_dep == "compound"
and prev_head == t.i
or dep == "compound"
and prev == head
or prev_dep == "nmod"
and prev_head == t.i
)
):
bunsetu.append(t)
phrase_type = "NP"
@@ -102,11 +115,18 @@ def yield_bunsetu(doc, debug=False):
# NOUN bunsetu
elif phrase_type == "NP":
bunsetu.append(t)
if not bunsetu_may_end and ((
(pos_type == "NP" or pos == "SYM") and (prev_head == t.i or prev_head == head) and prev_dep in {'compound', 'nummod'}
) or (
pos == "PART" and (prev == head or prev_head == head) and dep == 'mark'
)):
if not bunsetu_may_end and (
(
(pos_type == "NP" or pos == "SYM")
and (prev_head == t.i or prev_head == head)
and prev_dep in {"compound", "nummod"}
)
or (
pos == "PART"
and (prev == head or prev_head == head)
and dep == "mark"
)
):
phrase.append(t)
else:
bunsetu_may_end = True
@@ -114,19 +134,31 @@ def yield_bunsetu(doc, debug=False):
# VERB bunsetu
elif phrase_type == "VP":
bunsetu.append(t)
if not bunsetu_may_end and pos == "VERB" and prev_head == t.i and prev_dep == 'compound':
if (
not bunsetu_may_end
and pos == "VERB"
and prev_head == t.i
and prev_dep == "compound"
):
phrase.append(t)
else:
bunsetu_may_end = True
# ADJ bunsetu
elif phrase_type == "ADJP" and tag != '連体詞':
elif phrase_type == "ADJP" and tag != "連体詞":
bunsetu.append(t)
if not bunsetu_may_end and ((
pos == "NOUN" and (prev_head == t.i or prev_head == head) and prev_dep in {'amod', 'compound'}
) or (
pos == "PART" and (prev == head or prev_head == head) and dep == 'mark'
)):
if not bunsetu_may_end and (
(
pos == "NOUN"
and (prev_head == t.i or prev_head == head)
and prev_dep in {"amod", "compound"}
)
or (
pos == "PART"
and (prev == head or prev_head == head)
and dep == "mark"
)
):
phrase.append(t)
else:
bunsetu_may_end = True

View File

@@ -1,24 +1,22 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import NOUN, PROPN, PRON, VERB
# XXX this can probably be pruned a bit
labels = [
"nsubj",
"nmod",
"dobj",
"nsubjpass",
"pcomp",
"pobj",
"obj",
"obl",
"dative",
"appos",
"attr",
"ROOT",
"nsubj",
"nmod",
"dobj",
"nsubjpass",
"pcomp",
"pobj",
"obj",
"obl",
"dative",
"appos",
"attr",
"ROOT",
]
def noun_chunks(obj):
"""
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
@@ -52,4 +50,5 @@ def noun_chunks(obj):
seen.update(w.i for w in word.head.rights)
yield unseen[0], word.i + 1, np_label
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}

View File

@@ -1,7 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.

View File

@@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from .stop_words import STOP_WORDS
from ...language import Language

View File

@@ -1,7 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.

View File

@@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from ...attrs import LIKE_NUM

View File

@@ -1,7 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
STOP_WORDS = set(
"""
അത

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
from ...lemmatizer import Lemmatizer
from ...parts_of_speech import NAMES

View File

@@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from ...attrs import LIKE_NUM

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest

View File

@@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
import pytest
from spacy.lang.hy.lex_attrs import like_num

View File

@@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
import pytest

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest

View File

@@ -1,7 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
from spacy.lang.ja import Japanese
from ...util import make_tempdir

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
from spacy.lang.sv.lex_attrs import like_num

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
from spacy.lang.zh import Chinese
from ...util import make_tempdir

View File

@@ -1,6 +1,3 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.lang.en import English

View File

@@ -1,4 +1,3 @@
# coding: utf8
import warnings
from unittest import TestCase
import pytest

View File

@@ -1,6 +1,3 @@
# coding: utf-8
from __future__ import unicode_literals
from spacy.lang.en import English
from spacy.lang.en.syntax_iterators import noun_chunks
from spacy.tests.util import get_doc