mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
Remove unicode declarations and tidy up
This commit is contained in:
parent
689600e17d
commit
40bb918a4c
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
|
||||
from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT
|
||||
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
|
||||
from ...language import Language
|
||||
|
|
|
@ -1,7 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
એમ
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .tag_map import TAG_MAP
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
>>> from spacy.lang.hy.examples import sentences
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...attrs import LIKE_NUM
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
նա
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...symbols import POS, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN
|
||||
from ...symbols import PROPN, PART, INTJ, PRON, SCONJ, AUX, CCONJ
|
||||
|
||||
|
|
|
@ -1,21 +1,11 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
|
||||
|
||||
POS_PHRASE_MAP = {
|
||||
"NOUN": "NP",
|
||||
"NUM": "NP",
|
||||
"PRON": "NP",
|
||||
"PROPN": "NP",
|
||||
|
||||
"VERB": "VP",
|
||||
|
||||
"ADJ": "ADJP",
|
||||
|
||||
"ADV": "ADVP",
|
||||
|
||||
"CCONJ": "CCONJP",
|
||||
}
|
||||
|
||||
|
@ -37,7 +27,18 @@ def yield_bunsetu(doc, debug=False):
|
|||
dep = t.dep_
|
||||
head = t.head.i
|
||||
if debug:
|
||||
print(t.i, t.orth_, pos, pos_type, dep, head, bunsetu_may_end, phrase_type, phrase, bunsetu)
|
||||
print(
|
||||
t.i,
|
||||
t.orth_,
|
||||
pos,
|
||||
pos_type,
|
||||
dep,
|
||||
head,
|
||||
bunsetu_may_end,
|
||||
phrase_type,
|
||||
phrase,
|
||||
bunsetu,
|
||||
)
|
||||
|
||||
# DET is always an individual bunsetu
|
||||
if pos == "DET":
|
||||
|
@ -75,19 +76,31 @@ def yield_bunsetu(doc, debug=False):
|
|||
|
||||
# entering new bunsetu
|
||||
elif pos_type and (
|
||||
pos_type != phrase_type or # different phrase type arises
|
||||
bunsetu_may_end # same phrase type but bunsetu already ended
|
||||
pos_type != phrase_type  # different phrase type arises
|
||||
or bunsetu_may_end  # same phrase type but bunsetu already ended
|
||||
):
|
||||
# exceptional case: NOUN to VERB
|
||||
if phrase_type == "NP" and pos_type == "VP" and prev_dep == 'compound' and prev_head == t.i:
|
||||
if (
|
||||
phrase_type == "NP"
|
||||
and pos_type == "VP"
|
||||
and prev_dep == "compound"
|
||||
and prev_head == t.i
|
||||
):
|
||||
bunsetu.append(t)
|
||||
phrase_type = "VP"
|
||||
phrase.append(t)
|
||||
# exceptional case: VERB to NOUN
|
||||
elif phrase_type == "VP" and pos_type == "NP" and (
|
||||
prev_dep == 'compound' and prev_head == t.i or
|
||||
dep == 'compound' and prev == head or
|
||||
prev_dep == 'nmod' and prev_head == t.i
|
||||
elif (
|
||||
phrase_type == "VP"
|
||||
and pos_type == "NP"
|
||||
and (
|
||||
prev_dep == "compound"
|
||||
and prev_head == t.i
|
||||
or dep == "compound"
|
||||
and prev == head
|
||||
or prev_dep == "nmod"
|
||||
and prev_head == t.i
|
||||
)
|
||||
):
|
||||
bunsetu.append(t)
|
||||
phrase_type = "NP"
|
||||
|
@ -102,11 +115,18 @@ def yield_bunsetu(doc, debug=False):
|
|||
# NOUN bunsetu
|
||||
elif phrase_type == "NP":
|
||||
bunsetu.append(t)
|
||||
if not bunsetu_may_end and ((
|
||||
(pos_type == "NP" or pos == "SYM") and (prev_head == t.i or prev_head == head) and prev_dep in {'compound', 'nummod'}
|
||||
) or (
|
||||
pos == "PART" and (prev == head or prev_head == head) and dep == 'mark'
|
||||
)):
|
||||
if not bunsetu_may_end and (
|
||||
(
|
||||
(pos_type == "NP" or pos == "SYM")
|
||||
and (prev_head == t.i or prev_head == head)
|
||||
and prev_dep in {"compound", "nummod"}
|
||||
)
|
||||
or (
|
||||
pos == "PART"
|
||||
and (prev == head or prev_head == head)
|
||||
and dep == "mark"
|
||||
)
|
||||
):
|
||||
phrase.append(t)
|
||||
else:
|
||||
bunsetu_may_end = True
|
||||
|
@ -114,19 +134,31 @@ def yield_bunsetu(doc, debug=False):
|
|||
# VERB bunsetu
|
||||
elif phrase_type == "VP":
|
||||
bunsetu.append(t)
|
||||
if not bunsetu_may_end and pos == "VERB" and prev_head == t.i and prev_dep == 'compound':
|
||||
if (
|
||||
not bunsetu_may_end
|
||||
and pos == "VERB"
|
||||
and prev_head == t.i
|
||||
and prev_dep == "compound"
|
||||
):
|
||||
phrase.append(t)
|
||||
else:
|
||||
bunsetu_may_end = True
|
||||
|
||||
# ADJ bunsetu
|
||||
elif phrase_type == "ADJP" and tag != '連体詞':
|
||||
elif phrase_type == "ADJP" and tag != "連体詞":
|
||||
bunsetu.append(t)
|
||||
if not bunsetu_may_end and ((
|
||||
pos == "NOUN" and (prev_head == t.i or prev_head == head) and prev_dep in {'amod', 'compound'}
|
||||
) or (
|
||||
pos == "PART" and (prev == head or prev_head == head) and dep == 'mark'
|
||||
)):
|
||||
if not bunsetu_may_end and (
|
||||
(
|
||||
pos == "NOUN"
|
||||
and (prev_head == t.i or prev_head == head)
|
||||
and prev_dep in {"amod", "compound"}
|
||||
)
|
||||
or (
|
||||
pos == "PART"
|
||||
and (prev == head or prev_head == head)
|
||||
and dep == "mark"
|
||||
)
|
||||
):
|
||||
phrase.append(t)
|
||||
else:
|
||||
bunsetu_may_end = True
|
||||
|
|
|
@ -1,24 +1,22 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...symbols import NOUN, PROPN, PRON, VERB
|
||||
|
||||
# XXX this can probably be pruned a bit
|
||||
labels = [
|
||||
"nsubj",
|
||||
"nmod",
|
||||
"dobj",
|
||||
"nsubjpass",
|
||||
"pcomp",
|
||||
"pobj",
|
||||
"obj",
|
||||
"obl",
|
||||
"dative",
|
||||
"appos",
|
||||
"attr",
|
||||
"ROOT",
|
||||
"nsubj",
|
||||
"nmod",
|
||||
"dobj",
|
||||
"nsubjpass",
|
||||
"pcomp",
|
||||
"pobj",
|
||||
"obj",
|
||||
"obl",
|
||||
"dative",
|
||||
"appos",
|
||||
"attr",
|
||||
"ROOT",
|
||||
]
|
||||
|
||||
|
||||
def noun_chunks(obj):
|
||||
"""
|
||||
Detect base noun phrases from a dependency parse. Works on both Doc and Span.
|
||||
|
@ -52,4 +50,5 @@ def noun_chunks(obj):
|
|||
seen.update(w.i for w in word.head.rights)
|
||||
yield unseen[0], word.i + 1, np_label
|
||||
|
||||
|
||||
SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
|
||||
|
|
|
@ -1,7 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
|
||||
from ...language import Language
|
||||
|
|
|
@ -1,7 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...attrs import LIKE_NUM
|
||||
|
||||
|
||||
|
|
|
@ -1,7 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
STOP_WORDS = set(
|
||||
"""
|
||||
അത്
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...lemmatizer import Lemmatizer
|
||||
from ...parts_of_speech import NAMES
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...attrs import LIKE_NUM
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
from spacy.lang.hy.lex_attrs import like_num
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
|
|
|
@ -1,7 +1,3 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
from spacy.lang.ja import Japanese
|
||||
from ...util import make_tempdir
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
from spacy.lang.sv.lex_attrs import like_num
|
||||
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
from spacy.lang.zh import Chinese
|
||||
from ...util import make_tempdir
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from spacy.lang.en import English
|
||||
|
||||
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
# coding: utf8
|
||||
import warnings
|
||||
from unittest import TestCase
|
||||
import pytest
|
||||
|
|
|
@ -1,6 +1,3 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from spacy.lang.en import English
|
||||
from spacy.lang.en.syntax_iterators import noun_chunks
|
||||
from spacy.tests.util import get_doc
|
||||
|
|
Loading…
Reference in New Issue
Block a user