Mirror of https://github.com/explosion/spaCy.git (synced 2025-07-11 08:42:28 +03:00)
Remove unicode declarations and tidy up

commit 40bb918a4c
parent 689600e17d
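The change is mechanical and repeated across every touched file: the Python 2 compatibility header is deleted. In Python 3, UTF-8 is already the default source encoding and every string literal is unicode, so both lines are no-ops. A before/after sketch of the typical module header (illustrative, not any single file from this commit):

    # Before: Python 2 era header
    # coding: utf8
    from __future__ import unicode_literals

    from .stop_words import STOP_WORDS

    # After: Python 3 only
    from .stop_words import STOP_WORDS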
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
 from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT
 from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
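The hunk above keeps the shared character-class imports. As a rough sketch of how such lists are typically combined (an assumed composition; the file's actual rules lie outside the hunk), a language builds its tokenizer punctuation patterns by concatenating them:

    from spacy.lang.char_classes import (
        LIST_PUNCT,
        LIST_ELLIPSES,
        LIST_QUOTES,
        LIST_ICONS,
    )

    # Each list holds regex fragments; concatenation yields suffix patterns.
    TOKENIZER_SUFFIXES = LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + LIST_ICONS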
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS
 
 from ...language import Language
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 STOP_WORDS = set(
 """
 એમ
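The stop-word modules define a plain Python set parsed from a triple-quoted block, so membership checks need no pipeline. A quick sketch (the Gujarati entry એમ suggests this is spacy.lang.gu, but the rendering omits file names, so the import path is an assumption):

    from spacy.lang.gu.stop_words import STOP_WORDS  # assumed path

    print("એમ" in STOP_WORDS)  # True: entries are whitespace-split from the block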
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .tag_map import TAG_MAP
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 """
 Example sentences to test spaCy and its language models.
 >>> from spacy.lang.hy.examples import sentences
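The doctest line kept in the docstring above shows the intended use of the examples module; spelled out as a runnable snippet:

    from spacy.lang.hy.examples import sentences

    # `sentences` is a plain list of raw Armenian strings for smoke-testing
    # the tokenizer and any trained models.
    print(sentences[0])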
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...attrs import LIKE_NUM
 
 
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 STOP_WORDS = set(
 """
 նա
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import POS, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN
 from ...symbols import PROPN, PART, INTJ, PRON, SCONJ, AUX, CCONJ
 
@@ -1,21 +1,11 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from .stop_words import STOP_WORDS
-
-
 POS_PHRASE_MAP = {
     "NOUN": "NP",
     "NUM": "NP",
     "PRON": "NP",
     "PROPN": "NP",
-
     "VERB": "VP",
-
     "ADJ": "ADJP",
-
     "ADV": "ADVP",
-
     "CCONJ": "CCONJP",
 }
 
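POS_PHRASE_MAP drives the bunsetu (Japanese phrase chunk) detection in the function below: each universal POS tag maps to the phrase type it can open, and tags missing from the map open none. A minimal lookup sketch (`phrase_type_for` is a hypothetical helper, not part of the file):

    POS_PHRASE_MAP = {
        "NOUN": "NP", "NUM": "NP", "PRON": "NP", "PROPN": "NP",
        "VERB": "VP", "ADJ": "ADJP", "ADV": "ADVP", "CCONJ": "CCONJP",
    }

    def phrase_type_for(pos):
        # e.g. "NOUN" -> "NP"; "ADP" -> None (starts no phrase)
        return POS_PHRASE_MAP.get(pos)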
@@ -37,7 +27,18 @@ def yield_bunsetu(doc, debug=False):
         dep = t.dep_
         head = t.head.i
         if debug:
-            print(t.i, t.orth_, pos, pos_type, dep, head, bunsetu_may_end, phrase_type, phrase, bunsetu)
+            print(
+                t.i,
+                t.orth_,
+                pos,
+                pos_type,
+                dep,
+                head,
+                bunsetu_may_end,
+                phrase_type,
+                phrase,
+                bunsetu,
+            )
 
         # DET is always an individual bunsetu
         if pos == "DET":
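The remaining hunks in this file are pure reformatting: one-line calls and conditions are exploded into one element per line with trailing commas, and single quotes become double quotes, with no change in behaviour. The style matches the output of the black formatter; assuming that tool was used (the commit message does not name it), the result can be reproduced and verified with:

    black spacy/          # rewrite files in place
    black --check spacy/  # exit non-zero if anything would still change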
@@ -75,19 +76,31 @@ def yield_bunsetu(doc, debug=False):
 
         # entering new bunsetu
         elif pos_type and (
-            pos_type != phrase_type or # different phrase type arises
-            bunsetu_may_end # same phrase type but bunsetu already ended
+            pos_type != phrase_type
+            or bunsetu_may_end  # different phrase type arises  # same phrase type but bunsetu already ended
         ):
             # exceptional case: NOUN to VERB
-            if phrase_type == "NP" and pos_type == "VP" and prev_dep == 'compound' and prev_head == t.i:
+            if (
+                phrase_type == "NP"
+                and pos_type == "VP"
+                and prev_dep == "compound"
+                and prev_head == t.i
+            ):
                 bunsetu.append(t)
                 phrase_type = "VP"
                 phrase.append(t)
             # exceptional case: VERB to NOUN
-            elif phrase_type == "VP" and pos_type == "NP" and (
-                prev_dep == 'compound' and prev_head == t.i or
-                dep == 'compound' and prev == head or
-                prev_dep == 'nmod' and prev_head == t.i
+            elif (
+                phrase_type == "VP"
+                and pos_type == "NP"
+                and (
+                    prev_dep == "compound"
+                    and prev_head == t.i
+                    or dep == "compound"
+                    and prev == head
+                    or prev_dep == "nmod"
+                    and prev_head == t.i
+                )
             ):
                 bunsetu.append(t)
                 phrase_type = "NP"
@@ -102,11 +115,18 @@ def yield_bunsetu(doc, debug=False):
         # NOUN bunsetu
         elif phrase_type == "NP":
             bunsetu.append(t)
-            if not bunsetu_may_end and ((
-                (pos_type == "NP" or pos == "SYM") and (prev_head == t.i or prev_head == head) and prev_dep in {'compound', 'nummod'}
-            ) or (
-                pos == "PART" and (prev == head or prev_head == head) and dep == 'mark'
-            )):
+            if not bunsetu_may_end and (
+                (
+                    (pos_type == "NP" or pos == "SYM")
+                    and (prev_head == t.i or prev_head == head)
+                    and prev_dep in {"compound", "nummod"}
+                )
+                or (
+                    pos == "PART"
+                    and (prev == head or prev_head == head)
+                    and dep == "mark"
+                )
+            ):
                 phrase.append(t)
             else:
                 bunsetu_may_end = True
@@ -114,19 +134,31 @@ def yield_bunsetu(doc, debug=False):
         # VERB bunsetu
         elif phrase_type == "VP":
             bunsetu.append(t)
-            if not bunsetu_may_end and pos == "VERB" and prev_head == t.i and prev_dep == 'compound':
+            if (
+                not bunsetu_may_end
+                and pos == "VERB"
+                and prev_head == t.i
+                and prev_dep == "compound"
+            ):
                 phrase.append(t)
             else:
                 bunsetu_may_end = True
 
         # ADJ bunsetu
-        elif phrase_type == "ADJP" and tag != '連体詞':
+        elif phrase_type == "ADJP" and tag != "連体詞":
             bunsetu.append(t)
-            if not bunsetu_may_end and ((
-                pos == "NOUN" and (prev_head == t.i or prev_head == head) and prev_dep in {'amod', 'compound'}
-            ) or (
-                pos == "PART" and (prev == head or prev_head == head) and dep == 'mark'
-            )):
+            if not bunsetu_may_end and (
+                (
+                    pos == "NOUN"
+                    and (prev_head == t.i or prev_head == head)
+                    and prev_dep in {"amod", "compound"}
+                )
+                or (
+                    pos == "PART"
+                    and (prev == head or prev_head == head)
+                    and dep == "mark"
+                )
+            ):
                 phrase.append(t)
             else:
                 bunsetu_may_end = True
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import NOUN, PROPN, PRON, VERB
 
 # XXX this can probably be pruned a bit
@@ -19,6 +16,7 @@ labels = [
     "ROOT",
 ]
 
+
 def noun_chunks(obj):
     """
     Detect base noun phrases from a dependency parse. Works on both Doc and Span.
@@ -52,4 +50,5 @@ def noun_chunks(obj):
             seen.update(w.i for w in word.head.rights)
         yield unseen[0], word.i + 1, np_label
 
+
 SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
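The SYNTAX_ITERATORS mapping kept at the end of this file is what wires noun_chunks up to Doc.noun_chunks for the language. A usage sketch (assumes a pipeline with a dependency parser, e.g. the en_core_web_sm model, which is not part of this diff):

    import spacy

    nlp = spacy.load("en_core_web_sm")  # assumed model with a parser
    doc = nlp("The quick brown fox jumped over the lazy dog.")
    for chunk in doc.noun_chunks:  # backed by the noun_chunks iterator
        print(chunk.text)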
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS
 
 from ...language import Language
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...attrs import LIKE_NUM
 
 
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 STOP_WORDS = set(
 """
 അത്
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 from ...lemmatizer import Lemmatizer
 from ...parts_of_speech import NAMES
 
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...attrs import LIKE_NUM
 
 
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 
 
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 
 
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 
 
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 
 
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 
 
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 
 
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import pytest
 from spacy.lang.hy.lex_attrs import like_num
 
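Because the test imports like_num directly, the lexical attribute can be exercised without building a pipeline. A sketch of the kind of parametrized assertion such tests make (the example strings are assumptions, not taken from the file):

    import pytest
    from spacy.lang.hy.lex_attrs import like_num

    @pytest.mark.parametrize("text,match", [("10", True), ("1,000", True), ("բառ", False)])
    def test_hy_like_num(text, match):
        assert like_num(text) == match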
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import pytest
 
 
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 
 
@@ -1,7 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import pytest
 from spacy.lang.ja import Japanese
 from ...util import make_tempdir
 
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 
 
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 
 
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 from spacy.lang.sv.lex_attrs import like_num
 
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 from spacy.lang.zh import Chinese
 from ...util import make_tempdir
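Several of the tests above pair a language class with make_tempdir to check that a pipeline survives a disk round-trip. A self-contained sketch of the same pattern using only the standard library (make_tempdir in the spaCy test suite plays the role of TemporaryDirectory here):

    import tempfile
    from spacy.lang.en import English

    nlp = English()
    with tempfile.TemporaryDirectory() as d:
        nlp.to_disk(d)                     # write config and pipeline data
        reloaded = English().from_disk(d)  # load it back
        assert reloaded("hello world").text == "hello world"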
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from spacy.lang.en import English
 
 
@@ -1,4 +1,3 @@
-# coding: utf8
 import warnings
 from unittest import TestCase
 import pytest
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 from spacy.lang.en import English
 from spacy.lang.en.syntax_iterators import noun_chunks
 from spacy.tests.util import get_doc