mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-22 07:14:08 +03:00
145 lines
4.4 KiB
Python
145 lines
4.4 KiB
Python
|
# coding: utf8
|
||
|
from __future__ import unicode_literals
|
||
|
|
||
|
from .stop_words import STOP_WORDS
|
||
|
|
||
|
|
||
|
# Maps a coarse part-of-speech label (the token's ``pos_``) to the type of
# phrase that a token with that label can head. Labels that are absent from
# this table never start a phrase.
POS_PHRASE_MAP = {
    # nominal heads
    "NOUN": "NP",
    "NUM": "NP",
    "PRON": "NP",
    "PROPN": "NP",
    # verbal heads
    "VERB": "VP",
    # adjectival heads
    "ADJ": "ADJP",
    # adverbial heads
    "ADV": "ADVP",
    # coordinating-conjunction heads
    "CCONJ": "CCONJP",
}
|
||
|
|
||
|
|
||
|
def yield_bunsetu(doc, debug=False):
    """Lazily split ``doc`` into bunsetu in a single left-to-right pass.

    Each token is read through its ``pos_``, ``tag_``, ``dep_``, ``head.i``
    and ``i`` attributes (plus ``orth_`` when ``debug`` is set).

    Yields tuples ``(bunsetu_tokens, phrase_type, phrase_tokens)`` where:

    * ``bunsetu_tokens`` is the list of tokens that make up the bunsetu;
    * ``phrase_type`` is one of the ``POS_PHRASE_MAP`` values ('NP', 'VP',
      'ADJP', 'ADVP', 'CCONJP'), or ``None`` when no phrase head was seen;
    * ``phrase_tokens`` is the list of tokens forming the head phrase, or
      ``None`` when ``phrase_type`` is ``None``.

    When ``debug`` is true, the per-token state is printed before the token
    is processed.
    """
    chunk = []          # tokens of the bunsetu under construction
    closable = False    # True once the head phrase may no longer be extended
    kind = None         # phrase type of the current bunsetu, if any
    core = None         # tokens of the current head phrase, if any

    # Attributes of the previously visited token.
    last_i = None
    last_tag = None
    last_dep = None
    last_head = None

    for t in doc:
        pos = t.pos_
        pos_type = POS_PHRASE_MAP.get(pos)
        tag = t.tag_
        dep = t.dep_
        head = t.head.i
        if debug:
            print(t.i, t.orth_, pos, pos_type, dep, head, closable, kind, core, chunk)

        if pos == "DET":
            # A DET token always forms a bunsetu of its own.
            if chunk:
                yield chunk, kind, core
            yield [t], None, None
            chunk, closable, kind, core = [], False, None, None

        elif tag == "補助記号-括弧開":
            # An opening-bracket tag always starts a new bunsetu.
            if chunk:
                yield chunk, kind, core
            chunk, closable, kind, core = [t], True, None, None

        elif kind is None:
            # No phrase head has appeared yet in the current bunsetu.
            if chunk and last_tag == "補助記号-読点":
                # The previous token carried the comma tag: close what we have.
                yield chunk, kind, core
                chunk, closable, kind, core = [], False, None, None
            chunk.append(t)
            if pos_type:
                # This token begins the head phrase.
                core = [t]
                kind = pos_type
                if pos_type in {"ADVP", "CCONJP"}:
                    closable = True

        elif pos_type and (pos_type != kind or closable):
            # A head of a different phrase type shows up, or one of the same
            # type after the current phrase was already closed.
            prev_compounds_here = last_dep == "compound" and last_head == t.i
            if kind == "NP" and pos_type == "VP" and prev_compounds_here:
                # Exception: noun compounding into a verb stays in one bunsetu.
                chunk.append(t)
                kind = "VP"
                core.append(t)
            elif kind == "VP" and pos_type == "NP" and (
                prev_compounds_here
                or (dep == "compound" and last_i == head)
                or (last_dep == "nmod" and last_head == t.i)
            ):
                # Exception: verb linked to a following noun stays in one bunsetu.
                chunk.append(t)
                kind = "NP"
                core.append(t)
            else:
                yield chunk, kind, core
                chunk, closable, kind, core = [t], False, pos_type, [t]

        elif kind == "NP":
            chunk.append(t)
            nominal_link = (
                (pos_type == "NP" or pos == "SYM")
                and (last_head == t.i or last_head == head)
                and last_dep in {"compound", "nummod"}
            )
            particle_mark = (
                pos == "PART"
                and (last_i == head or last_head == head)
                and dep == "mark"
            )
            if not closable and (nominal_link or particle_mark):
                core.append(t)
            else:
                closable = True

        elif kind == "VP":
            chunk.append(t)
            if not closable and pos == "VERB" and last_head == t.i and last_dep == "compound":
                core.append(t)
            else:
                closable = True

        elif kind == "ADJP" and tag != "連体詞":
            chunk.append(t)
            noun_link = (
                pos == "NOUN"
                and (last_head == t.i or last_head == head)
                and last_dep in {"amod", "compound"}
            )
            particle_mark = (
                pos == "PART"
                and (last_i == head or last_head == head)
                and dep == "mark"
            )
            if not closable and (noun_link or particle_mark):
                core.append(t)
            else:
                closable = True

        else:
            # Any other situation: the token simply joins the current bunsetu.
            chunk.append(t)

        last_i = t.i
        last_tag = tag
        last_dep = dep
        last_head = head

    # Flush whatever is left after the last token.
    if chunk:
        yield chunk, kind, core
|