# Mirror of https://github.com/explosion/spaCy.git
# (synced 2024-12-27 18:36:36 +03:00; 177 lines, 5.1 KiB, Python)
# Maps a coarse-grained POS tag to the type of phrase a token with that tag
# can head. Tags that are absent from this table never start a phrase.
POS_PHRASE_MAP = {
    "NOUN": "NP",
    "NUM": "NP",
    "PRON": "NP",
    "PROPN": "NP",
    "VERB": "VP",
    "ADJ": "ADJP",
    "ADV": "ADVP",
    "CCONJ": "CCONJP",
}
|
|
|
|
|
|
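# Yields (bunsetu_tokens, phrase_type, phrase_tokens) tuples, where phrase_type
# is one of 'NP', 'VP', 'ADJP', 'ADVP', 'CCONJP', or None when the bunsetu has
# no phrase head (phrase_tokens is None in that case as well).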
|
|
def yield_bunsetu(doc, debug=False):
    """Segment ``doc`` into bunsetu and yield them one at a time.

    Tokens are scanned left to right. A bunsetu is accumulated until a token
    that must start a new one is met, at which point the finished bunsetu is
    yielded. The phrase head of a bunsetu is the run of tokens that begins at
    the first token whose POS is listed in ``POS_PHRASE_MAP`` and is extended
    by following tokens that satisfy the dependency conditions below.

    Args:
        doc: Iterable of tokens. Each token must expose ``i``, ``orth_``,
            ``pos_``, ``tag_``, ``dep_`` and ``head.i``.
        debug: When true, print the scanner state before each token is
            processed.

    Yields:
        ``(bunsetu_tokens, phrase_type, phrase_tokens)`` tuples.
        ``phrase_type`` is a value of ``POS_PHRASE_MAP`` (``"NP"``, ``"VP"``,
        ``"ADJP"``, ``"ADVP"``, ``"CCONJP"``), or ``None`` when no phrase head
        was found in the bunsetu; ``phrase_tokens`` is ``None`` in that case.
    """
    bunsetu = []  # tokens of the bunsetu being built
    # True once the phrase head is complete: the next token that can head a
    # phrase then opens a new bunsetu instead of extending this one.
    bunsetu_may_end = False
    phrase_type = None  # phrase type of the current bunsetu, if any
    phrase = None  # tokens forming the phrase head of the current bunsetu
    # Attributes of the previously visited token.
    prev = None
    prev_tag = None
    prev_dep = None
    prev_head = None
    for t in doc:
        pos = t.pos_
        pos_type = POS_PHRASE_MAP.get(pos)
        tag = t.tag_
        dep = t.dep_
        head = t.head.i
        if debug:
            print(
                t.i,
                t.orth_,
                pos,
                pos_type,
                dep,
                head,
                bunsetu_may_end,
                phrase_type,
                phrase,
                bunsetu,
            )

        # DET is always an individual bunsetu
        if pos == "DET":
            if bunsetu:
                yield bunsetu, phrase_type, phrase
            yield [t], None, None
            bunsetu = []
            bunsetu_may_end = False
            phrase_type = None
            phrase = None

        # An opening bracket ("補助記号-括弧開") always starts a new bunsetu.
        # NOTE(review): the previous comment here also mentioned PRON, but
        # only the opening-bracket tag is tested.
        elif tag == "補助記号-括弧開":
            if bunsetu:
                yield bunsetu, phrase_type, phrase
            bunsetu = [t]
            bunsetu_may_end = True
            phrase_type = None
            phrase = None

        # bunsetu head not appeared
        elif phrase_type is None:
            # A preceding comma ("補助記号-読点") closes a head-less bunsetu.
            if bunsetu and prev_tag == "補助記号-読点":
                yield bunsetu, phrase_type, phrase
                bunsetu = []
                bunsetu_may_end = False
                phrase_type = None
                phrase = None
            bunsetu.append(t)
            if pos_type:  # begin phrase
                phrase = [t]
                phrase_type = pos_type
                # ADVP and CCONJP heads are never extended by later tokens.
                if pos_type in {"ADVP", "CCONJP"}:
                    bunsetu_may_end = True

        # entering new bunsetu
        elif pos_type and (
            pos_type != phrase_type  # different phrase type arises
            or bunsetu_may_end  # same phrase type but bunsetu already ended
        ):
            # exceptional case: NOUN to VERB
            # (the previous token is a compound dependent of this verb)
            if (
                phrase_type == "NP"
                and pos_type == "VP"
                and prev_dep == "compound"
                and prev_head == t.i
            ):
                bunsetu.append(t)
                phrase_type = "VP"
                phrase.append(t)
            # exceptional case: VERB to NOUN
            elif (
                phrase_type == "VP"
                and pos_type == "NP"
                and (
                    (prev_dep == "compound" and prev_head == t.i)
                    or (dep == "compound" and prev == head)
                    or (prev_dep == "nmod" and prev_head == t.i)
                )
            ):
                bunsetu.append(t)
                phrase_type = "NP"
                phrase.append(t)
            else:
                yield bunsetu, phrase_type, phrase
                bunsetu = [t]
                bunsetu_may_end = False
                phrase_type = pos_type
                phrase = [t]

        # NOUN bunsetu
        elif phrase_type == "NP":
            bunsetu.append(t)
            if not bunsetu_may_end and (
                (
                    (pos_type == "NP" or pos == "SYM")
                    and (prev_head == t.i or prev_head == head)
                    and prev_dep in {"compound", "nummod"}
                )
                or (
                    pos == "PART"
                    and (prev == head or prev_head == head)
                    and dep == "mark"
                )
            ):
                phrase.append(t)
            else:
                bunsetu_may_end = True

        # VERB bunsetu
        elif phrase_type == "VP":
            bunsetu.append(t)
            if (
                not bunsetu_may_end
                and pos == "VERB"
                and prev_head == t.i
                and prev_dep == "compound"
            ):
                phrase.append(t)
            else:
                bunsetu_may_end = True

        # ADJ bunsetu; a token tagged "連体詞" falls through to the final
        # branch and is appended without touching the phrase state.
        elif phrase_type == "ADJP" and tag != "連体詞":
            bunsetu.append(t)
            if not bunsetu_may_end and (
                (
                    pos == "NOUN"
                    and (prev_head == t.i or prev_head == head)
                    and prev_dep in {"amod", "compound"}
                )
                or (
                    pos == "PART"
                    and (prev == head or prev_head == head)
                    and dep == "mark"
                )
            ):
                phrase.append(t)
            else:
                bunsetu_may_end = True

        # other bunsetu
        else:
            bunsetu.append(t)

        prev = t.i
        prev_tag = tag
        prev_dep = dep
        prev_head = head

    # Flush whatever is left after the last token.
    if bunsetu:
        yield bunsetu, phrase_type, phrase
|