POS_PHRASE_MAP = { "NOUN": "NP", "NUM": "NP", "PRON": "NP", "PROPN": "NP", "VERB": "VP", "ADJ": "ADJP", "ADV": "ADVP", "CCONJ": "CCONJP", } # return value: [(bunsetu_tokens, phrase_type={'NP', 'VP', 'ADJP', 'ADVP'}, phrase_tokens)] def yield_bunsetu(doc, debug=False): bunsetu = [] bunsetu_may_end = False phrase_type = None phrase = None prev = None prev_tag = None prev_dep = None prev_head = None for t in doc: pos = t.pos_ pos_type = POS_PHRASE_MAP.get(pos, None) tag = t.tag_ dep = t.dep_ head = t.head.i if debug: print( t.i, t.orth_, pos, pos_type, dep, head, bunsetu_may_end, phrase_type, phrase, bunsetu, ) # DET is always an individual bunsetu if pos == "DET": if bunsetu: yield bunsetu, phrase_type, phrase yield [t], None, None bunsetu = [] bunsetu_may_end = False phrase_type = None phrase = None # PRON or Open PUNCT always splits bunsetu elif tag == "補助記号-括弧開": if bunsetu: yield bunsetu, phrase_type, phrase bunsetu = [t] bunsetu_may_end = True phrase_type = None phrase = None # bunsetu head not appeared elif phrase_type is None: if bunsetu and prev_tag == "補助記号-読点": yield bunsetu, phrase_type, phrase bunsetu = [] bunsetu_may_end = False phrase_type = None phrase = None bunsetu.append(t) if pos_type: # begin phrase phrase = [t] phrase_type = pos_type if pos_type in {"ADVP", "CCONJP"}: bunsetu_may_end = True # entering new bunsetu elif pos_type and ( pos_type != phrase_type or bunsetu_may_end # different phrase type arises # same phrase type but bunsetu already ended ): # exceptional case: NOUN to VERB if ( phrase_type == "NP" and pos_type == "VP" and prev_dep == "compound" and prev_head == t.i ): bunsetu.append(t) phrase_type = "VP" phrase.append(t) # exceptional case: VERB to NOUN elif ( phrase_type == "VP" and pos_type == "NP" and ( prev_dep == "compound" and prev_head == t.i or dep == "compound" and prev == head or prev_dep == "nmod" and prev_head == t.i ) ): bunsetu.append(t) phrase_type = "NP" phrase.append(t) else: yield bunsetu, phrase_type, phrase bunsetu = [t] bunsetu_may_end = False phrase_type = pos_type phrase = [t] # NOUN bunsetu elif phrase_type == "NP": bunsetu.append(t) if not bunsetu_may_end and ( ( (pos_type == "NP" or pos == "SYM") and (prev_head == t.i or prev_head == head) and prev_dep in {"compound", "nummod"} ) or ( pos == "PART" and (prev == head or prev_head == head) and dep == "mark" ) ): phrase.append(t) else: bunsetu_may_end = True # VERB bunsetu elif phrase_type == "VP": bunsetu.append(t) if ( not bunsetu_may_end and pos == "VERB" and prev_head == t.i and prev_dep == "compound" ): phrase.append(t) else: bunsetu_may_end = True # ADJ bunsetu elif phrase_type == "ADJP" and tag != "連体詞": bunsetu.append(t) if not bunsetu_may_end and ( ( pos == "NOUN" and (prev_head == t.i or prev_head == head) and prev_dep in {"amod", "compound"} ) or ( pos == "PART" and (prev == head or prev_head == head) and dep == "mark" ) ): phrase.append(t) else: bunsetu_may_end = True # other bunsetu else: bunsetu.append(t) prev = t.i prev_tag = t.tag_ prev_dep = t.dep_ prev_head = head if bunsetu: yield bunsetu, phrase_type, phrase