2017-05-13 13:32:06 +03:00
|
|
|
# coding: utf8
|
|
|
|
from __future__ import unicode_literals
|
2016-12-30 20:19:18 +03:00
|
|
|
|
2017-05-13 13:32:37 +03:00
|
|
|
from .doc import Doc
|
|
|
|
from ..symbols import HEAD, TAG, DEP, ENT_IOB, ENT_TYPE
|
|
|
|
|
|
|
|
|
2016-12-30 20:19:18 +03:00
|
|
|
def merge_ents(doc):
    """
    Helper: collapse every entity span of the doc into a single token.

    Each span in `doc.ents` is merged using the fine-grained tag of its root
    token, its own text and its entity label. The doc is modified in place.

    Args:
        doc: The doc whose entities should be merged.

    Returns:
        The same doc object that was passed in.
    """
    for span in doc.ents:
        root_tag = span.root.tag_
        span.merge(root_tag, span.text, span.label_)
    return doc
|
|
|
|
|
2017-05-13 13:32:06 +03:00
|
|
|
|
2016-12-30 20:19:18 +03:00
|
|
|
def format_POS(token, light, flat):
    """
    Helper: build the POS output dict for a single token.

    Args:
        token: The token to describe. Its `text`, `lemma_`, `ent_type_`,
            `tag_`, `pos_` and `dep_` attributes are read.
        light: If true, the "lemma" and "NE" entries are left out.
        flat: If true, the "arc" and "modifiers" entries are left out.

    Returns:
        A dict with the keys "word", "lemma", "NE", "POS_fine", "POS_coarse",
        "arc" and "modifiers" (a fresh empty list), minus whatever `light`
        and `flat` remove.
    """
    # Every attribute is read up front, even the ones that may be dropped
    # again below, exactly as the original implementation did.
    node = {
        "word": token.text,
        "lemma": token.lemma_,
        "NE": token.ent_type_,
        "POS_fine": token.tag_,
        "POS_coarse": token.pos_,
        "arc": token.dep_,
        "modifiers": [],
    }
    unwanted = []
    if light:
        unwanted.extend(("lemma", "NE"))
    if flat:
        unwanted.extend(("arc", "modifiers"))
    for key in unwanted:
        del node[key]
    return node
|
|
|
|
|
2017-05-13 13:32:06 +03:00
|
|
|
|
2017-05-13 13:32:23 +03:00
|
|
|
def POS_tree(root, light=False, flat=False):
    """
    Helper: generate a POS tree for a root token. The doc must have
    merge_ents(doc) ran on it.

    Args:
        root: The token to start from; its `children` are visited
            recursively.
        light: Passed to format_POS for the root and for every descendant,
            so the whole tree uses the same set of keys.
        flat: Passed to format_POS. When the formatted node has no
            "modifiers" entry (which is what format_POS produces for a true
            `flat`), only the root's own dict is returned and the children
            are not visited.

    Returns:
        The dict produced by format_POS for `root`, with the trees of its
        children appended to its "modifiers" list when that list exists.
    """
    subtree = format_POS(root, light=light, flat=flat)
    # Without a "modifiers" list there is nowhere to attach children;
    # indexing it unconditionally used to raise KeyError in flat mode.
    if "modifiers" not in subtree:
        return subtree
    modifiers = subtree["modifiers"]
    for child in root.children:
        # Forward the formatting options so descendants match the root.
        modifiers.append(POS_tree(child, light=light, flat=flat))
    return subtree
|
|
|
|
|
2017-05-13 13:32:06 +03:00
|
|
|
|
2016-12-30 20:19:18 +03:00
|
|
|
def parse_tree(doc, light=False, flat=False):
    """
    Build a syntactic parse tree for every sentence of a doc, similar to the
    one used in displaCy.

    The work is done on a copy of the doc: a new Doc is created from the
    token texts and the HEAD, TAG, DEP, ENT_IOB and ENT_TYPE columns are
    copied over. Entities are merged into single tokens on that copy before
    the trees are generated.

    Args:
        doc: The doc for parsing.
        light: Forwarded to POS_tree for each sentence root.
        flat: Forwarded to POS_tree for each sentence root.

    Returns:
        [parse_trees (Dict)]: one POS tree per sentence, in sentence order.

    Example (output abbreviated):
        >>> from spacy.en import English
        >>> nlp = English()
        >>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
        >>> trees = doc.print_tree()
        [{'modifiers': [...], 'NE': '', 'word': 'brought', 'arc': 'ROOT',
          'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'bring'},
         {'modifiers': [...], 'NE': '', 'word': 'ate', 'arc': 'ROOT',
          'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}]
    """
    columns = (HEAD, TAG, DEP, ENT_IOB, ENT_TYPE)
    vocab = doc.vocab
    texts = [token.text for token in doc]
    doc_clone = Doc(vocab, words=texts)
    # Each call gets its own list of attribute ids, as in the original.
    doc_clone.from_array(list(columns), doc.to_array(list(columns)))
    # Entities have to be single tokens before the trees are built.
    merge_ents(doc_clone)
    trees = []
    for sent in doc_clone.sents:
        trees.append(POS_tree(sent.root, light=light, flat=flat))
    return trees
|